From 4d094b0c20c54ff14068e9ada9c34a781493ac66 Mon Sep 17 00:00:00 2001
From: co63oc <co63oc@users.noreply.github.com>
Date: Mon, 7 Aug 2023 15:12:44 +0800
Subject: [PATCH] Fix typos (#56008)

---
 test/collective/fleet/hybrid_parallel_mp_amp.py    |  4 ++--
 test/collective/fleet/hybrid_parallel_mp_bf16.py   |  4 ++--
 .../fleet/hybrid_parallel_mp_broadcast_obj.py      |  4 ++--
 .../fleet/hybrid_parallel_mp_clip_grad.py          |  4 ++--
 test/collective/fleet/hybrid_parallel_mp_fp16.py   |  4 ++--
 test/collective/fleet/hybrid_parallel_mp_layers.py |  2 +-
 test/collective/fleet/hybrid_parallel_mp_model.py  |  6 +++---
 ...rid_parallel_mp_model_with_sequence_parallel.py |  6 +++---
 test/collective/fleet/hybrid_parallel_mp_random.py |  2 +-
 test/collective/fleet/hybrid_parallel_pp_amp.py    |  2 +-
 test/collective/fleet/hybrid_parallel_pp_bf16.py   |  2 +-
 .../fleet/hybrid_parallel_pp_clip_grad.py          |  6 +++---
 .../fleet/hybrid_parallel_pp_embedding.py          |  2 +-
 test/collective/fleet/hybrid_parallel_pp_fp16.py   |  2 +-
 .../fleet/hybrid_parallel_pp_recompute.py          |  2 +-
 .../fleet/hybrid_parallel_pp_save_load.py          |  2 +-
 ...rid_parallel_pp_save_load_with_virtual_stage.py |  2 +-
 .../fleet/hybrid_parallel_pp_transformer.py        |  2 +-
 .../fleet/hybrid_parallel_pp_transformer_save.py   |  2 +-
 ...allel_pp_transformer_save_with_virtual_stage.py |  2 +-
 ...brid_parallel_pp_transformer_unbalanced_data.py |  4 ++--
 ...d_parallel_pp_transformer_with_virtual_stage.py |  2 +-
 test/collective/fleet/hybrid_parallel_qat.py       |  2 +-
 .../fleet/hybrid_parallel_sharding_model.py        |  2 +-
 .../fleet/hybrid_parallel_shared_weight.py         |  2 +-
 .../parallel_dygraph_no_sync_gradient_check.py     |  2 +-
 .../fleet/test_fleet_static_mp_layers.py           |  2 +-
 test/legacy_test/auto_parallel_gpt_model.py        | 10 +++++-----
 test/legacy_test/benchmark_sum_op.py               |  4 ++--
 test/legacy_test/dist_fleet_ctr.py                 |  4 ++--
 test/legacy_test/dist_hapi_mnist_dynamic.py        |  2 +-
 test/legacy_test/dist_hapi_mnist_static.py         |  2 +-
 test/legacy_test/dist_hapi_pure_fp16_static.py     |  4 ++--
 test/legacy_test/gradient_checker.py               | 14 +++++++-------
 test/legacy_test/hybrid_parallel_pp_alexnet.py     |  6 +++---
 .../parallel_dygraph_dataparallel_with_pylayer.py  |  2 +-
 .../legacy_test/parallel_dygraph_gradient_check.py |  2 +-
 ...arallel_dygraph_gradient_check_in_eager_mode.py |  2 +-
 .../parallel_dygraph_dataparallel_with_pylayer.py  |  2 +-
 test/xpu/parallel_dygraph_gradient_check.py        |  2 +-
 ...arallel_dygraph_gradient_check_in_eager_mode.py |  2 +-
 41 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/test/collective/fleet/hybrid_parallel_mp_amp.py b/test/collective/fleet/hybrid_parallel_mp_amp.py
index 2ae5fcf7029..7b139c09664 100644
--- a/test/collective/fleet/hybrid_parallel_mp_amp.py
+++ b/test/collective/fleet/hybrid_parallel_mp_amp.py
@@ -14,13 +14,13 @@
 
 import unittest
 
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 
 import paddle
 from paddle.distributed import fleet
 
 
-class TestMPClipGrad(TestDistMPTraning):
+class TestMPClipGrad(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
diff --git a/test/collective/fleet/hybrid_parallel_mp_bf16.py b/test/collective/fleet/hybrid_parallel_mp_bf16.py
index ae977f98917..2ddf1868dd0 100644
--- a/test/collective/fleet/hybrid_parallel_mp_bf16.py
+++ b/test/collective/fleet/hybrid_parallel_mp_bf16.py
@@ -14,14 +14,14 @@
 
 import unittest
 
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 
 import paddle
 from paddle.distributed import fleet
 from paddle.distributed.utils.nccl_utils import check_nccl_version_for_bf16
 
 
-class TestMPFP16(TestDistMPTraning):
+class TestMPFP16(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
diff --git a/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py b/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py
index b376afa1a99..9872bbf20e9 100644
--- a/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py
+++ b/test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py
@@ -19,7 +19,7 @@ import numpy as np
 from hybrid_parallel_mp_model import (
     SimpleDPNet,
     SimpleMPNet,
-    TestDistMPTraning,
+    TestDistMPTraining,
     parallel_matmul,
     set_random_seed,
 )
@@ -58,7 +58,7 @@ class SimpleDPMultimodalNet(SimpleDPNet):
         return x
 
 
-class TestMPBroadcastObj(TestDistMPTraning):
+class TestMPBroadcastObj(TestDistMPTraining):
     def build_model_optimizer(self):
         hcg = fleet.get_hybrid_communicate_group()
         word_size = hcg.get_model_parallel_world_size()
diff --git a/test/collective/fleet/hybrid_parallel_mp_clip_grad.py b/test/collective/fleet/hybrid_parallel_mp_clip_grad.py
index 1f674675ecb..dab3998f981 100644
--- a/test/collective/fleet/hybrid_parallel_mp_clip_grad.py
+++ b/test/collective/fleet/hybrid_parallel_mp_clip_grad.py
@@ -14,7 +14,7 @@
 
 import unittest
 
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 
 import paddle
 
@@ -22,7 +22,7 @@ import paddle
 # log.setLevel(logging.WARNING)
 
 
-class TestMPClipGrad(TestDistMPTraning):
+class TestMPClipGrad(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
diff --git a/test/collective/fleet/hybrid_parallel_mp_fp16.py b/test/collective/fleet/hybrid_parallel_mp_fp16.py
index 5d10ac0a7ae..ca5fdc3a406 100644
--- a/test/collective/fleet/hybrid_parallel_mp_fp16.py
+++ b/test/collective/fleet/hybrid_parallel_mp_fp16.py
@@ -14,13 +14,13 @@
 
 import unittest
 
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 
 import paddle
 from paddle.distributed import fleet
 
 
-class TestMPFP16(TestDistMPTraning):
+class TestMPFP16(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
diff --git a/test/collective/fleet/hybrid_parallel_mp_layers.py b/test/collective/fleet/hybrid_parallel_mp_layers.py
index 751bc9255c1..4a7b223c687 100644
--- a/test/collective/fleet/hybrid_parallel_mp_layers.py
+++ b/test/collective/fleet/hybrid_parallel_mp_layers.py
@@ -115,7 +115,7 @@ class SimpleEmbedding(paddle.nn.Layer):
         return output
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_mp_model.py b/test/collective/fleet/hybrid_parallel_mp_model.py
index 44ceb368b9b..08ae8f51e47 100644
--- a/test/collective/fleet/hybrid_parallel_mp_model.py
+++ b/test/collective/fleet/hybrid_parallel_mp_model.py
@@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
 
 
-class TestDistMPSyncTraning(unittest.TestCase):
+class TestDistMPSyncTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -348,7 +348,7 @@ class TestDistMPSyncTraning(unittest.TestCase):
         )
 
 
-class TestDistMPSyncModelTraning(TestDistMPSyncTraning):
+class TestDistMPSyncModelTraining(TestDistMPSyncTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -368,7 +368,7 @@ class TestDistMPSyncModelTraning(TestDistMPSyncTraning):
         fleet.init(is_collective=True, strategy=strategy)
 
 
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py b/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py
index fa78482601f..a4f11294f38 100644
--- a/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py
+++ b/test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py
@@ -200,7 +200,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
 
 
-class TestDistSPSyncTraning(unittest.TestCase):
+class TestDistSPSyncTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -370,7 +370,7 @@ class TestDistSPSyncTraning(unittest.TestCase):
         )
 
 
-class TestDistSPSyncModelTraning(TestDistSPSyncTraning):
+class TestDistSPSyncModelTraining(TestDistSPSyncTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -390,7 +390,7 @@ class TestDistSPSyncModelTraning(TestDistSPSyncTraning):
         fleet.init(is_collective=True, strategy=strategy)
 
 
-class TestDistSPTraning(unittest.TestCase):
+class TestDistSPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_mp_random.py b/test/collective/fleet/hybrid_parallel_mp_random.py
index 717e983e262..00877818b1e 100644
--- a/test/collective/fleet/hybrid_parallel_mp_random.py
+++ b/test/collective/fleet/hybrid_parallel_mp_random.py
@@ -20,7 +20,7 @@ import paddle
 from paddle.distributed import fleet
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_pp_amp.py b/test/collective/fleet/hybrid_parallel_pp_amp.py
index 7c9f973ee13..f3fe88a9161 100644
--- a/test/collective/fleet/hybrid_parallel_pp_amp.py
+++ b/test/collective/fleet/hybrid_parallel_pp_amp.py
@@ -34,7 +34,7 @@ batch_size = 4
 micro_batch_size = 2
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_bf16.py b/test/collective/fleet/hybrid_parallel_pp_bf16.py
index 70b3aec1515..f260cd88f2f 100644
--- a/test/collective/fleet/hybrid_parallel_pp_bf16.py
+++ b/test/collective/fleet/hybrid_parallel_pp_bf16.py
@@ -35,7 +35,7 @@ batch_size = 4
 micro_batch_size = 2
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_clip_grad.py b/test/collective/fleet/hybrid_parallel_pp_clip_grad.py
index 695e3ee90ef..76b0fa90c05 100644
--- a/test/collective/fleet/hybrid_parallel_pp_clip_grad.py
+++ b/test/collective/fleet/hybrid_parallel_pp_clip_grad.py
@@ -17,12 +17,12 @@ import unittest
 
 sys.path.append("../../legacy_test")
 
-from hybrid_parallel_pp_alexnet import TestDistPPTraning
+from hybrid_parallel_pp_alexnet import TestDistPPTraining
 
 import paddle
 
 
-class TestPPClipGrad(TestDistPPTraning):
+class TestPPClipGrad(TestDistPPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
         scheduler = paddle.optimizer.lr.PiecewiseDecay(
@@ -36,7 +36,7 @@ class TestPPClipGrad(TestDistPPTraning):
         return scheduler, optimizer
 
 
-class TestPPClipGradParamGroup(TestDistPPTraning):
+class TestPPClipGradParamGroup(TestDistPPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
         scheduler = paddle.optimizer.lr.PiecewiseDecay(
diff --git a/test/collective/fleet/hybrid_parallel_pp_embedding.py b/test/collective/fleet/hybrid_parallel_pp_embedding.py
index 2dd335eb0ef..d485e77a799 100644
--- a/test/collective/fleet/hybrid_parallel_pp_embedding.py
+++ b/test/collective/fleet/hybrid_parallel_pp_embedding.py
@@ -120,7 +120,7 @@ class SimpleNetPipe(Layer):
         return feat
 
 
-class TestDistEmbeddingTraning(unittest.TestCase):
+class TestDistEmbeddingTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_fp16.py b/test/collective/fleet/hybrid_parallel_pp_fp16.py
index 84430f4be37..c6c107a852a 100644
--- a/test/collective/fleet/hybrid_parallel_pp_fp16.py
+++ b/test/collective/fleet/hybrid_parallel_pp_fp16.py
@@ -38,7 +38,7 @@ batch_size = 4
 micro_batch_size = 2
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_recompute.py b/test/collective/fleet/hybrid_parallel_pp_recompute.py
index 793f2effd09..fd03b562f25 100644
--- a/test/collective/fleet/hybrid_parallel_pp_recompute.py
+++ b/test/collective/fleet/hybrid_parallel_pp_recompute.py
@@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer):
         )
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_save_load.py b/test/collective/fleet/hybrid_parallel_pp_save_load.py
index a2ab6a0654d..cfa7c6961ae 100644
--- a/test/collective/fleet/hybrid_parallel_pp_save_load.py
+++ b/test/collective/fleet/hybrid_parallel_pp_save_load.py
@@ -30,7 +30,7 @@ micro_batch_size = 2
 vocab_size = 128
 
 
-class TestDistPPSaveLoadTraning(unittest.TestCase):
+class TestDistPPSaveLoadTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py
index 5d6152f4e9f..8e922792cdb 100644
--- a/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py
+++ b/test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py
@@ -33,7 +33,7 @@ micro_batch_size = 2
 vocab_size = 128
 
 
-class TestDistPPSaveLoadTraning(unittest.TestCase):
+class TestDistPPSaveLoadTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer.py b/test/collective/fleet/hybrid_parallel_pp_transformer.py
index 216f37796da..18986e3df34 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer.py
@@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer):
         )
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_save.py b/test/collective/fleet/hybrid_parallel_pp_transformer_save.py
index a8cf970f73d..1bd865e107e 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer_save.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer_save.py
@@ -29,7 +29,7 @@ vocab_size = 128
 transformer_layer_num = 8
 
 
-class TestDistPPSaveTraning(unittest.TestCase):
+class TestDistPPSaveTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py
index 372cbe7f48d..fea23b62661 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py
@@ -33,7 +33,7 @@ vocab_size = 128
 transformer_layer_num = 8
 
 
-class TestDistPPSaveTraning(unittest.TestCase):
+class TestDistPPSaveTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py b/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py
index 60b1e1052bd..bc43f514ea9 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py
@@ -17,7 +17,7 @@ import unittest
 import numpy as np
 from hybrid_parallel_pp_transformer import (
     ModelPipe,
-    TestDistPPTraning,
+    TestDistPPTraining,
     batch_size,
     length,
     micro_batch_size,
@@ -30,7 +30,7 @@ import paddle.distributed as dist
 from paddle.distributed import fleet
 
 
-class TestDistPPTraningUnbalancedData(TestDistPPTraning):
+class TestDistPPTrainingUnbalancedData(TestDistPPTraining):
     def test_pp_model(self):
         hcg = fleet.get_hybrid_communicate_group()
         word_size = hcg.get_model_parallel_world_size()
diff --git a/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py b/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py
index 3d7991727ee..35a17f17ace 100644
--- a/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py
+++ b/test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py
@@ -137,7 +137,7 @@ class ModelPipe(PipelineLayer):
         )
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/hybrid_parallel_qat.py b/test/collective/fleet/hybrid_parallel_qat.py
index 11662d849eb..00bc0f746e7 100644
--- a/test/collective/fleet/hybrid_parallel_qat.py
+++ b/test/collective/fleet/hybrid_parallel_qat.py
@@ -235,7 +235,7 @@ class SimpleDPNet(nn.Layer):
         return x
 
 
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
diff --git a/test/collective/fleet/hybrid_parallel_sharding_model.py b/test/collective/fleet/hybrid_parallel_sharding_model.py
index bb1ace6a6a4..41343d2dbda 100644
--- a/test/collective/fleet/hybrid_parallel_sharding_model.py
+++ b/test/collective/fleet/hybrid_parallel_sharding_model.py
@@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
 
 
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         random.seed(2021)
         np.random.seed(2021)
diff --git a/test/collective/fleet/hybrid_parallel_shared_weight.py b/test/collective/fleet/hybrid_parallel_shared_weight.py
index c110d5d00a4..f54b9947687 100644
--- a/test/collective/fleet/hybrid_parallel_shared_weight.py
+++ b/test/collective/fleet/hybrid_parallel_shared_weight.py
@@ -152,7 +152,7 @@ class SimpleNetPipe(PipelineLayer):
         super().__init__(layers=self.descs, loss_fn=LossNet(), **kwargs)
 
 
-class TestDistEmbeddingTraning(unittest.TestCase):
+class TestDistEmbeddingTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
diff --git a/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py b/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py
index a3f4d5f0b16..85f24f61d79 100644
--- a/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py
+++ b/test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
diff --git a/test/collective/fleet/test_fleet_static_mp_layers.py b/test/collective/fleet/test_fleet_static_mp_layers.py
index 6f20943a128..3cd07943d11 100644
--- a/test/collective/fleet/test_fleet_static_mp_layers.py
+++ b/test/collective/fleet/test_fleet_static_mp_layers.py
@@ -66,7 +66,7 @@ class EmbeddingNet(paddle.nn.Layer):
         return output
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "2"
         os.environ[
diff --git a/test/legacy_test/auto_parallel_gpt_model.py b/test/legacy_test/auto_parallel_gpt_model.py
index 1be27f9bc80..4d5e1955e23 100644
--- a/test/legacy_test/auto_parallel_gpt_model.py
+++ b/test/legacy_test/auto_parallel_gpt_model.py
@@ -35,7 +35,7 @@ def init_global():
 
 class MultiHeadAttention(nn.Layer):
     """
-    Attention mapps queries and a set of key-value pairs to outputs, and
+    Attention maps queries and a set of key-value pairs to outputs, and
     Multi-Head Attention performs multiple parallel attention to jointly attending
     to information from different representation subspaces.
     """
@@ -114,7 +114,7 @@ class MultiHeadAttention(nn.Layer):
 
     def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
         """
-        Prapares linear projected queries, keys and values for usage of subsequnt
+        Prepares linear projected queries, keys and values for usage of subsequent
         multiple parallel attention. If `cache` is not None, using cached results
         to reduce redundant calculations.
         """
@@ -203,7 +203,7 @@ class MultiHeadAttention(nn.Layer):
 
     def gen_cache(self, key, value=None, type=Cache):
         """
-        Generates cache for `forward` usage in inference accroding to arguments.
+        Generates cache for `forward` usage in inference according to arguments.
         The generated cache is an instance of `MultiHeadAttention.Cache` or an
         instance of `MultiHeadAttention.StaticCache`.
         """
@@ -573,7 +573,7 @@ class GPTEmbeddings(nn.Layer):
             ones = paddle.ones_like(input_ids, dtype="int64")
             seq_length = paddle.cumsum(ones, axis=-1)
             position_ids = seq_length - ones
-        input_embedings = self.word_embeddings(input_ids)
+        input_embeddings = self.word_embeddings(input_ids)
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
                 self.word_embeddings.weight, _global_process_mesh, ["x", None]
@@ -592,7 +592,7 @@ class GPTEmbeddings(nn.Layer):
             )
 
         position_embeddings = self.position_embeddings(position_ids)
-        embeddings = input_embedings + position_embeddings
+        embeddings = input_embeddings + position_embeddings
         embeddings = self.dropout(embeddings)
         return embeddings
 
diff --git a/test/legacy_test/benchmark_sum_op.py b/test/legacy_test/benchmark_sum_op.py
index 6854fd7f208..7bc15957e1c 100644
--- a/test/legacy_test/benchmark_sum_op.py
+++ b/test/legacy_test/benchmark_sum_op.py
@@ -59,7 +59,7 @@ class TestSumOp(BenchmarkSuite):
 
     def test_timeit_output(self):
         """
-        perf the op, time cost will be averged in iters.
+        perf the op, time cost will be averaged in iters.
         output example
         >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
         >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
@@ -68,7 +68,7 @@ class TestSumOp(BenchmarkSuite):
 
     def test_timeit_grad(self):
         """
-        perf the op gradient, time cost will be averged in iters.
+        perf the op gradient, time cost will be averaged in iters.
         output example
         >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
         >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
diff --git a/test/legacy_test/dist_fleet_ctr.py b/test/legacy_test/dist_fleet_ctr.py
index a5634a0cfba..419344edfae 100644
--- a/test/legacy_test/dist_fleet_ctr.py
+++ b/test/legacy_test/dist_fleet_ctr.py
@@ -129,7 +129,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
             dnn_out = fc
 
         # build lr model
-        lr_embbding = paddle.static.nn.embedding(
+        lr_embedding = paddle.static.nn.embedding(
             is_distributed=False,
             input=lr_data,
             size=[lr_input_dim, 1],
@@ -141,7 +141,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
             padding_idx=0,
         )
         lr_pool = paddle.static.nn.sequence_lod.sequence_pool(
-            input=lr_embbding.squeeze(-2), pool_type="sum"
+            input=lr_embedding.squeeze(-2), pool_type="sum"
         )
 
         merge_layer = paddle.concat([dnn_out, lr_pool], axis=1)
diff --git a/test/legacy_test/dist_hapi_mnist_dynamic.py b/test/legacy_test/dist_hapi_mnist_dynamic.py
index 7fa896cf3dd..66b5f66119b 100644
--- a/test/legacy_test/dist_hapi_mnist_dynamic.py
+++ b/test/legacy_test/dist_hapi_mnist_dynamic.py
@@ -52,7 +52,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_dynamic_multiple_gpus(self):
         device = set_device('gpu')
 
diff --git a/test/legacy_test/dist_hapi_mnist_static.py b/test/legacy_test/dist_hapi_mnist_static.py
index 9d9b488f907..c465ef7fe85 100644
--- a/test/legacy_test/dist_hapi_mnist_static.py
+++ b/test/legacy_test/dist_hapi_mnist_static.py
@@ -52,7 +52,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_static_multiple_gpus(self):
         paddle.enable_static()
         device = set_device('gpu')
diff --git a/test/legacy_test/dist_hapi_pure_fp16_static.py b/test/legacy_test/dist_hapi_pure_fp16_static.py
index f63866991fe..3506b22d31f 100644
--- a/test/legacy_test/dist_hapi_pure_fp16_static.py
+++ b/test/legacy_test/dist_hapi_pure_fp16_static.py
@@ -26,10 +26,10 @@ from paddle.vision.models import LeNet
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraningWithPureFP16(unittest.TestCase):
+class TestDistTrainingWithPureFP16(unittest.TestCase):
     def test_amp_training_purefp16(self):
         if not fluid.is_compiled_with_cuda():
-            self.skipTest('module not tested when ONLY_CPU compling')
+            self.skipTest('module not tested when ONLY_CPU compiling')
         data = np.random.random(size=(4, 1, 28, 28)).astype(np.float32)
         label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64)
 
diff --git a/test/legacy_test/gradient_checker.py b/test/legacy_test/gradient_checker.py
index 23e69c6b8ae..c24691eda15 100644
--- a/test/legacy_test/gradient_checker.py
+++ b/test/legacy_test/gradient_checker.py
@@ -269,7 +269,7 @@ def grad_check(
     if program is None:
         program = fluid.default_main_program()
 
-    # init variable in strtup program
+    # init variable in startup program
     scope = fluid.executor.global_scope()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -595,7 +595,7 @@ def get_static_double_grad(
     if program is None:
         program = fluid.default_main_program()
 
-    # init variable in strtup program
+    # init variable in startup program
     scope = fluid.executor.global_scope()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -657,7 +657,7 @@ def get_eager_double_grad(
         the second order derivative and the inputs of second order derivative's calculation
         will be returned for higher order derivative's calculation.
         If 'return_mid_result' set False.
-        A list of numpy array that stores second derivative result calulated by dygraph.
+        A list of numpy array that stores second derivative result calculated by dygraph.
     """
     if isinstance(place, fluid.CPUPlace):
         paddle.set_device("cpu")
@@ -684,7 +684,7 @@ def get_eager_double_grad(
     )
     d_inputs = [d_input for d_input in d_inputs if d_input is not None]
 
-    # calcluate second derivative
+    # calculate second derivative
     inputs = inputs + dys
     ddys = []
     if return_mid_result:
@@ -808,7 +808,7 @@ def get_static_triple_grad(
         program (Program|None): a Program with forward pass.
             If None, use fluid.default_main_program().
     Returns:
-        A list of numpy array that stores third derivative result calulated by static graph.
+        A list of numpy array that stores third derivative result calculated by static graph.
     """
     if program is None:
         program = fluid.default_main_program()
@@ -858,13 +858,13 @@ def get_eager_triple_grad(
         place (fluid.CPUPlace or fluid.CUDAPlace): the device.
         return_mid_result (list[Tensor], list[Tensor]): If set True, the
     Returns:
-        A list of numpy array that stores second derivative result calulated by dygraph
+        A list of numpy array that stores third derivative result calculated by dygraph
     """
     dd_y, dd_x = get_eager_double_grad(
         func, x_init, dy_init, place, return_mid_result=True
     )
 
-    # calcluate third derivative
+    # calculate third derivative
     dddys = []
     for dd_yi in dd_y:
         dd_yi.stop_gradient = False
diff --git a/test/legacy_test/hybrid_parallel_pp_alexnet.py b/test/legacy_test/hybrid_parallel_pp_alexnet.py
index d8e433456d1..dc461176da1 100644
--- a/test/legacy_test/hybrid_parallel_pp_alexnet.py
+++ b/test/legacy_test/hybrid_parallel_pp_alexnet.py
@@ -41,7 +41,7 @@ batch_size = 4
 micro_batch_size = 2
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -136,7 +136,7 @@ class TestDistPPTraning(unittest.TestCase):
             )
 
 
-class TestDistPPDelayScaleLoss(TestDistPPTraning):
+class TestDistPPDelayScaleLoss(TestDistPPTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -158,7 +158,7 @@ class TestDistPPDelayScaleLoss(TestDistPPTraning):
         fleet.init(is_collective=True, strategy=strategy)
 
 
-class TestDistPPMainGrad(TestDistPPTraning):
+class TestDistPPMainGrad(TestDistPPTraining):
     def wrapper_mix_precision(self, model, optimizer):
         model = MixPrecisionLayer(model, dtype="float16")
         optimizer = MixPrecisionOptimizer(optimizer)
diff --git a/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py b/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py
index febf4b5f87d..87167fd7318 100644
--- a/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py
+++ b/test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py
@@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.linear(inputs)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
diff --git a/test/legacy_test/parallel_dygraph_gradient_check.py b/test/legacy_test/parallel_dygraph_gradient_check.py
index a6c47b65d8f..5566b74f9aa 100644
--- a/test/legacy_test/parallel_dygraph_gradient_check.py
+++ b/test/legacy_test/parallel_dygraph_gradient_check.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         dist.init_parallel_env()
         self.trainer_id = dist.get_rank()
diff --git a/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py b/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py
index df66ff7616a..7cb8c326a3e 100644
--- a/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py
+++ b/test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         self.pg = dist.init_parallel_env()
diff --git a/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py b/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py
index 6db6e9e62a5..805bf251acc 100644
--- a/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py
+++ b/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py
@@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.linear(inputs)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
diff --git a/test/xpu/parallel_dygraph_gradient_check.py b/test/xpu/parallel_dygraph_gradient_check.py
index 9de687f524a..4f9ac5e3b98 100644
--- a/test/xpu/parallel_dygraph_gradient_check.py
+++ b/test/xpu/parallel_dygraph_gradient_check.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         dist.init_parallel_env()
         self.trainer_id = dist.get_rank()
diff --git a/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py b/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
index f0e46b2db27..497b4b025a2 100644
--- a/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
+++ b/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         self.trainer_id = dist.get_rank()
         self.pg = dist.init_parallel_env()
-- 
GitLab