Commit 4d094b0c (unverified)
Authored Aug 07, 2023 by co63oc; committed by GitHub on Aug 07, 2023

Fix typos (#56008)

Parent: c1913a5f
Showing 41 changed files with 68 additions and 68 deletions (+68 -68)
test/collective/fleet/hybrid_parallel_mp_amp.py  +2 -2
test/collective/fleet/hybrid_parallel_mp_bf16.py  +2 -2
test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py  +2 -2
test/collective/fleet/hybrid_parallel_mp_clip_grad.py  +2 -2
test/collective/fleet/hybrid_parallel_mp_fp16.py  +2 -2
test/collective/fleet/hybrid_parallel_mp_layers.py  +1 -1
test/collective/fleet/hybrid_parallel_mp_model.py  +3 -3
test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py  +3 -3
test/collective/fleet/hybrid_parallel_mp_random.py  +1 -1
test/collective/fleet/hybrid_parallel_pp_amp.py  +1 -1
test/collective/fleet/hybrid_parallel_pp_bf16.py  +1 -1
test/collective/fleet/hybrid_parallel_pp_clip_grad.py  +3 -3
test/collective/fleet/hybrid_parallel_pp_embedding.py  +1 -1
test/collective/fleet/hybrid_parallel_pp_fp16.py  +1 -1
test/collective/fleet/hybrid_parallel_pp_recompute.py  +1 -1
test/collective/fleet/hybrid_parallel_pp_save_load.py  +1 -1
test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py  +1 -1
test/collective/fleet/hybrid_parallel_pp_transformer.py  +1 -1
test/collective/fleet/hybrid_parallel_pp_transformer_save.py  +1 -1
test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py  +1 -1
test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py  +2 -2
test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py  +1 -1
test/collective/fleet/hybrid_parallel_qat.py  +1 -1
test/collective/fleet/hybrid_parallel_sharding_model.py  +1 -1
test/collective/fleet/hybrid_parallel_shared_weight.py  +1 -1
test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py  +1 -1
test/collective/fleet/test_fleet_static_mp_layers.py  +1 -1
test/legacy_test/auto_parallel_gpt_model.py  +5 -5
test/legacy_test/benchmark_sum_op.py  +2 -2
test/legacy_test/dist_fleet_ctr.py  +2 -2
test/legacy_test/dist_hapi_mnist_dynamic.py  +1 -1
test/legacy_test/dist_hapi_mnist_static.py  +1 -1
test/legacy_test/dist_hapi_pure_fp16_static.py  +2 -2
test/legacy_test/gradient_checker.py  +7 -7
test/legacy_test/hybrid_parallel_pp_alexnet.py  +3 -3
test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py  +1 -1
test/legacy_test/parallel_dygraph_gradient_check.py  +1 -1
test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py  +1 -1
test/xpu/parallel_dygraph_dataparallel_with_pylayer.py  +1 -1
test/xpu/parallel_dygraph_gradient_check.py  +1 -1
test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py  +1 -1
test/collective/fleet/hybrid_parallel_mp_amp.py
@@ -14,13 +14,13 @@
 import unittest
 
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 
 import paddle
 from paddle.distributed import fleet
 
 
-class TestMPClipGrad(TestDistMPTraning):
+class TestMPClipGrad(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
test/collective/fleet/hybrid_parallel_mp_bf16.py
@@ -14,14 +14,14 @@
 import unittest
 
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 
 import paddle
 from paddle.distributed import fleet
 from paddle.distributed.utils.nccl_utils import check_nccl_version_for_bf16
 
 
-class TestMPFP16(TestDistMPTraning):
+class TestMPFP16(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
test/collective/fleet/hybrid_parallel_mp_broadcast_obj.py
@@ -19,7 +19,7 @@ import numpy as np
 from hybrid_parallel_mp_model import (
     SimpleDPNet,
     SimpleMPNet,
-    TestDistMPTraning,
+    TestDistMPTraining,
     parallel_matmul,
     set_random_seed,
 )
@@ -58,7 +58,7 @@ class SimpleDPMultimodalNet(SimpleDPNet):
         return x
 
 
-class TestMPBroadcastObj(TestDistMPTraning):
+class TestMPBroadcastObj(TestDistMPTraining):
     def build_model_optimizer(self):
         hcg = fleet.get_hybrid_communicate_group()
         word_size = hcg.get_model_parallel_world_size()
test/collective/fleet/hybrid_parallel_mp_clip_grad.py
@@ -14,7 +14,7 @@
 import unittest
 
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 
 import paddle
@@ -22,7 +22,7 @@ import paddle
 # log.setLevel(logging.WARNING)
 
 
-class TestMPClipGrad(TestDistMPTraning):
+class TestMPClipGrad(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
test/collective/fleet/hybrid_parallel_mp_fp16.py
@@ -14,13 +14,13 @@
 import unittest
 
-from hybrid_parallel_mp_model import TestDistMPTraning
+from hybrid_parallel_mp_model import TestDistMPTraining
 
 import paddle
 from paddle.distributed import fleet
 
 
-class TestMPFP16(TestDistMPTraning):
+class TestMPFP16(TestDistMPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         scheduler = paddle.optimizer.lr.ExponentialDecay(
test/collective/fleet/hybrid_parallel_mp_layers.py
@@ -115,7 +115,7 @@ class SimpleEmbedding(paddle.nn.Layer):
         return output
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
test/collective/fleet/hybrid_parallel_mp_model.py
@@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
 
 
-class TestDistMPSyncTraning(unittest.TestCase):
+class TestDistMPSyncTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -348,7 +348,7 @@ class TestDistMPSyncTraning(unittest.TestCase):
         )
 
 
-class TestDistMPSyncModelTraning(TestDistMPSyncTraning):
+class TestDistMPSyncModelTraining(TestDistMPSyncTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -368,7 +368,7 @@ class TestDistMPSyncModelTraning(TestDistMPSyncTraning):
         fleet.init(is_collective=True, strategy=strategy)
 
 
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
test/collective/fleet/hybrid_parallel_mp_model_with_sequence_parallel.py
@@ -200,7 +200,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
 
 
-class TestDistSPSyncTraning(unittest.TestCase):
+class TestDistSPSyncTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -370,7 +370,7 @@ class TestDistSPSyncTraning(unittest.TestCase):
         )
 
 
-class TestDistSPSyncModelTraning(TestDistSPSyncTraning):
+class TestDistSPSyncModelTraining(TestDistSPSyncTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
@@ -390,7 +390,7 @@ class TestDistSPSyncModelTraning(TestDistSPSyncTraning):
         fleet.init(is_collective=True, strategy=strategy)
 
 
-class TestDistSPTraning(unittest.TestCase):
+class TestDistSPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
test/collective/fleet/hybrid_parallel_mp_random.py
@@ -20,7 +20,7 @@ import paddle
 from paddle.distributed import fleet
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
test/collective/fleet/hybrid_parallel_pp_amp.py
@@ -34,7 +34,7 @@ batch_size = 4
 micro_batch_size = 2
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
test/collective/fleet/hybrid_parallel_pp_bf16.py
@@ -35,7 +35,7 @@ batch_size = 4
 micro_batch_size = 2
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
test/collective/fleet/hybrid_parallel_pp_clip_grad.py
@@ -17,12 +17,12 @@ import unittest
 sys.path.append("../../legacy_test")
 
-from hybrid_parallel_pp_alexnet import TestDistPPTraning
+from hybrid_parallel_pp_alexnet import TestDistPPTraining
 
 import paddle
 
 
-class TestPPClipGrad(TestDistPPTraning):
+class TestPPClipGrad(TestDistPPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
         scheduler = paddle.optimizer.lr.PiecewiseDecay(
@@ -36,7 +36,7 @@ class TestPPClipGrad(TestDistPPTraning):
         return scheduler, optimizer
 
 
-class TestPPClipGradParamGroup(TestDistPPTraning):
+class TestPPClipGradParamGroup(TestDistPPTraining):
     def build_optimizer(self, model):
         grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
         scheduler = paddle.optimizer.lr.PiecewiseDecay(
test/collective/fleet/hybrid_parallel_pp_embedding.py
@@ -120,7 +120,7 @@ class SimpleNetPipe(Layer):
         return feat
 
 
-class TestDistEmbeddingTraning(unittest.TestCase):
+class TestDistEmbeddingTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
test/collective/fleet/hybrid_parallel_pp_fp16.py
@@ -38,7 +38,7 @@ batch_size = 4
 micro_batch_size = 2
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
test/collective/fleet/hybrid_parallel_pp_recompute.py
@@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer):
         )
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
test/collective/fleet/hybrid_parallel_pp_save_load.py
@@ -30,7 +30,7 @@ micro_batch_size = 2
 vocab_size = 128
 
 
-class TestDistPPSaveLoadTraning(unittest.TestCase):
+class TestDistPPSaveLoadTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
test/collective/fleet/hybrid_parallel_pp_save_load_with_virtual_stage.py
@@ -33,7 +33,7 @@ micro_batch_size = 2
 vocab_size = 128
 
 
-class TestDistPPSaveLoadTraning(unittest.TestCase):
+class TestDistPPSaveLoadTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
test/collective/fleet/hybrid_parallel_pp_transformer.py
@@ -138,7 +138,7 @@ class ModelPipe(PipelineLayer):
         )
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
test/collective/fleet/hybrid_parallel_pp_transformer_save.py
@@ -29,7 +29,7 @@ vocab_size = 128
 transformer_layer_num = 8
 
 
-class TestDistPPSaveTraning(unittest.TestCase):
+class TestDistPPSaveTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
test/collective/fleet/hybrid_parallel_pp_transformer_save_with_virtual_stage.py
@@ -33,7 +33,7 @@ vocab_size = 128
 transformer_layer_num = 8
 
 
-class TestDistPPSaveTraning(unittest.TestCase):
+class TestDistPPSaveTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
test/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py
@@ -17,7 +17,7 @@ import unittest
 import numpy as np
 from hybrid_parallel_pp_transformer import (
     ModelPipe,
-    TestDistPPTraning,
+    TestDistPPTraining,
     batch_size,
     length,
     micro_batch_size,
@@ -30,7 +30,7 @@ import paddle.distributed as dist
 from paddle.distributed import fleet
 
 
-class TestDistPPTraningUnbalancedData(TestDistPPTraning):
+class TestDistPPTrainingUnbalancedData(TestDistPPTraining):
     def test_pp_model(self):
         hcg = fleet.get_hybrid_communicate_group()
         word_size = hcg.get_model_parallel_world_size()
test/collective/fleet/hybrid_parallel_pp_transformer_with_virtual_stage.py
@@ -137,7 +137,7 @@ class ModelPipe(PipelineLayer):
         )
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
test/collective/fleet/hybrid_parallel_qat.py
@@ -235,7 +235,7 @@ class SimpleDPNet(nn.Layer):
         return x
 
 
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 2
test/collective/fleet/hybrid_parallel_sharding_model.py
@@ -180,7 +180,7 @@ class SimpleDPNet(paddle.nn.Layer):
         return x
 
 
-class TestDistMPTraning(unittest.TestCase):
+class TestDistMPTraining(unittest.TestCase):
     def setUp(self):
         random.seed(2021)
         np.random.seed(2021)
test/collective/fleet/hybrid_parallel_shared_weight.py
@@ -152,7 +152,7 @@ class SimpleNetPipe(PipelineLayer):
         super().__init__(layers=self.descs, loss_fn=LossNet(), **kwargs)
 
 
-class TestDistEmbeddingTraning(unittest.TestCase):
+class TestDistEmbeddingTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
test/collective/fleet/parallel_dygraph_no_sync_gradient_check.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
test/collective/fleet/test_fleet_static_mp_layers.py
@@ -66,7 +66,7 @@ class EmbeddingNet(paddle.nn.Layer):
         return output
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def setUp(self):
         os.environ["PADDLE_TRAINER_ID"] = "2"
         os.environ[
test/legacy_test/auto_parallel_gpt_model.py
@@ -35,7 +35,7 @@ def init_global():
 
 class MultiHeadAttention(nn.Layer):
     """
-    Attention mapps queries and a set of key-value pairs to outputs, and
+    Attention maps queries and a set of key-value pairs to outputs, and
     Multi-Head Attention performs multiple parallel attention to jointly attending
     to information from different representation subspaces.
     """
@@ -114,7 +114,7 @@ class MultiHeadAttention(nn.Layer):
 
     def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
         """
-        Prapares linear projected queries, keys and values for usage of subsequnt
+        Prepares linear projected queries, keys and values for usage of subsequent
         multiple parallel attention. If `cache` is not None, using cached results
         to reduce redundant calculations.
         """
@@ -203,7 +203,7 @@ class MultiHeadAttention(nn.Layer):
 
     def gen_cache(self, key, value=None, type=Cache):
         """
-        Generates cache for `forward` usage in inference accroding to arguments.
+        Generates cache for `forward` usage in inference according to arguments.
         The generated cache is an instance of `MultiHeadAttention.Cache` or an
         instance of `MultiHeadAttention.StaticCache`.
         """
@@ -573,7 +573,7 @@ class GPTEmbeddings(nn.Layer):
         ones = paddle.ones_like(input_ids, dtype="int64")
         seq_length = paddle.cumsum(ones, axis=-1)
         position_ids = seq_length - ones
-        input_embedings = self.word_embeddings(input_ids)
+        input_embeddings = self.word_embeddings(input_ids)
         if _global_parallel_strategy == "mp":
             auto.shard_tensor(
                 self.word_embeddings.weight, _global_process_mesh, ["x", None]
@@ -592,7 +592,7 @@ class GPTEmbeddings(nn.Layer):
             )
         position_embeddings = self.position_embeddings(position_ids)
-        embeddings = input_embedings + position_embeddings
+        embeddings = input_embeddings + position_embeddings
         embeddings = self.dropout(embeddings)
         return embeddings
test/legacy_test/benchmark_sum_op.py
@@ -59,7 +59,7 @@ class TestSumOp(BenchmarkSuite):
 
     def test_timeit_output(self):
         """
-        perf the op, time cost will be averged in iters.
+        perf the op, time cost will be averaged in iters.
         output example
         >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
         >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
@@ -68,7 +68,7 @@ class TestSumOp(BenchmarkSuite):
 
     def test_timeit_grad(self):
         """
-        perf the op gradient, time cost will be averged in iters.
+        perf the op gradient, time cost will be averaged in iters.
         output example
         >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
         >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
test/legacy_test/dist_fleet_ctr.py
@@ -129,7 +129,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
         dnn_out = fc
 
         # build lr model
-        lr_embbding = paddle.static.nn.embedding(
+        lr_embedding = paddle.static.nn.embedding(
             is_distributed=False,
             input=lr_data,
             size=[lr_input_dim, 1],
@@ -141,7 +141,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
             padding_idx=0,
         )
         lr_pool = paddle.static.nn.sequence_lod.sequence_pool(
-            input=lr_embbding.squeeze(-2), pool_type="sum"
+            input=lr_embedding.squeeze(-2), pool_type="sum"
         )
         merge_layer = paddle.concat([dnn_out, lr_pool], axis=1)
test/legacy_test/dist_hapi_mnist_dynamic.py
@@ -52,7 +52,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_dynamic_multiple_gpus(self):
         device = set_device('gpu')
test/legacy_test/dist_hapi_mnist_static.py
@@ -52,7 +52,7 @@ def compute_accuracy(pred, gt):
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_static_multiple_gpus(self):
         paddle.enable_static()
         device = set_device('gpu')
test/legacy_test/dist_hapi_pure_fp16_static.py
@@ -26,10 +26,10 @@ from paddle.vision.models import LeNet
 @unittest.skipIf(
     not fluid.is_compiled_with_cuda(), 'CPU testing is not supported'
 )
-class TestDistTraningWithPureFP16(unittest.TestCase):
+class TestDistTrainingWithPureFP16(unittest.TestCase):
     def test_amp_training_purefp16(self):
         if not fluid.is_compiled_with_cuda():
-            self.skipTest('module not tested when ONLY_CPU compling')
+            self.skipTest('module not tested when ONLY_CPU compiling')
         data = np.random.random(size=(4, 1, 28, 28)).astype(np.float32)
         label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64)
test/legacy_test/gradient_checker.py
@@ -269,7 +269,7 @@ def grad_check(
     if program is None:
         program = fluid.default_main_program()
 
-    # init variable in strtup program
+    # init variable in startup program
     scope = fluid.executor.global_scope()
    exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -595,7 +595,7 @@ def get_static_double_grad(
     if program is None:
         program = fluid.default_main_program()
 
-    # init variable in strtup program
+    # init variable in startup program
     scope = fluid.executor.global_scope()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -657,7 +657,7 @@ def get_eager_double_grad(
         the second order derivative and the inputs of second order derivative's calculation
         will be returned for higher order derivative's calculation.
         If 'return_mid_result' set False.
-        A list of numpy array that stores second derivative result calulated by dygraph.
+        A list of numpy array that stores second derivative result calculated by dygraph.
     """
     if isinstance(place, fluid.CPUPlace):
         paddle.set_device("cpu")
@@ -684,7 +684,7 @@ def get_eager_double_grad(
     )
     d_inputs = [d_input for d_input in d_inputs if d_input is not None]
 
-    # calcluate second derivative
+    # calculate second derivative
     inputs = inputs + dys
     ddys = []
     if return_mid_result:
@@ -808,7 +808,7 @@ def get_static_triple_grad(
         program (Program|None): a Program with forward pass.
             If None, use fluid.default_main_program().
     Returns:
-        A list of numpy array that stores third derivative result calulated by static graph.
+        A list of numpy array that stores third derivative result calculated by static graph.
     """
     if program is None:
         program = fluid.default_main_program()
@@ -858,13 +858,13 @@ def get_eager_triple_grad(
         place (fluid.CPUPlace or fluid.CUDAPlace): the device.
        return_mid_result (list[Tensor], list[Tensor]): If set True, the
     Returns:
-        A list of numpy array that stores second derivative result calulated by dygraph
+        A list of numpy array that stores second derivative result calculated by dygraph
     """
     dd_y, dd_x = get_eager_double_grad(
         func, x_init, dy_init, place, return_mid_result=True
     )
 
-    # calcluate third derivative
+    # calculate third derivative
     dddys = []
     for dd_yi in dd_y:
         dd_yi.stop_gradient = False
test/legacy_test/hybrid_parallel_pp_alexnet.py
@@ -41,7 +41,7 @@ batch_size = 4
 micro_batch_size = 2
 
 
-class TestDistPPTraning(unittest.TestCase):
+class TestDistPPTraining(unittest.TestCase):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -136,7 +136,7 @@ class TestDistPPTraning(unittest.TestCase):
         )
 
 
-class TestDistPPDelayScaleLoss(TestDistPPTraning):
+class TestDistPPDelayScaleLoss(TestDistPPTraining):
     def setUp(self):
         strategy = fleet.DistributedStrategy()
         self.model_parallel_size = 1
@@ -158,7 +158,7 @@ class TestDistPPDelayScaleLoss(TestDistPPTraning):
         fleet.init(is_collective=True, strategy=strategy)
 
 
-class TestDistPPMainGrad(TestDistPPTraning):
+class TestDistPPMainGrad(TestDistPPTraining):
     def wrapper_mix_precision(self, model, optimizer):
         model = MixPrecisionLayer(model, dtype="float16")
         optimizer = MixPrecisionOptimizer(optimizer)
test/legacy_test/parallel_dygraph_dataparallel_with_pylayer.py
@@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.linear(inputs)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
test/legacy_test/parallel_dygraph_gradient_check.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         dist.init_parallel_env()
         self.trainer_id = dist.get_rank()
test/legacy_test/parallel_dygraph_gradient_check_in_eager_mode.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_gpus(self):
         self.trainer_id = dist.get_rank()
         self.pg = dist.init_parallel_env()
test/xpu/parallel_dygraph_dataparallel_with_pylayer.py
@@ -62,7 +62,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.linear(inputs)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         self.trainer_id = dist.get_rank()
         dist.init_parallel_env()
test/xpu/parallel_dygraph_gradient_check.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         dist.init_parallel_env()
         self.trainer_id = dist.get_rank()
test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
@@ -64,7 +64,7 @@ class SimpleNet(paddle.nn.Layer):
         return self.share_net(tmp)
 
 
-class TestDistTraning(unittest.TestCase):
+class TestDistTraining(unittest.TestCase):
     def test_multiple_xpus(self):
         self.trainer_id = dist.get_rank()
         self.pg = dist.init_parallel_env()