diff --git a/python/paddle/audio/backends/backend.py b/python/paddle/audio/backends/backend.py index 86d15953af3f63dc3db50e61bb9206d3130d57ea..f10c291a54f30c1532ccf5702f48246cc0553544 100644 --- a/python/paddle/audio/backends/backend.py +++ b/python/paddle/audio/backends/backend.py @@ -126,7 +126,7 @@ def save( filepath: saved path src: the audio tensor sample_rate: the number of samples of audio per second. - channels_first: src channel infomation + channels_first: src channel information if True, means input tensor is (channels, time) if False, means input tensor is (time, channels) encoding:encoding format, wave_backend only support PCM16 now. diff --git a/python/paddle/audio/datasets/esc50.py b/python/paddle/audio/datasets/esc50.py index 412c3916bf1e9faff521495495e3aa639c2800c3..3f2da64df4210ef43c5f63c54e7739132a795b94 100644 --- a/python/paddle/audio/datasets/esc50.py +++ b/python/paddle/audio/datasets/esc50.py @@ -37,7 +37,7 @@ class ESC50(AudioClassificationDataset): Args: mode (str, optional): It identifies the dataset mode (train or dev). Default:train. split (int, optional): It specify the fold of dev dataset. Default:1. - feat_type (str, optional): It identifies the feature type that user wants to extrace of an audio file. Default:raw. + feat_type (str, optional): It identifies the feature type that the user wants to extract from an audio file. Default:raw. archive(dict, optional): it tells where to download the audio archive. Default:None. Returns: diff --git a/python/paddle/audio/datasets/tess.py b/python/paddle/audio/datasets/tess.py index 6ded358d054a448f07a46885c5b9561264743cfc..4c150c8c2102887153e25af7d50176a7efc4990c 100644 --- a/python/paddle/audio/datasets/tess.py +++ b/python/paddle/audio/datasets/tess.py @@ -39,7 +39,7 @@ class TESS(AudioClassificationDataset): mode (str, optional): It identifies the dataset mode (train or dev). Defaults to train. n_folds (int, optional): Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset. Defaults to 5. split (int, optional): It specify the fold of dev dataset. Defaults to 1. - feat_type (str, optional): It identifies the feature type that user wants to extrace of an audio file. Defaults to raw. + feat_type (str, optional): It identifies the feature type that the user wants to extract from an audio file. Defaults to raw. archive(dict): it tells where to download the audio archive. Defaults to None. Returns: diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index ce017ef98540d3165a0e82e469f3bb303eb88b98..81a2dadf53debbad304c0098aa1ca03f54eb83b6 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -34,7 +34,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False): retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter - :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient. + :code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient. Defaults to False.
Returns: @@ -79,7 +79,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False): assert in_out_list is not None, "{} should not be None".format(name) if isinstance(in_out_list, (list, tuple)): - assert len(in_out_list) > 0, "{} connot be empyt".format(name) + assert len(in_out_list) > 0, "{} cannot be empty".format(name) for each_var in in_out_list: assert isinstance( each_var, (paddle.Tensor, core.eager.Tensor) diff --git a/python/paddle/autograd/saved_tensors_hooks.py b/python/paddle/autograd/saved_tensors_hooks.py index 8906dd98b49916296edb72e404e8dcc765557cc4..d2be6b5e6bf52bb06aa1cedf3e6d87b0c88b1cc3 100644 --- a/python/paddle/autograd/saved_tensors_hooks.py +++ b/python/paddle/autograd/saved_tensors_hooks.py @@ -29,7 +29,7 @@ class saved_tensors_hooks: of the original tensor. `pack_hook` will also be called while any tensor need be saved by `PyLayerContext.save_for_backward`. If a tensor saved for backward is no need buffer, `pack_hook` will not be called. - Only the thensor saved for backward is LoDTensor, `pack_hook` will be + Only the tensor saved for backward is LoDTensor, `pack_hook` will be called. unpack_hook (function): The unpack hook will be called every time the backward need use the saved inputs/outputs tensors. Then you can reload diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py index 12929475eccbe25ef92fac76fbdfa230a61000e0..3bddf1aa66d696d1860ca0efe75635c4f82f1e0f 100644 --- a/python/paddle/dataset/conll05.py +++ b/python/paddle/dataset/conll05.py @@ -95,8 +95,8 @@ def corpus_reader(data_path, words_name, props_name): if len(label) == 0: # end of sentence for i in range(len(one_seg[0])): - a_kind_lable = [x[i] for x in one_seg] - labels.append(a_kind_lable) + a_kind_label = [x[i] for x in one_seg] + labels.append(a_kind_label) if len(labels) >= 1: verb_list = [] diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py index 6d6de32096ec1ee2422d0d2db3ae82ead64690e5..cac16f371057f775503ba570475b71ded53f6007 100644 --- a/python/paddle/dataset/flowers.py +++ b/python/paddle/dataset/flowers.py @@ -14,7 +14,7 @@ """ This module will download dataset from http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html -and parse train/test set intopaddle reader creators. +and parse train/test dataset into paddle reader creators. This set contains images of flowers belonging to 102 different categories. The images were acquired by searching the web and taking pictures.
There are a diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index 7236bdbf53978e9d903c65a35899c657418b3f63..23fababd09447557c8d8e4eab7d5814229f58c01 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -89,9 +89,9 @@ def batch_images_from_tar( :type data_file: string :param dataset_name: 'train','test' or 'valid' :type dataset_name: string - :param img2label: a dic with image file name as key + :param img2label: a dict with image file name as key and image's label as value - :type img2label: dic + :type img2label: dict :param num_per_batch: image number per batch file :type num_per_batch: int :return: path of list file containing paths of batch file diff --git a/python/paddle/dataset/imikolov.py b/python/paddle/dataset/imikolov.py index a4c0e10d9c111e58e5e6423fb533bb985395f304..0064475e41bb0cb5cc095065447236053ccb9ae5 100644 --- a/python/paddle/dataset/imikolov.py +++ b/python/paddle/dataset/imikolov.py @@ -108,7 +108,7 @@ def reader_creator(filename, word_idx, n, data_type): continue yield src_seq, trg_seq else: - assert False, 'Unknow data type' + assert False, 'Unknown data type' return reader diff --git a/python/paddle/distributed/auto_parallel/cluster_v2.py b/python/paddle/distributed/auto_parallel/cluster_v2.py index 06cc8d0e47038d5e91130e2ef10dff642a173189..7770aa3ab47a810e92718a4ba3f664d3b281e948 100644 --- a/python/paddle/distributed/auto_parallel/cluster_v2.py +++ b/python/paddle/distributed/auto_parallel/cluster_v2.py @@ -48,7 +48,7 @@ class DeviceMesh(core.DeviceMesh): The class `DeviceMesh` describes the topology of physical devices. Args: - mesh (list|numpy.array): an N-dimensional array describes the toplogy + mesh (list|numpy.array): an N-dimensional array describes the topology of logical processes. dim_names (list, optional): the i-th element of this list gives the name of the i-th dimension. 
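Illustrative note (not part of the patch): the `DeviceMesh` docstring above and the `ProcessMesh` docstring later in this diff both describe a topology as an N-dimensional `mesh` array of process ids plus optional `dim_names`. Below is a minimal sketch of building such a topology with the public `paddle.distributed.ProcessMesh` API; the argument names are taken from the Args lists in these docstrings, and the exact import path or signature may differ across Paddle releases.

```python
import paddle.distributed as dist

# Eight logical processes laid out as a 2 x 4 mesh. The first mesh
# dimension ("dp") could drive data parallelism and the second ("mp")
# tensor/model parallelism; `mesh` holds the process ids and `dim_names`
# labels the dimensions, matching the Args described in the docstrings.
mesh = [[0, 1, 2, 3],
        [4, 5, 6, 7]]
process_mesh = dist.ProcessMesh(mesh=mesh, dim_names=["dp", "mp"])
print(process_mesh)  # only records the topology; no processes are launched
```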
diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 8979239df5f11c7344e1127fba5e3c53fa709830..fcd767f53b3c457ba0e1364d1065840b18117a5d 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -257,7 +257,7 @@ class Completer: tensor_desc.name(), compatible_dims_mapping ) changed = True - # Find the most compatible implemenetations from the distributed operator + # Find the most compatible implementations from the distributed operator op_dist_impls = find_compatible_distributed_operator_impls( dist_op, fwd=True ) @@ -329,7 +329,7 @@ class Completer: tensor_desc.name(), compatible_dims_mapping ) changed = True - # Find the most compatible implemenetations from the distributed operator + # Find the most compatible implementations from the distributed operator op_dist_impls = find_compatible_distributed_operator_impls( dist_op, fwd=False ) @@ -685,7 +685,7 @@ class Completer: cond_tensor_related_nodes.extend( _find_nodes_related_to_cond(cond_tensor_node) ) - # Step 2.3: Add the StepScops output of while_op + # Step 2.3: Add the StepScopes output of while_op stepscopes_tensor_name = while_op_node.op().output("StepScopes")[0] stepscopes_tensor_node = None for output_node in while_op_node.outputs: @@ -1397,7 +1397,7 @@ class Completer: ) forward_var = vars[forward_var_name] - # TODO complete other attribte for grad var + # TODO complete other attribute for grad var tensor_dist_attr = TensorDistAttr() process_mesh = ( self._dist_context.get_tensor_dist_attr_for_program( diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 8ee92eb958a07f4de420dc161035e0b7d3e4b0ea..b8a7bec6dad51d9f5804f7945daaa7274e21261b 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -1047,7 +1047,7 @@ class DistributedOperatorContext: # NOTE Support correct parallelism for high-order differential model. # by default exceed_backward_init_op is False and it means we are in Forward phase; After exceed_backward_init_op = True, # it means we are in Backward phase. - # And the final sulotion should be revise high-order differential logic for these two phases in future. + # And the final solution should be to revise the high-order differential logic for these two phases in the future. self._exceed_backward_init_op = False def __deepcopy__(self, memo): diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py index 7da0dc7dacd70479f7e200262c18d18499b05cce..5af0dd12f3ff918a5a9f82a21b7602d274496430 100644 --- a/python/paddle/distributed/auto_parallel/dist_loader.py +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -146,7 +146,7 @@ class DistributedDataLoaderFromGenerator(DistributedDataLoaderBase): steps_per_epoch = len(self.dataset) // self.batch_size except: raise ValueError( - "Pleace set `steps_per_epoch` or implement `__len__` methond in dataset class." + "Please set `steps_per_epoch` or implement `__len__` method in dataset class."
) return steps_per_epoch diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py index 89fd71df129c1ca6ebcc859f323601a3bdf1ce1d..7e64c7a56f03fdd382e54ef6dc4aa12c8695ef33 100644 --- a/python/paddle/distributed/auto_parallel/dist_op.py +++ b/python/paddle/distributed/auto_parallel/dist_op.py @@ -328,7 +328,7 @@ class DistributedOperatorHelper: elif isinstance(output, Variable): new_output = [output] else: - raise ValueError("Unrecognized outpout.") + raise ValueError("Unrecognized output.") if self._out_dims_mappings: assert len(new_output) == len( diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index f098564a3082dbcbc6f429a3c664904d0da487d4..fe0af50768b7aa0993de5e2732344693399773a3 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -247,7 +247,7 @@ class Engine: labels = sample[split:] else: raise TypeError( - "Data should be a Dataset or IterableDatset, but received {}.".format( + "Data should be a Dataset or IterableDataset, but received {}.".format( type(data).__name__ ) ) @@ -699,7 +699,7 @@ class Engine: def _parallel(self, mode, all_ranks=False): # Parallelize program based on the planner's results # For now, the completer has to be passed to the planner, - # because we may use it to complete the annotation of the backwarkward and update. + # because we may use it to complete the annotation of the backward and update. parallelizer = Parallelizer( mode, self._planners[mode].completer, diff --git a/python/paddle/distributed/auto_parallel/interface.py b/python/paddle/distributed/auto_parallel/interface.py index 333ef1d5b162e551b87282c2538979f2ea1d7c15..2072c2923cd9fb687e7b5fae3052ca5572983540 100644 --- a/python/paddle/distributed/auto_parallel/interface.py +++ b/python/paddle/distributed/auto_parallel/interface.py @@ -117,15 +117,15 @@ def shard_op(op, process_mesh=None, in_shard_specs=None, out_shard_specs=None): will be used. And an error will be raised if the current process mesh cannot be found. Default: None. in_shard_specs (list of list, optional): a list of list to describe the sharding specifications - for the inputs. Each item of `in_shard_specs` is a `shard_spec` between the correspoinding input - and `process_mesh`. If one item is None, the cooresponding input is replicated across all processes - If it is None, all inputs are replicated across all processes. Note that the lenght of the + for the inputs. Each item of `in_shard_specs` is a `shard_spec` between the corresponding input + and `process_mesh`. If one item is None, the corresponding input is replicated across all processes + If it is None, all inputs are replicated across all processes. Note that the length of the `in_shard_specs` should be equal to the actual number of inputs when calling this operation. Default: None. out_shard_specs (list of list, optional): a list of list to describe the sharding specifications - for the outputs. Each item of `out_shard_specs` is a `shard_spec` between the correspoinding output - and `process_mesh`. If one item is None, the cooresponding output is replicated across all processes - If it is None, all outputs are replicated across all processes. Note that the lenght of the + for the outputs. Each item of `out_shard_specs` is a `shard_spec` between the corresponding output + and `process_mesh`. 
If one item is None, the corresponding output is replicated across all processes + If it is None, all outputs are replicated across all processes. Note that the length of the `in_shard_specs` should be equal to the actual number of inputs when calling this operation. Default: None. Default: None. diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index faddf1542f8d942250bfa26fd95cf53485943745..ef9292e48bed783d754f77a9a588038d8cd98bb2 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -191,7 +191,7 @@ def register_distributed_operator_impl(op_type, dist_impl): def find_compatible_distributed_operator_impls(dist_op, fwd=True, partial=True): """ - Here just return the first compatible implemention. + Here just return the first compatible implementation. This will be improved by cost model in the future. """ op_type = dist_op.serial_op.type diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index b95bdb38b61295c2d666dfc7ee708706524ce8b8..cb4060b2593eedbda9bc627927e48909703ec87b 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -78,7 +78,7 @@ def adopt_lookup_table_v1(ctx, main_block, src_op, Ids_var): ) if not Ids_var.stop_gradient: raise NotImplementedError( - 'Requiring the gradient of Ids of lookup_table(v1)dist op is not currently supported. Please open an issue with details on your use case so that we can prioritize adding this (for instance, adversarial training for language model).' + 'Requiring the gradient of Ids of lookup_table(v1) dist op is not currently supported. Please open an issue with details on your use case so that we can prioritize adding this (for instance, adversarial training for language model).' 
) target_shape = list(Ids_var.shape[:-1]) @@ -405,7 +405,7 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl): ctx, op_dist_attr.process_mesh, rank_id ) - # A generalized method to caculate embedding offset using cartisian product + # A generalized method to calculate embedding offset using cartesian product relative_idx = _get_idx_in_axis( process_mesh_group, process_mesh_shape, @@ -416,7 +416,7 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl): per_part_size = Weight_var.shape[0] relative_idx = relative_idx * per_part_size - # TODO caculate ring id + # TODO calculate ring id parallel_axis = embedding_row_dim_mapping group_ranks = _get_comm_group( process_mesh_group, process_mesh_shape, parallel_axis, rank_id @@ -544,7 +544,7 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl): process_mesh = param_dist_attr.process_mesh dim_mapping = param_dist_attr.dims_mapping - # NOTE all not splited axis should be presented in mesh + # NOTE all axes that are not split should be present in the mesh for axis, size in enumerate(process_mesh.shape): if size <= 1 or axis in dim_mapping: pass @@ -632,7 +632,7 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl): process_mesh_shape = dist_attr.process_mesh.shape process_mesh_group = dist_attr.process_mesh.process_ids - # A generalized method to caculate embedding offset using cartisian product + # A generalized method to calculate embedding offset using cartesian product relative_idx = _get_idx_in_axis( process_mesh_group, process_mesh_shape, diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py index 16eee22f33baf2e074f14dd9cc2b22a94ea7e882..f8e3014d6b6f7012f524c8072b3d183edff04ea6 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py @@ -114,7 +114,7 @@ class DistributedFillConstantBatchSizeLikeImpl0(DistributedOperatorImpl): x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) - # only the batch size dimemsion of input and output are relative. + # only the batch size dimension of input and output are relative.
dim_changed = compute_compatible_and_update_dim_mapping( [x_dims_mapping, out_dims_mapping], [0, 0] ) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 62f8577ff3e87cfae9f0fb08db0b9c49852719de..8266036c4ec8baa57edb2b8bace4cdeac36d92a0 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -377,7 +377,7 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): # assert len( # Y_var_dim_mapping - # ) == 2, "dist matmual only support Y operand with 2 dims now but Y({})'s dim is [{}]".format( + # ) == 2, "dist matmul only support Y operand with 2 dims now but Y({})'s dim is [{}]".format( # Y_var.name, Y_var_dim_mapping) Y_var_partitioned = False for dim in Y_var_dim_mapping: diff --git a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py index 518ba4814bfd79104f6fdf8d15df3c6ea6e2b8b7..b45c9168b38255016da70bd73b2acef3c90d3434 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py @@ -51,14 +51,14 @@ class DistributedPNormImpl0(DistributedOperatorImpl): 1. axis == None, isinstance(p, (int, float)), asvector = True 1.1 x_dims_mapping == [0, -1, -1] - allgather input if it is splited by dp group + allgather input if it is split by dp group 1.2 x_dims_mapping == [-1, 0, -1] - allgather, split and concat input if it is splited by mp group + allgather, split and concat input if it is split by mp group 2. isinstance(axis, int), asvector = False 1.1 axis == 0 and x_dims_mapping == [0, -1, -1] allgather input if it's input[0] is splited by dp group. 1.2 axis == 1 and x_dims_mapping == [-1, 0, -1] - allgather, split and concat input if it's input[1] is splited by mp group + allgather, split and concat input if it's input[1] is split by mp group """ def __init__(self, name): diff --git a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py index 2833fb358147f7317d0d6c40f88f892e79959722..5ed37e6d57005363e2c9da645f61a94a376d9498 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py @@ -67,7 +67,7 @@ class DistributedUpdateLossScalingImpl(DistributedOperatorImpl): @staticmethod def backward(ctx, *args, **kwargs): - # the backward function only filte the gradient with current rank id + # the backward function only filters the gradient with current rank id dist_op_context = ctx.dist_op_context main_block = dist_op_context.main_block backward_op = dist_op_context.cur_src_op diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index e665d5c43d963e4ce8824a264ca4015c7fbf8e65..d2463f33086376a7ef184f6e8a3e9f39611edc49 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -55,9 +55,9 @@ class AutoParallelizer: AutoParallelizer is the main controller class to do the auto parallel process. And the auto parallel process will be triggered in the wrapped parallelize function.
To facilitate the auto parallelization, it will contain information about program, cluster and the - related context. In this basic version, the program information will be retrevied from - Fleet object, and the cluster information can be retrevied in the new created Cluster object, - and the context information can be retrevied in the new created DistributedContext. + related context. In this basic version, the program information will be retrieved from + Fleet object, and the cluster information can be retrieved in the newly created Cluster object, + and the context information can be retrieved in the newly created DistributedContext. """ def __init__(self, fleet): diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 03d744a6c0491dfe27a04f64576e67d82a3a2647..2dcd73163eecd742f610b7710845f54bcfc5a4fb 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -251,7 +251,7 @@ class Partitioner: serial_ops[idx].desc.original_id() ] = serial_ops[idx] - # partiiton + # partition appended_grad_times = 0 for idx, op in enumerate(serial_ops): @@ -263,7 +263,7 @@ class Partitioner: if not op_dist_attr.is_recompute: appended_grad_times += 1 - # partititon input variables + # partition input variables for serial_input_varname in op.desc.input_arg_names(): if ( serial_input_varname diff --git a/python/paddle/distributed/auto_parallel/planner_v2.py b/python/paddle/distributed/auto_parallel/planner_v2.py index 5c9b7233b890e3828a1ad75a507f1417261d5b67..771faf4414f3d226c6d6ecc99d90461a52646b90 100755 --- a/python/paddle/distributed/auto_parallel/planner_v2.py +++ b/python/paddle/distributed/auto_parallel/planner_v2.py @@ -29,10 +29,10 @@ class Planner: self._dist_context._dist_op_context = default_ctx.dist_op_context self._dist_context.data_parallel = default_ctx.data_parallel if not is_naive_data_parallel(self._dist_context): - # Use SSA graph for complex parallism + # Use SSA graph for complex parallelism self._dist_context.initialize(with_graph=True) else: - # Use program for data parallel parallism + # Use program for data parallel parallelism self._dist_context.initialize(with_graph=False) self._completer = Completer(self._dist_context) diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 1debbaa325638886da874eadc1900c5954f3e4da..7f5ed62fa4efec152ba208f80fea4a5bfc053723 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -57,7 +57,7 @@ def new_process_group(ranks, group_id=None, force_new_group=False): cur_key = ''.join(map(str, sorted(pg.ranks))) if pg_id != 0 and new_key == cur_key: return pg - # If not matching the existing one, construt a new process group + # If not matching the existing one, construct a new process group num_groups = len(_g_process_group_map) # Note: our process group may interfere with the original implementation # so the created group id should start from the original _new_ring_id() diff --git a/python/paddle/distributed/auto_parallel/process_mesh_v2.py b/python/paddle/distributed/auto_parallel/process_mesh_v2.py index a9e66e20c86a124391c4c58fd4d5768f72cea796..23fe66ab4bd28dd0104ec7f413a25d230299b166 100644 --- a/python/paddle/distributed/auto_parallel/process_mesh_v2.py +++ b/python/paddle/distributed/auto_parallel/process_mesh_v2.py @@ -22,7 +22,7 @@ class
ProcessMesh(core.ProcessMesh): The class `Processmesh` describes the topology of logical processes. Args: - mesh (list|numpy.array): an N-dimensional array describes the toplogy + mesh (list|numpy.array): an N-dimensional array describes the topology of logical processes. dim_names (list, optional): the i-th element of this list gives the name of the i-th dimension. diff --git a/python/paddle/distributed/auto_parallel/strategy.py b/python/paddle/distributed/auto_parallel/strategy.py index eb2e09b3a264316be91545c68a1056003510fbc6..41ddad975d5e5d701ea9851c6a72fa9d0bfeb5fa 100644 --- a/python/paddle/distributed/auto_parallel/strategy.py +++ b/python/paddle/distributed/auto_parallel/strategy.py @@ -35,7 +35,7 @@ class BaseConfig: for field, default_value in config.items(): setattr(self, field, default_value) - # Overide attributes by the config_dict + # Override attributes by the config_dict if self._config_dict: self.from_dict(self._config_dict) @@ -128,7 +128,7 @@ class FusedPassesConfig(BaseConfig): class Strategy(BaseConfig): """ - The `Strategy` object is used to configure the paralleization and optimization beheviors. + The `Strategy` object is used to configure the parallelization and optimization behaviors. Args: config (dict|string, optional): If this is None, the default configurations will used. diff --git a/python/paddle/distributed/auto_parallel/tuner/algorithms.py b/python/paddle/distributed/auto_parallel/tuner/algorithms.py index 74e8f3e9ee3f1d7e3fd57fb1ddf0d8961fa2ceb9..4e5850dcae80de600df5c70fb9f456d74bae27bd 100644 --- a/python/paddle/distributed/auto_parallel/tuner/algorithms.py +++ b/python/paddle/distributed/auto_parallel/tuner/algorithms.py @@ -23,7 +23,7 @@ from .trial import TrialStatus class AlgorithmBase(ABC): """ - An Tuning alogrithm is a class to find out an optimal configuration + A tuning algorithm is a class to find out an optimal configuration given the selected tuning optimization pass(es) and the arguments to be tuned. Different optimization pass(es) will correspond to a different algorithm, where different search space **pruning rules** will applied. @@ -71,7 +71,7 @@ class AlgorithmBase(ABC): @abstractmethod def update(self, results): """ - Update the algorthim with the results of last trial. Using this information is used to + Update the algorithm with the results of last trial. Using this information is used to pruning the search space of the future trial. """ pass @@ -227,7 +227,7 @@ class ReccomputeCheckpointAlgorithm(AlgorithmBase): else: self._trial_idx = self._total_num_trial self._logger.info( - "Recompute is unnecessary for this model size, which will reduce the Throughtput." + "Recompute is unnecessary for this model size, which will reduce the Throughput." ) else: if self._trail_left >= self._trail_right: diff --git a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py b/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py index 508a9e51b54b5672257c357b1f5835b421a526a1..952c32b3add1fc71e85c68673f7609825f8e9612 100644 --- a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py +++ b/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py @@ -611,7 +611,7 @@ The best trial is: [{}], whose configuration is following: def tune(self): """ - Performs the search for best hyperparameter configuations + Performs the search for best hyperparameter configurations for the selected optimization pass(es).
""" diff --git a/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py b/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py index 6a5fbbbdc010972ba3886aaf771044426572e5ac..3b3bfa7d3855f7fa38034d8801d32cdbc7a735a2 100644 --- a/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py +++ b/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py @@ -530,7 +530,7 @@ class ParallelTuner: del self._concerned_dist_ops[op_id] print( - "Number of the concered dist ops", + "Number of the concerned dist ops", len(self._concerned_dist_ops), flush=True, ) @@ -631,7 +631,7 @@ class ParallelTuner: direction = directions[i].random(self._seed) size = sizes[i].random(self._seed) if direction: - # Substract 1 from size to avoid the overlapping of new starts + # Subtract 1 from size to avoid the overlapping of new starts new_start = start - (size - 1) else: new_start = start + size @@ -788,7 +788,7 @@ class ParallelTuner: dist_op.dist_attr.impl_idx = 0 def _check_fused_softmax_mask_upper_triangle(self, dist_op): - """The last_but_one dim shoule be equal to last dim.""" + """The last_but_one dim should be equal to last dim.""" input_name = dist_op.serial_op.input_arg_names[0] input_dims_mapping = dist_op.dist_attr.get_input_dims_mapping( input_name @@ -996,7 +996,7 @@ class ParallelTuner: self._dist_context.serial_main_program ) - # Backup the intital parallel strategy + # Backup the initial parallel strategy self._init_parallel_strategy[0] = copy.deepcopy( self._dist_context._dist_tensors_for_program ) diff --git a/python/paddle/distributed/auto_parallel/tuner/profiler.py b/python/paddle/distributed/auto_parallel/tuner/profiler.py index 4269a773645319ad8339656adb84f5430e0e54fd..d81db732f5cb372edb48e86bf017e28c6be50e36 100644 --- a/python/paddle/distributed/auto_parallel/tuner/profiler.py +++ b/python/paddle/distributed/auto_parallel/tuner/profiler.py @@ -73,7 +73,7 @@ def parse_args(): "--ctx_filename", type=str, required=True, - help="the filename to the profile context file saved by optimizaiton tuner", + help="the filename to the profile context file saved by optimization tuner", ) args = parser.parse_args() diff --git a/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py b/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py index cdfc87868c546189c28cdedbb2ae2661b2839b49..6c08dd4d206da1dd7a447254016eaf528645e579 100644 --- a/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py +++ b/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py @@ -81,7 +81,7 @@ class QKVPattern(BasePattern): # Pattern self.attrs["shard_spec"] = [ [(1, 2, 3), [[-1, 0], [-1, 1]]], - ] # 2-tuple list such as [(tensor_id, shard_sepc)] + ] # 2-tuple list such as [(tensor_id, shard_spec)] def convert_to_graph(ops, block): @@ -535,7 +535,7 @@ class ClusterPartitionUtil: ], ) -> list: """ - Partiton cluster into possible device meshes. + Partition cluster into possible device meshes. Args: n (int): The number of nodes. 
diff --git a/python/paddle/distributed/auto_parallel/tuner/trial.py b/python/paddle/distributed/auto_parallel/tuner/trial.py index 2a52ae1e1a8723cad58598d0d2e1aa072521598f..d0662e4c8eed1e683a7d02ebc6019dd8df81373e 100644 --- a/python/paddle/distributed/auto_parallel/tuner/trial.py +++ b/python/paddle/distributed/auto_parallel/tuner/trial.py @@ -147,7 +147,7 @@ class OptimizationTunerTrial(Trial): draws = border + "\n" draws += h1_format.format("") - draws += h1_format.format("Tuned Configuartions Overview") + draws += h1_format.format("Tuned Configurations Overview") draws += h1_format.format("") for name in self._changed_configs: diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py index d7a5ffa52229eb3e6c172ad6a47a3c48d81047d5..bd3868cc80fd0b5ed4fded97d05c9ba5e6caef75 100644 --- a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py @@ -26,7 +26,7 @@ class TunableSpace: def __init__(self): # Tunable variables for this tunable variables self._variables = {} - # Specific values coresponding to each tunable variable + # Specific values corresponding to each tunable variable self._values = {} @property diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 1792fd51f93234b98c0fa90a3e04248193a477e8..c4e64ee269fb8dcead1affea20e10081112f8460 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -273,7 +273,7 @@ def _get_comm_group(processes, shape, axis, rank): Given a rank and the processes mesh the rank belongs to, compute the communication peers of the rank based on the give axis in the mesh. - Example: 16 processes managed in a 4-Dimensinal mesh with shape of [2, 2, 2, 2]. + Example: 16 processes managed in a 4-Dimensional mesh with shape of [2, 2, 2, 2]. the rank communication peers of rank 0 (included) are following: in axis 0: [0, 1] in axis 1: [0, 2] @@ -347,7 +347,7 @@ def _coordinate2linear_idx(mesh_shape, coordinate): # that the processes in mesh are # 1. starts from 0 # 2. continuous - # it will be wrong if ths above condition doesnot meet, + # it will be wrong if the above condition is not met, # e.g. process_mesh = { process_groups = [7, 8, 9,10, 12, 13, 14, 15], mesh = [2, 4]} # if you want a more general mapping, you should use cartesian product @@ -594,7 +594,7 @@ def save_distributed_checkpoint( dist_context=None, ): """ - Save model parameter state, optimzer state, distributed attribute and + Save model parameter state, optimizer state, distributed attribute and additional information of each rank. Args: