Unverified commit b8c06b6a authored by chenxujun, committed by GitHub

Fix typos (#50894)

Parent f8ec430e
......@@ -126,7 +126,7 @@ def save(
filepath: saved path
src: the audio tensor
sample_rate: the number of samples of audio per second.
channels_first: src channel infomation
channels_first: src channel information
if True, means input tensor is (channels, time)
if False, means input tensor is (time, channels)
encoding:encoding format, wave_backend only support PCM16 now.
......
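The hunk above touches the docstring of the wave-backend `save` function. A minimal usage sketch, assuming it is exposed as `paddle.audio.save` and that the wave backend only writes PCM16:

```python
# Sketch (assumed public entry point: paddle.audio.save); writes a one-second
# 440 Hz sine wave as 16-bit PCM.
import math
import paddle

sample_rate = 16000
t = paddle.linspace(0.0, 1.0, sample_rate)
waveform = paddle.sin(2 * math.pi * 440.0 * t).unsqueeze(0)  # shape: (channels, time)

# channels_first=True because the tensor is laid out as (channels, time);
# the wave backend currently supports only PCM16 encoding.
paddle.audio.save("sine.wav", waveform, sample_rate, channels_first=True)
```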
......@@ -37,7 +37,7 @@ class ESC50(AudioClassificationDataset):
Args:
mode (str, optional): It identifies the dataset mode (train or dev). Default:train.
split (int, optional): It specify the fold of dev dataset. Default:1.
feat_type (str, optional): It identifies the feature type that user wants to extrace of an audio file. Default:raw.
feat_type (str, optional): It identifies the feature type that user wants to extract of an audio file. Default:raw.
archive(dict, optional): it tells where to download the audio archive. Default:None.
Returns:
......
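The ESC50 docstring above describes the constructor arguments. A short sketch, assuming the dataset is exposed as `paddle.audio.datasets.ESC50` and that indexing yields a (feature, label) pair:

```python
# Sketch: fold 1 is used as the dev set when mode='dev' and split=1;
# the archive is downloaded on first use.
from paddle.audio.datasets import ESC50

dev_ds = ESC50(mode='dev', split=1, feat_type='raw')
waveform, label = dev_ds[0]  # raw audio samples and the class index
print(waveform.shape, label)
```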
......@@ -39,7 +39,7 @@ class TESS(AudioClassificationDataset):
mode (str, optional): It identifies the dataset mode (train or dev). Defaults to train.
n_folds (int, optional): Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset. Defaults to 5.
split (int, optional): It specify the fold of dev dataset. Defaults to 1.
feat_type (str, optional): It identifies the feature type that user wants to extrace of an audio file. Defaults to raw.
feat_type (str, optional): It identifies the feature type that user wants to extract of an audio file. Defaults to raw.
archive(dict): it tells where to download the audio archive. Defaults to None.
Returns:
......
......@@ -34,7 +34,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False):
retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
:code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient.
:code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient.
Defaults to False.
Returns:
......@@ -79,7 +79,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False):
assert in_out_list is not None, "{} should not be None".format(name)
if isinstance(in_out_list, (list, tuple)):
assert len(in_out_list) > 0, "{} connot be empyt".format(name)
assert len(in_out_list) > 0, "{} connot be empty".format(name)
for each_var in in_out_list:
assert isinstance(
each_var, (paddle.Tensor, core.eager.Tensor)
......
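The `backward` docstring above explains `retain_graph`. A small sketch with `paddle.autograd.backward`, showing that keeping the graph allows a second backward pass and that gradients accumulate across calls:

```python
import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False)
y = (x * x).sum()

paddle.autograd.backward([y], retain_graph=True)  # graph kept for another pass
paddle.autograd.backward([y])                     # graph freed after this call
print(x.grad)  # 4 * x, i.e. [4., 8., 12.]
```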
......@@ -29,7 +29,7 @@ class saved_tensors_hooks:
of the original tensor. `pack_hook` will also be called while any
tensor need be saved by `PyLayerContext.save_for_backward`. If a tensor
saved for backward is no need buffer, `pack_hook` will not be called.
Only the thensor saved for backward is LoDTensor, `pack_hook` will be
Only the tensor saved for backward is LoDTensor, `pack_hook` will be
called.
unpack_hook (function): The unpack hook will be called every time the
backward need use the saved inputs/outputs tensors. Then you can reload
......
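The `saved_tensors_hooks` docstring above describes when `pack_hook` and `unpack_hook` fire. A sketch assuming the class is used as the context manager `paddle.autograd.saved_tensors_hooks`; the numpy round-trip stands in for a real offload or compression scheme:

```python
import paddle

def pack_hook(tensor):
    return tensor.numpy()            # called when a tensor is saved for backward

def unpack_hook(packed):
    return paddle.to_tensor(packed)  # called when backward needs the tensor again

x = paddle.ones([3, 3], dtype='float32')
x.stop_gradient = False
with paddle.autograd.saved_tensors_hooks(pack_hook, unpack_hook):
    y = paddle.matmul(x, x)
y.sum().backward()
```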
......@@ -95,8 +95,8 @@ def corpus_reader(data_path, words_name, props_name):
if len(label) == 0: # end of sentence
for i in range(len(one_seg[0])):
a_kind_lable = [x[i] for x in one_seg]
labels.append(a_kind_lable)
a_kind_label = [x[i] for x in one_seg]
labels.append(a_kind_label)
if len(labels) >= 1:
verb_list = []
......
......@@ -14,7 +14,7 @@
"""
This module will download dataset from
http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
and parse train/test set intopaddle reader creators.
and parse train/test dataset into paddle reader creators.
This set contains images of flowers belonging to 102 different categories.
The images were acquired by searching the web and taking pictures. There are a
......
......@@ -89,9 +89,9 @@ def batch_images_from_tar(
:type data_file: string
:param dataset_name: 'train','test' or 'valid'
:type dataset_name: string
:param img2label: a dic with image file name as key
:param img2label: a dict with image file name as key
and image's label as value
:type img2label: dic
:type img2label: dict
:param num_per_batch: image number per batch file
:type num_per_batch: int
:return: path of list file containing paths of batch file
......
......@@ -108,7 +108,7 @@ def reader_creator(filename, word_idx, n, data_type):
continue
yield src_seq, trg_seq
else:
assert False, 'Unknow data type'
assert False, 'Unknown data type'
return reader
......
......@@ -48,7 +48,7 @@ class DeviceMesh(core.DeviceMesh):
The class `DeviceMesh` describes the topology of physical devices.
Args:
mesh (list|numpy.array): an N-dimensional array describes the toplogy
mesh (list|numpy.array): an N-dimensional array describes the topology
of logical processes.
dim_names (list, optional): the i-th element of this list gives the name of the
i-th dimension.
......
......@@ -257,7 +257,7 @@ class Completer:
tensor_desc.name(), compatible_dims_mapping
)
changed = True
# Find the most compatible implemenetations from the distributed operator
# Find the most compatible implementations from the distributed operator
op_dist_impls = find_compatible_distributed_operator_impls(
dist_op, fwd=True
)
......@@ -329,7 +329,7 @@ class Completer:
tensor_desc.name(), compatible_dims_mapping
)
changed = True
# Find the most compatible implemenetations from the distributed operator
# Find the most compatible implementations from the distributed operator
op_dist_impls = find_compatible_distributed_operator_impls(
dist_op, fwd=False
)
......@@ -685,7 +685,7 @@ class Completer:
cond_tensor_related_nodes.extend(
_find_nodes_related_to_cond(cond_tensor_node)
)
# Step 2.3: Add the StepScops output of while_op
# Step 2.3: Add the StepScopes output of while_op
stepscopes_tensor_name = while_op_node.op().output("StepScopes")[0]
stepscopes_tensor_node = None
for output_node in while_op_node.outputs:
......@@ -1397,7 +1397,7 @@ class Completer:
)
forward_var = vars[forward_var_name]
# TODO complete other attribte for grad var
# TODO complete other attribute for grad var
tensor_dist_attr = TensorDistAttr()
process_mesh = (
self._dist_context.get_tensor_dist_attr_for_program(
......
......@@ -1047,7 +1047,7 @@ class DistributedOperatorContext:
# NOTE Support correct parallelism for high-order differential model.
# by default exceed_backward_init_op is False and it means we are in Forward phase; After exceed_backward_init_op = True,
# it means we are in Backward phase.
# And the final sulotion should be revise high-order differential logic for these two phases in future.
# And the final solution should be revise high-order differential logic for these two phases in future.
self._exceed_backward_init_op = False
def __deepcopy__(self, memo):
......
......@@ -146,7 +146,7 @@ class DistributedDataLoaderFromGenerator(DistributedDataLoaderBase):
steps_per_epoch = len(self.dataset) // self.batch_size
except:
raise ValueError(
"Pleace set `steps_per_epoch` or implement `__len__` methond in dataset class."
"Please set `steps_per_epoch` or implement `__len__` method in dataset class."
)
return steps_per_epoch
......
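The error message above is raised when `steps_per_epoch` cannot be derived. A sketch of a dataset that implements `__len__` so the loader can compute `len(dataset) // batch_size`; the dataset itself is illustrative:

```python
import numpy as np
from paddle.io import Dataset

class RandomDataset(Dataset):
    def __init__(self, num_samples=128):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        image = np.random.rand(16).astype('float32')
        label = np.random.randint(0, 2, (1,)).astype('int64')
        return image, label

    def __len__(self):
        # Lets the distributed data loader derive steps_per_epoch automatically.
        return self.num_samples
```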
......@@ -328,7 +328,7 @@ class DistributedOperatorHelper:
elif isinstance(output, Variable):
new_output = [output]
else:
raise ValueError("Unrecognized outpout.")
raise ValueError("Unrecognized output.")
if self._out_dims_mappings:
assert len(new_output) == len(
......
......@@ -247,7 +247,7 @@ class Engine:
labels = sample[split:]
else:
raise TypeError(
"Data should be a Dataset or IterableDatset, but received {}.".format(
"Data should be a Dataset or IterableDataset, but received {}.".format(
type(data).__name__
)
)
......@@ -699,7 +699,7 @@ class Engine:
def _parallel(self, mode, all_ranks=False):
# Parallelize program based on the planner's results
# For now, the completer has to be passed to the planner,
# because we may use it to complete the annotation of the backwarkward and update.
# because we may use it to complete the annotation of the backward and update.
parallelizer = Parallelizer(
mode,
self._planners[mode].completer,
......
......@@ -117,15 +117,15 @@ def shard_op(op, process_mesh=None, in_shard_specs=None, out_shard_specs=None):
will be used. And an error will be raised if the current process mesh cannot be found.
Default: None.
in_shard_specs (list of list, optional): a list of list to describe the sharding specifications
for the inputs. Each item of `in_shard_specs` is a `shard_spec` between the correspoinding input
and `process_mesh`. If one item is None, the cooresponding input is replicated across all processes
If it is None, all inputs are replicated across all processes. Note that the lenght of the
for the inputs. Each item of `in_shard_specs` is a `shard_spec` between the corresponding input
and `process_mesh`. If one item is None, the corresponding input is replicated across all processes
If it is None, all inputs are replicated across all processes. Note that the length of the
`in_shard_specs` should be equal to the actual number of inputs when calling this operation.
Default: None.
out_shard_specs (list of list, optional): a list of list to describe the sharding specifications
for the outputs. Each item of `out_shard_specs` is a `shard_spec` between the correspoinding output
and `process_mesh`. If one item is None, the cooresponding output is replicated across all processes
If it is None, all outputs are replicated across all processes. Note that the lenght of the
for the outputs. Each item of `out_shard_specs` is a `shard_spec` between the corresponding output
and `process_mesh`. If one item is None, the corresponding output is replicated across all processes
If it is None, all outputs are replicated across all processes. Note that the length of the
`in_shard_specs` should be equal to the actual number of inputs when calling this operation.
Default: None. Default: None.
......
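The `shard_op` docstring above describes `in_shard_specs` and `out_shard_specs`. A sketch built on that signature, assuming the API is reachable as `paddle.distributed.fleet.auto.shard_op`; the mesh, dimension names, and shard specs below are illustrative:

```python
import paddle
from paddle.distributed.fleet import auto

mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=['x', 'y'])

a = paddle.ones([4, 6])
b = paddle.ones([6, 8])

# Shard the rows of the first input along mesh dim 'x'; replicate the second
# input and the output across all processes.
dist_matmul = auto.shard_op(
    paddle.matmul,
    process_mesh=mesh,
    in_shard_specs=[['x', None], [None, None]],
    out_shard_specs=[[None, None]],
)
out = dist_matmul(a, b)
```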
......@@ -191,7 +191,7 @@ def register_distributed_operator_impl(op_type, dist_impl):
def find_compatible_distributed_operator_impls(dist_op, fwd=True, partial=True):
"""
Here just return the first compatible implemention.
Here just return the first compatible implementation.
This will be improved by cost model in the future.
"""
op_type = dist_op.serial_op.type
......
......@@ -78,7 +78,7 @@ def adopt_lookup_table_v1(ctx, main_block, src_op, Ids_var):
)
if not Ids_var.stop_gradient:
raise NotImplementedError(
'Requiring the gradient of Ids of lookup_table(v1dist op is not currently supported. Please open an issue with details on your use case so that we can prioritize adding this (for instance, adversarial training for language model).'
'Requiring the gradient of Ids of lookup_table(v1) dist op is not currently supported. Please open an issue with details on your use case so that we can prioritize adding this (for instance, adversarial training for language model).'
)
target_shape = list(Ids_var.shape[:-1])
......@@ -405,7 +405,7 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl):
ctx, op_dist_attr.process_mesh, rank_id
)
# A generalized method to caculate embedding offset using cartisian product
# A generalized method to calculate embedding offset using cartisian product
relative_idx = _get_idx_in_axis(
process_mesh_group,
process_mesh_shape,
......@@ -416,7 +416,7 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl):
per_part_size = Weight_var.shape[0]
relative_idx = relative_idx * per_part_size
# TODO caculate ring id
# TODO calculate ring id
parallel_axis = embedding_row_dim_mapping
group_ranks = _get_comm_group(
process_mesh_group, process_mesh_shape, parallel_axis, rank_id
......@@ -544,7 +544,7 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl):
process_mesh = param_dist_attr.process_mesh
dim_mapping = param_dist_attr.dims_mapping
# NOTE all not splited axis should be presented in mesh
# NOTE all not splitted axis should be presented in mesh
for axis, size in enumerate(process_mesh.shape):
if size <= 1 or axis in dim_mapping:
pass
......@@ -632,7 +632,7 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl):
process_mesh_shape = dist_attr.process_mesh.shape
process_mesh_group = dist_attr.process_mesh.process_ids
# A generalized method to caculate embedding offset using cartisian product
# A generalized method to calculate embedding offset using cartisian product
relative_idx = _get_idx_in_axis(
process_mesh_group,
process_mesh_shape,
......
......@@ -114,7 +114,7 @@ class DistributedFillConstantBatchSizeLikeImpl0(DistributedOperatorImpl):
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
# only the batch size dimemsion of input and output are relative.
# only the batch size dimension of input and output are relative.
dim_changed = compute_compatible_and_update_dim_mapping(
[x_dims_mapping, out_dims_mapping], [0, 0]
)
......
......@@ -377,7 +377,7 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs):
# assert len(
# Y_var_dim_mapping
# ) == 2, "dist matmual only support Y operand with 2 dims now but Y({})'s dim is [{}]".format(
# ) == 2, "dist matmul only support Y operand with 2 dims now but Y({})'s dim is [{}]".format(
# Y_var.name, Y_var_dim_mapping)
Y_var_partitioned = False
for dim in Y_var_dim_mapping:
......
......@@ -51,14 +51,14 @@ class DistributedPNormImpl0(DistributedOperatorImpl):
1. axis == None, isinstance(p, (int, float)), asvector = True
1.1 x_dims_mapping == [0, -1, -1]
allgather input if it is splited by dp group
allgather input if it is splitted by dp group
1.2 x_dims_mapping == [-1, 0, -1]
allgather, split and concat input if it is splited by mp group
allgather, split and concat input if it is splitted by mp group
2. isinstance(axis, int), asvector = False
1.1 axis == 0 and x_dims_mapping == [0, -1, -1]
allgather input if it's input[0] is splited by dp group.
1.2 axis == 1 and x_dims_mapping == [-1, 0, -1]
allgather, split and concat input if it's input[1] is splited by mp group
allgather, split and concat input if it's input[1] is splitted by mp group
"""
def __init__(self, name):
......
......@@ -67,7 +67,7 @@ class DistributedUpdateLossScalingImpl(DistributedOperatorImpl):
@staticmethod
def backward(ctx, *args, **kwargs):
# the backward function only filte the gradient with current rank id
# the backward function only filter the gradient with current rank id
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.main_block
backward_op = dist_op_context.cur_src_op
......
......@@ -55,9 +55,9 @@ class AutoParallelizer:
AutoParallelizer is the main controller class to do the auto parallel process.
And the auto parallel process will be triggered in the wrapped parallelize function.
To facilitate the auto parallelization, it will contain information about program, cluster and the
related context. In this basic version, the program information will be retrevied from
Fleet object, and the cluster information can be retrevied in the new created Cluster object,
and the context information can be retrevied in the new created DistributedContext.
related context. In this basic version, the program information will be retrieved from
Fleet object, and the cluster information can be retrieved in the new created Cluster object,
and the context information can be retrieved in the new created DistributedContext.
"""
def __init__(self, fleet):
......
......@@ -251,7 +251,7 @@ class Partitioner:
serial_ops[idx].desc.original_id()
] = serial_ops[idx]
# partiiton
# partition
appended_grad_times = 0
for idx, op in enumerate(serial_ops):
......@@ -263,7 +263,7 @@ class Partitioner:
if not op_dist_attr.is_recompute:
appended_grad_times += 1
# partititon input variables
# partition input variables
for serial_input_varname in op.desc.input_arg_names():
if (
serial_input_varname
......
......@@ -29,10 +29,10 @@ class Planner:
self._dist_context._dist_op_context = default_ctx.dist_op_context
self._dist_context.data_parallel = default_ctx.data_parallel
if not is_naive_data_parallel(self._dist_context):
# Use SSA graph for complex parallism
# Use SSA graph for complex parallelism
self._dist_context.initialize(with_graph=True)
else:
# Use program for data parallel parallism
# Use program for data parallel parallelism
self._dist_context.initialize(with_graph=False)
self._completer = Completer(self._dist_context)
......
......@@ -57,7 +57,7 @@ def new_process_group(ranks, group_id=None, force_new_group=False):
cur_key = ''.join(map(str, sorted(pg.ranks)))
if pg_id != 0 and new_key == cur_key:
return pg
# If not matching the existing one, construt a new process group
# If not matching the existing one, construct a new process group
num_groups = len(_g_process_group_map)
# Note: our process group may interfere with the original implementation
# so the created group id should start from the original _new_ring_id()
......
......@@ -22,7 +22,7 @@ class ProcessMesh(core.ProcessMesh):
The class `Processmesh` describes the topology of logical processes.
Args:
mesh (list|numpy.array): an N-dimensional array describes the toplogy
mesh (list|numpy.array): an N-dimensional array describes the topology
of logical processes.
dim_names (list, optional): the i-th element of this list gives the name of the
i-th dimension.
......
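The `ProcessMesh` docstring above describes the `mesh` and `dim_names` arguments. A sketch assuming the class is exported as `paddle.distributed.ProcessMesh`; the `shape` and `process_ids` attributes are the ones used elsewhere in this diff:

```python
# A 2 x 2 topology of logical processes with named dimensions.
import paddle.distributed as dist

mesh = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=['dp', 'mp'])
print(mesh.shape)        # [2, 2]
print(mesh.process_ids)  # [0, 1, 2, 3]
```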
......@@ -35,7 +35,7 @@ class BaseConfig:
for field, default_value in config.items():
setattr(self, field, default_value)
# Overide attributes by the config_dict
# Override attributes by the config_dict
if self._config_dict:
self.from_dict(self._config_dict)
......@@ -128,7 +128,7 @@ class FusedPassesConfig(BaseConfig):
class Strategy(BaseConfig):
"""
The `Strategy` object is used to configure the paralleization and optimization beheviors.
The `Strategy` object is used to configure the parallelization and optimization behaviors.
Args:
config (dict|string, optional): If this is None, the default configurations will used.
......
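The `Strategy` docstring above says it configures parallelization and optimization behaviors. A sketch assuming it is exposed as `paddle.distributed.fleet.auto.Strategy` and that sub-configs such as `recompute` and `sharding` carry an `enable` flag; treat the field names as assumptions:

```python
from paddle.distributed.fleet import auto

strategy = auto.Strategy()
strategy.recompute.enable = True   # trade compute for activation memory
strategy.sharding.enable = True    # shard optimizer states / gradients
strategy.sharding.degree = 2
strategy.sharding.stage = 2
```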
......@@ -23,7 +23,7 @@ from .trial import TrialStatus
class AlgorithmBase(ABC):
"""
An Tuning alogrithm is a class to find out an optimal configuration
An Tuning algorithm is a class to find out an optimal configuration
given the selected tuning optimization pass(es) and the arguments to be tuned.
Different optimization pass(es) will correspond to a different algorithm,
where different search space **pruning rules** will applied.
......@@ -71,7 +71,7 @@ class AlgorithmBase(ABC):
@abstractmethod
def update(self, results):
"""
Update the algorthim with the results of last trial. Using this information is used to
Update the algorithm with the results of last trial. Using this information is used to
pruning the search space of the future trial.
"""
pass
......@@ -227,7 +227,7 @@ class ReccomputeCheckpointAlgorithm(AlgorithmBase):
else:
self._trial_idx = self._total_num_trial
self._logger.info(
"Recompute is unnecessary for this model size, which will reduce the Throughtput."
"Recompute is unnecessary for this model size, which will reduce the Throughput."
)
else:
if self._trail_left >= self._trail_right:
......
......@@ -611,7 +611,7 @@ The best trial is: [{}], whose configuration is following:
def tune(self):
"""
Performs the search for best hyperparameter configuations
Performs the search for best hyperparameter configurations
for the selected optimization pass(es).
"""
......
......@@ -530,7 +530,7 @@ class ParallelTuner:
del self._concerned_dist_ops[op_id]
print(
"Number of the concered dist ops",
"Number of the concerned dist ops",
len(self._concerned_dist_ops),
flush=True,
)
......@@ -631,7 +631,7 @@ class ParallelTuner:
direction = directions[i].random(self._seed)
size = sizes[i].random(self._seed)
if direction:
# Substract 1 from size to avoid the overlapping of new starts
# Subtract 1 from size to avoid the overlapping of new starts
new_start = start - (size - 1)
else:
new_start = start + size
......@@ -788,7 +788,7 @@ class ParallelTuner:
dist_op.dist_attr.impl_idx = 0
def _check_fused_softmax_mask_upper_triangle(self, dist_op):
"""The last_but_one dim shoule be equal to last dim."""
"""The last_but_one dim should be equal to last dim."""
input_name = dist_op.serial_op.input_arg_names[0]
input_dims_mapping = dist_op.dist_attr.get_input_dims_mapping(
input_name
......@@ -996,7 +996,7 @@ class ParallelTuner:
self._dist_context.serial_main_program
)
# Backup the intital parallel strategy
# Backup the initial parallel strategy
self._init_parallel_strategy[0] = copy.deepcopy(
self._dist_context._dist_tensors_for_program
)
......
......@@ -73,7 +73,7 @@ def parse_args():
"--ctx_filename",
type=str,
required=True,
help="the filename to the profile context file saved by optimizaiton tuner",
help="the filename to the profile context file saved by optimization tuner",
)
args = parser.parse_args()
......
......@@ -81,7 +81,7 @@ class QKVPattern(BasePattern):
# Pattern
self.attrs["shard_spec"] = [
[(1, 2, 3), [[-1, 0], [-1, 1]]],
] # 2-tuple list such as [(tensor_id, shard_sepc)]
] # 2-tuple list such as [(tensor_id, shard_spec)]
def convert_to_graph(ops, block):
......@@ -535,7 +535,7 @@ class ClusterPartitionUtil:
],
) -> list:
"""
Partiton cluster into possible device meshes.
Partition cluster into possible device meshes.
Args:
n (int): The number of nodes.
......
......@@ -147,7 +147,7 @@ class OptimizationTunerTrial(Trial):
draws = border + "\n"
draws += h1_format.format("")
draws += h1_format.format("Tuned Configuartions Overview")
draws += h1_format.format("Tuned Configurations Overview")
draws += h1_format.format("")
for name in self._changed_configs:
......
......@@ -26,7 +26,7 @@ class TunableSpace:
def __init__(self):
# Tunable variables for this tunable variables
self._variables = {}
# Specific values coresponding to each tunable variable
# Specific values corresponding to each tunable variable
self._values = {}
@property
......
......@@ -273,7 +273,7 @@ def _get_comm_group(processes, shape, axis, rank):
Given a rank and the processes mesh the rank belongs to,
compute the communication peers of the rank based on the give axis in the mesh.
Example: 16 processes managed in a 4-Dimensinal mesh with shape of [2, 2, 2, 2].
Example: 16 processes managed in a 4-Dimensional mesh with shape of [2, 2, 2, 2].
the rank communication peers of rank 0 (included) are following:
in axis 0: [0, 1]
in axis 1: [0, 2]
......@@ -347,7 +347,7 @@ def _coordinate2linear_idx(mesh_shape, coordinate):
# that the processes in mesh are
# 1. starts from 0
# 2. continuous
# it will be wrong if ths above condition doesnot meet,
# it will be wrong if ths above condition does not meet,
# e.g. process_mesh = { process_groups = [7, 8, 9,10, 12, 13, 14, 15], mesh = [2, 4]}
# if you want a more general mapping, you should use cartesian product
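The `_get_comm_group` docstring above lists the communication peers of rank 0 in a [2, 2, 2, 2] mesh, and the comment above notes that the mapping assumes contiguous ranks starting from 0. A hypothetical re-implementation of that peer computation, assuming the lowest axis varies fastest, which reproduces the peer lists in the docstring example:

```python
import numpy as np

def comm_peers(mesh_shape, axis, rank):
    """Ranks that differ from `rank` only along `axis` of the mesh."""
    coords = list(np.unravel_index(rank, mesh_shape, order='F'))
    peers = []
    for i in range(mesh_shape[axis]):
        c = list(coords)
        c[axis] = i
        peers.append(int(np.ravel_multi_index(c, mesh_shape, order='F')))
    return peers

# 16 processes in a [2, 2, 2, 2] mesh; peers of rank 0 along each axis:
for axis in range(4):
    print(axis, comm_peers([2, 2, 2, 2], axis, 0))
# axis 0 -> [0, 1], axis 1 -> [0, 2], axis 2 -> [0, 4], axis 3 -> [0, 8]
```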
......@@ -594,7 +594,7 @@ def save_distributed_checkpoint(
dist_context=None,
):
"""
Save model parameter state, optimzer state, distributed attribute and
Save model parameter state, optimizer state, distributed attribute and
additional information of each rank.
Args:
......