Unverified commit 87edbc8d, authored by digger-yu, committed by GitHub

Fix spelling errors in comments and documents (#3486)

* fix some spelling errors under doc/

* fix spelling errors under deepspeed/

---------
Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Parent: 41321180
@@ -50,7 +50,7 @@ class Autotuner:
assert tabulate is not None, "Missing required package `tabulate`, please install with `pip install deepspeed[autotuning]`."
- logger.debug(f"autotunning args={args}")
+ logger.debug(f"autotuning args={args}")
self.user_config = self._get_user_config(args.user_args)
assert self.user_config is not None, "DeepSpeed configuration is not provided"
@@ -802,7 +802,7 @@ class Autotuner:
if tuning_micro_batch_sizes_overwritten:
return tuning_micro_batch_sizes
- # in a auto-detected tuning_micro_batch_sizs list, max_micro_batch_size might not be performant as the memory consumption is close to max
+ # in a auto-detected tuning_micro_batch_sizes list, max_micro_batch_size might not be performant as the memory consumption is close to max
# try smaller values while gas stays the same
# if finding a more performant mbs value, use it to replace max_micro_batch_size in the list
min_micro_batch_size_with_same_gas = (tuning_micro_batch_sizes[-2] +
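To make the heuristic described in those comments concrete, here is a minimal sketch (with hypothetical helper names such as `measure_throughput`; this is not the Autotuner's actual code) of replacing the detected maximum micro-batch size with a smaller, faster one at the same gradient accumulation steps:

```python
def refine_max_micro_batch_size(tuning_mbs, measure_throughput):
    """Sketch: probe micro-batch sizes just below the detected maximum and keep
    whichever measures fastest, since the maximum may sit too close to the memory
    limit to be performant. `measure_throughput` stands in for an experiment run
    that returns samples/sec for a given micro-batch size."""
    max_mbs = tuning_mbs[-1]
    best_mbs, best_tput = max_mbs, measure_throughput(max_mbs)
    # Gradient accumulation steps (gas) stay the same while scanning downward.
    for mbs in range(max_mbs - 1, tuning_mbs[-2], -1):
        tput = measure_throughput(mbs)
        if tput > best_tput:
            best_mbs, best_tput = mbs, tput
    tuning_mbs[-1] = best_mbs  # replace max_micro_batch_size in the list
    return tuning_mbs
```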
@@ -1100,7 +1100,7 @@ class Autotuner:
def run_after_tuning(self):
""" Launches the training with the optimal DeepSpeed configuration found through the autotuning process.
- "ds_config_optimal.json" describing the optmimal DeepSpeed configuration as well the command used to launch training "cmd_optimal.txt" are saved to self.results_dir.
+ "ds_config_optimal.json" describing the optimal DeepSpeed configuration as well the command used to launch training "cmd_optimal.txt" are saved to self.results_dir.
"""
if self.optimal_cmd:
result = subprocess.Popen(self.optimal_cmd)
@@ -4,7 +4,7 @@
# DeepSpeed Team
#########################################
- # autotunner implementation constants
+ # autotuner implementation constants
#########################################
import os
@@ -117,7 +117,7 @@ MODEL_INFO_PROFILE = "profile"
MODEL_INFO_PROFILE_DEFAULT = False
MODEL_INFO_NUM_PARAMS = "num_params"
MODEL_INFO_NUM_PARAMS_DEFAULT = None
- MODEL_INFO_HIDDEN_SIZE = "hideen_size"
+ MODEL_INFO_HIDDEN_SIZE = "hidden_size"
MODEL_INFO_HIDDEN_SIZE_DEFAULT = None
MODEL_INFO_NUM_LAYERS = "num_layers"
MODEL_INFO_NUM_LAYERS_DEFAULT = None
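These constants name the keys of the autotuner's `model_info` section. As a quick illustration (the values below are made up for the example, not DeepSpeed defaults), such a section corresponds to a dictionary like:

```python
# Illustrative model_info dictionary using the keys defined above.
model_info = {
    "profile": False,            # MODEL_INFO_PROFILE
    "num_params": 350_000_000,   # MODEL_INFO_NUM_PARAMS
    "hidden_size": 1024,         # MODEL_INFO_HIDDEN_SIZE (the key fixed by this commit)
    "num_layers": 24,            # MODEL_INFO_NUM_LAYERS
}
```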
@@ -130,7 +130,7 @@ MODEL_INFO_KEY_DEFAULT_DICT = {
}
#########################################
- # autotunner search space constants
+ # autotuner search space constants
#########################################
DEFAULT_HF_CONFIG = {
@@ -39,7 +39,7 @@ class BaseTuner:
i = 0
try:
while i < n_trials and self.has_next():
- # Select the next batch of configuratiosn for evaluation
+ # Select the next batch of configuration for evaluation
sampled_exps = self.next_batch(sample_size)
# Generate experiments for measurement of performance
exp_paths = write_experiments(sampled_exps, self.rm.exps_dir)
@@ -68,5 +68,5 @@ class BaseTuner:
break
return i
except:
- logger.info("Tunner Error:", sys.exc_info()[0])
+ logger.info("Tuner Error:", sys.exc_info()[0])
return i
@@ -268,7 +268,7 @@ def prune_configs(configs, ignored_keys=[]):
def get_tuning_keys(tuning_space: dict):
- """Outputs the list of tunnable parameters in the tuning space dict.
+ """Outputs the list of tunable parameters in the tuning space dict.
Args:
tuning_space (dict): a configuration dictionary containing tunable parameters as lists of values.
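For reference, a small sketch of the kind of tuning space this docstring describes and of extracting its tunable keys (the dotted-prefix flattening is illustrative, not necessarily the exact return format of DeepSpeed's `get_tuning_keys`):

```python
# Tunable parameters are the entries whose values are lists of candidates;
# scalar values are treated as fixed.
tuning_space = {
    "zero_optimization": {"stage": [1, 2, 3]},
    "train_micro_batch_size_per_gpu": [4, 8, 16],
    "gradient_accumulation_steps": 1,  # fixed, not tunable
}

def tunable_keys(space, prefix=""):
    keys = []
    for k, v in space.items():
        if isinstance(v, dict):
            keys += tunable_keys(v, prefix + k + ".")
        elif isinstance(v, list):
            keys.append(prefix + k)
    return keys

print(tunable_keys(tuning_space))
# ['zero_optimization.stage', 'train_micro_batch_size_per_gpu']
```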
@@ -16,7 +16,7 @@ g_mpu = None
class QuantAct(nn.Module):
"""
- Class to quantize given activations. Note that when using this function, the input acttivation quantization range will be fixed for all
+ Class to quantize given activations. Note that when using this function, the input activation quantization range will be fixed for all
tokens/images for inference. This generally will affect some accuracy but achieve better latency performance.
Parameters:
----------
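The fixed-range behavior described in that docstring amounts to quantizing with a scale frozen before inference. A minimal PyTorch sketch of the idea (not DeepSpeed's `QuantAct` implementation; the range values are arbitrary):

```python
import torch

def fake_quantize_static(x, x_min, x_max, num_bits=8):
    # The (x_min, x_max) range is fixed ahead of time, so every token/image
    # reuses the same scale instead of a per-input dynamic range; this trades
    # a little accuracy for better inference latency.
    qmax = 2 ** num_bits - 1
    scale = (x_max - x_min) / qmax
    q = torch.clamp(torch.round((x - x_min) / scale), 0, qmax)
    return q * scale + x_min  # simulated-quantization (dequantized) output

activations = torch.randn(4, 16)
out = fake_quantize_static(activations, x_min=-3.0, x_max=3.0)
```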
@@ -197,7 +197,7 @@ def student_initialization(student_model, teacher_model, deepspeed_config):
other_module_name (`list of string`)
The modules will be used for student's reinitializedion
Example 1: ['bert.pooler', 'bert.embeddings', 'classifier'], means we want to apply the weight in teacher's embedding/pooler/classier module to the student
- Example 2: ['transformer.w', 'transformer.ln_f', 'lm_head'], means we want to apply the weight in teacher's embeddingn layers module to the student
+ Example 2: ['transformer.w', 'transformer.ln_f', 'lm_head'], means we want to apply the weight in teacher's embedding layers module to the student
Note that teacher_layer should matches student layer
'''
assert len(student_layer) == len(teacher_layer)
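The module-name examples above boil down to copying the teacher's weights into same-named student modules. A sketch of that step (an illustrative helper, not the actual deepspeed.compression code):

```python
def copy_teacher_modules(student_model, teacher_model, other_module_name):
    # e.g. other_module_name = ['bert.pooler', 'bert.embeddings', 'classifier']:
    # load each named teacher module's weights into the matching student module.
    teacher_modules = dict(teacher_model.named_modules())
    student_modules = dict(student_model.named_modules())
    for name in other_module_name:
        student_modules[name].load_state_dict(teacher_modules[name].state_dict())
```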
@@ -32,7 +32,7 @@ TEACHER_LAYER = "teacher_layer"
OTHER_MODULE_NAME = "other_module_name"
####
- # Weight Quantzation
+ # Weight Quantization
####
WEIGHT_QUANTIZATION = "weight_quantization"
@@ -484,7 +484,7 @@ The flops profiler can also be used as a standalone package. Please refer to the
### Autotuning
- The DeepSpeed Autotuner uses model information, system information, and heuristics to efficiently tune Zero stage, micro batch size, and other Zero configurations. Using the autotuning feature requires no code change from DeepSpeed users. While `"autotuning": {"enabled": true}` is the minimal required to enable auotuning, there are other parameters users can define to configure the autotuning process. Below shows major parameters and their default values in the autotuning configuration. Please refer to the [Autotuning](/tutorials/autotuning) tutorial for more details.
+ The DeepSpeed Autotuner uses model information, system information, and heuristics to efficiently tune Zero stage, micro batch size, and other Zero configurations. Using the autotuning feature requires no code change from DeepSpeed users. While `"autotuning": {"enabled": true}` is the minimal required to enable autotuning, there are other parameters users can define to configure the autotuning process. Below shows major parameters and their default values in the autotuning configuration. Please refer to the [Autotuning](/tutorials/autotuning) tutorial for more details.
```json
{
    ...
}
```
@@ -134,7 +134,7 @@ python text-classification/run_glue.py \
--deepspeed test.json
```
- Running this script will get `MPRC` accuracy and F1 metric results with MoQ quantization.
+ Running this script will get `MRPC` accuracy and F1 metric results with MoQ quantization.
### Quantization with dynamic schedule using second-order information (Eigenvalue)
@@ -28,7 +28,7 @@ Since, ZeRO is a replacement to data parallelism, it offers a seamless integrati
## Deciding which technology to use
- **3D Parallelism for GPT-2/GPT-3 like models**: If you are attempting to train a model whose architecture resembles very closely with GPT-2 or GPT-3, then we have already done the hard work of porting 3D parallelism to a GPT-2/GPT-3 architecture-based model and have created a training pipeline that you can use to efficiently train models with hundreds of billion or even trillions of parameters. Both Megatron-Turing NLG 530B and Big Science use a variation of this code base to scale the model training. You can find the code and tutorial to get started in the [DeepSpeed-Megatron GPT-3](https://github.com/microsoft/megatron-deepspeed) repo. For more information on 3D parallelism please chekcout the resources below:
+ **3D Parallelism for GPT-2/GPT-3 like models**: If you are attempting to train a model whose architecture resembles very closely with GPT-2 or GPT-3, then we have already done the hard work of porting 3D parallelism to a GPT-2/GPT-3 architecture-based model and have created a training pipeline that you can use to efficiently train models with hundreds of billion or even trillions of parameters. Both Megatron-Turing NLG 530B and Big Science use a variation of this code base to scale the model training. You can find the code and tutorial to get started in the [DeepSpeed-Megatron GPT-3](https://github.com/microsoft/megatron-deepspeed) repo. For more information on 3D parallelism please checkout the resources below:
[3D Parallelism Tutorial](https://www.deepspeed.ai/tutorials/pipeline/) A generic tutorial on how to port your model to use DeepSpeed 3D parallelism
@@ -66,7 +66,7 @@ generate_samples_gpt.py \
--num-attention-heads 32 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
- --load $checpoint_path \
+ --load $checkpoint_path \
--fp16 \
--ds-inference \
```
@@ -57,11 +57,11 @@ Regarding training data, we are not able to release our internal data but any pu
Table 1: Zero-shot evaluation results (last six columns) for different dense and MoE NLG models. All zero-shot evaluation results use the accuracy metric.
### 2.4. Training MoS with reduced model size
- MoS, standing for Mixture-of-Students, is a staged distillation-based technique for compressing large MoE models. MoS further reduces the model size by 12.5%, leading to up 3.7x model size reduction when combined with PR-MoE over the standard MoE. The reduced model size helps reduce the latecy and cost during inference. To train an MoS model, one needs to specify a few additional parameters. We will use PR-MoE as an example:
+ MoS, standing for Mixture-of-Students, is a staged distillation-based technique for compressing large MoE models. MoS further reduces the model size by 12.5%, leading to up 3.7x model size reduction when combined with PR-MoE over the standard MoE. The reduced model size helps reduce the latency and cost during inference. To train an MoS model, one needs to specify a few additional parameters. We will use PR-MoE as an example:
`--mos`: This would enable Mixture-of-Students via knowledge distillation.
- `--load-teacher`: This specifies the path to the teacher model checkpoint. This is a mandatory argumentment for using MoS and the teacher model checkpoint can be obtained by either training a standard MoE or the PR-MoE.
+ `--load-teacher`: This specifies the path to the teacher model checkpoint. This is a mandatory argument for using MoS and the teacher model checkpoint can be obtained by either training a standard MoE or the PR-MoE.
`num-layers-teacher`, `--hidden-size-teacher`, `--hidden-size-teacher`, `--num-experts-teacher`: In addition to the teacher model checkpoint path, we also need to specify the model architecture of the teacher model such as its number of layers, hidden dimension size, and the number of experts per MoE layer. In the case of PR-MoE, we need to also provide a list of experts for the teacher model, where we remove a few expert layers from the teacher model.