From d2ba91aad1de640e2f51a34428c1af4bd78d2b48 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Sun, 23 Feb 2020 13:41:52 +0800 Subject: [PATCH] fix typo words (#22653) --- paddle/fluid/framework/data_set.h | 2 +- .../fluid/framework/details/build_strategy.h | 4 +- .../ir/conv_elementwise_add2_act_fuse_pass.cc | 2 +- .../ir/conv_elementwise_add_act_fuse_pass.cc | 2 +- .../fuse_optimizer_op_pass.cc | 2 +- .../framework/ir/graph_pattern_detector.cc | 2 +- .../framework/ir/multi_batch_merge_pass.cc | 2 +- .../multi_devices_graph_pass.cc | 2 +- paddle/fluid/framework/op_desc.cc | 6 +- paddle/fluid/framework/operator.cc | 2 +- paddle/fluid/framework/operator.h | 4 +- paddle/fluid/framework/operator_test.cc | 8 +- .../analysis/ir_passes/subgraph_util.cc | 2 +- .../ir_passes/tensorrt_subgraph_pass.cc | 2 +- paddle/fluid/inference/io.cc | 2 +- .../inference/tensorrt/convert/ut_helper.h | 2 +- paddle/fluid/inference/tensorrt/engine.cc | 7 +- .../fluid/operators/array_to_lod_tensor_op.cc | 2 +- .../fluid/operators/average_accumulates_op.cc | 9 +- .../operators/bilinear_tensor_product_op.h | 2 +- paddle/fluid/operators/conv_transpose_op.cc | 4 +- paddle/fluid/operators/crop_op.cc | 2 +- paddle/fluid/operators/crop_tensor_op.cc | 2 +- paddle/fluid/operators/crop_tensor_op.h | 2 +- paddle/fluid/operators/cross_entropy_op.cc | 2 +- paddle/fluid/operators/ctc_align_op.cc | 2 +- paddle/fluid/operators/cumsum_op.cc | 4 +- .../operators/deformable_psroi_pooling_op.cc | 2 +- .../fluid/operators/detection/box_coder_op.cc | 4 +- .../detection/generate_mask_labels_op.cc | 4 +- .../detection/generate_proposal_labels_op.cc | 4 +- .../operators/detection/iou_similarity_op.cc | 2 +- .../detection/locality_aware_nms_op.cc | 2 +- .../operators/detection/multiclass_nms_op.cc | 2 +- .../operators/detection/target_assign_op.cc | 2 +- .../operators/detection/yolov3_loss_op.cc | 6 +- .../test_elementwise_mul_op_dim.cc | 2 +- .../fluid/operators/fused/fusion_group_op.cc | 2 +- .../fusion_transpose_flatten_concat_op.cu.cc | 2 +- paddle/fluid/operators/grid_sampler_op.cc | 8 +- paddle/fluid/operators/gru_op.cc | 2 +- paddle/fluid/operators/gru_unit_op.cc | 2 +- .../operators/hierarchical_sigmoid_op.cc | 2 +- paddle/fluid/operators/interpolate_op.cc | 4 +- paddle/fluid/operators/lrn_op.cc | 2 +- paddle/fluid/operators/math/matrix_bit_code.h | 2 +- paddle/fluid/operators/nce_op.cc | 6 +- .../fluid/operators/pad_constant_like_op.cc | 4 +- paddle/fluid/operators/prroi_pool_op.cu | 2 +- paddle/fluid/operators/prroi_pool_op.h | 2 +- paddle/fluid/operators/reader/read_op.cc | 4 +- paddle/fluid/operators/reduce_ops/reduce_op.h | 2 +- paddle/fluid/operators/reshape_op.cc | 6 +- paddle/fluid/operators/scatter_op.cc | 2 +- paddle/fluid/operators/select_input_op.cc | 2 +- .../operators/sequence_ops/sequence_pad_op.cc | 2 +- .../sequence_ops/sequence_pool_op.cc | 4 +- .../sequence_topk_avg_pooling_op.cc | 2 +- .../sequence_ops/sequence_unpad_op.cc | 2 +- paddle/fluid/operators/shard_index_op.cc | 4 +- .../fluid/operators/shrink_rnn_memory_op.cc | 6 +- .../softmax_with_cross_entropy_op.cc | 4 +- .../softmax_with_cross_entropy_op.cu | 2 +- paddle/fluid/operators/spectral_norm_op.cc | 4 +- .../operators/tensorrt/tensorrt_engine_op.h | 2 +- paddle/fluid/operators/unfold_op.cc | 2 +- paddle/fluid/operators/uniform_random_op.cc | 2 +- paddle/fluid/operators/unsqueeze_op.cc | 2 +- paddle/fluid/operators/warpctc_op.cc | 4 +- paddle/fluid/platform/device_tracer.cc | 2 +- paddle/fluid/pybind/imperative.cc | 8 +- 
python/paddle/dataset/movielens.py | 2 +- python/paddle/dataset/mq2007.py | 12 +- python/paddle/distributed/launch.py | 14 +- python/paddle/distributed/launch_ps.py | 2 +- python/paddle/fluid/backward.py | 2 +- python/paddle/fluid/contrib/layers/nn.py | 14 +- .../paddle/fluid/contrib/layers/rnn_impl.py | 4 +- .../paddle/fluid/contrib/memory_usage_calc.py | 7 +- .../contrib/quantize/quantize_transpiler.py | 2 +- .../fluid/contrib/slim/core/compressor.py | 4 +- .../fluid/contrib/slim/graph/graph_wrapper.py | 12 +- .../contrib/slim/nas/controller_server.py | 2 +- .../contrib/slim/prune/auto_prune_strategy.py | 2 +- .../contrib/slim/prune/prune_strategy.py | 2 +- .../paddle/fluid/contrib/slim/prune/pruner.py | 2 +- .../slim/quantization/quantization_pass.py | 12 +- python/paddle/fluid/contrib/trainer.py | 16 +-- .../paddle/fluid/contrib/utils/hdfs_utils.py | 4 +- .../fluid/contrib/utils/lookup_table_utils.py | 4 +- python/paddle/fluid/data.py | 4 +- python/paddle/fluid/data_feed_desc.py | 2 +- python/paddle/fluid/data_feeder.py | 8 +- python/paddle/fluid/dataset.py | 4 +- python/paddle/fluid/debugger.py | 2 +- python/paddle/fluid/distributed/downpour.py | 2 +- .../paddle/fluid/distributed/ps_instance.py | 2 +- .../fluid/dygraph/learning_rate_scheduler.py | 18 +-- python/paddle/fluid/dygraph/nn.py | 18 +-- .../fluid/dygraph/varbase_patch_methods.py | 8 +- python/paddle/fluid/dygraph_grad_clip.py | 2 +- python/paddle/fluid/executor.py | 8 +- python/paddle/fluid/framework.py | 58 ++++---- .../fluid/incubate/data_generator/__init__.py | 6 +- .../fluid/incubate/fleet/base/role_maker.py | 4 +- .../incubate/fleet/collective/__init__.py | 4 +- .../fleet/parameter_server/pslib/__init__.py | 2 +- .../pslib/optimizer_factory.py | 2 +- .../fluid/incubate/fleet/utils/fleet_util.py | 8 +- .../paddle/fluid/incubate/fleet/utils/hdfs.py | 4 +- python/paddle/fluid/initializer.py | 2 +- python/paddle/fluid/input.py | 4 +- python/paddle/fluid/install_check.py | 2 +- python/paddle/fluid/io.py | 20 +-- python/paddle/fluid/layers/control_flow.py | 26 ++-- python/paddle/fluid/layers/detection.py | 54 ++++---- python/paddle/fluid/layers/distributions.py | 2 +- python/paddle/fluid/layers/io.py | 6 +- .../fluid/layers/learning_rate_scheduler.py | 8 +- python/paddle/fluid/layers/loss.py | 12 +- python/paddle/fluid/layers/nn.py | 130 +++++++++--------- python/paddle/fluid/layers/ops.py | 2 +- python/paddle/fluid/layers/rnn.py | 46 +++---- python/paddle/fluid/layers/sequence_lod.py | 16 +-- python/paddle/fluid/layers/tensor.py | 12 +- python/paddle/fluid/log_helper.py | 2 +- python/paddle/fluid/metrics.py | 16 +-- python/paddle/fluid/nets.py | 12 +- python/paddle/fluid/optimizer.py | 12 +- python/paddle/fluid/param_attr.py | 2 +- python/paddle/fluid/profiler.py | 2 +- .../paddle/fluid/tests/demo/pipeline_train.py | 2 +- .../fluid/tests/unittests/dist_transformer.py | 10 +- .../unittests/test_activation_nn_grad.py | 2 +- .../unittests/test_elementwise_nn_grad.py | 16 +-- .../unittests/test_generate_proposals_op.py | 2 +- ..._imperative_transformer_sorted_gradient.py | 2 +- .../unittests/test_linear_chain_crf_op.py | 2 +- .../paddle/fluid/tests/unittests/test_nce.py | 6 +- .../fluid/tests/unittests/test_nn_grad.py | 2 +- .../fluid/tests/unittests/test_reshape_op.py | 2 +- .../tests/unittests/test_static_save_load.py | 40 +++--- .../tests/unittests/transformer_model.py | 8 +- .../fluid/transpiler/details/program_utils.py | 2 +- .../fluid/transpiler/distribute_transpiler.py | 6 +- .../fluid/transpiler/geo_sgd_transpiler.py | 2 
+- .../paddle/fluid/transpiler/ps_dispatcher.py | 4 +- python/paddle/reader/decorator.py | 4 +- python/paddle/utils/image_util.py | 2 +- python/paddle/utils/plotcurve.py | 2 +- python/paddle/utils/preprocess_img.py | 2 +- python/paddle/utils/preprocess_util.py | 8 +- 152 files changed, 510 insertions(+), 511 deletions(-) diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 94424a5ffaf..d82035c03ee 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -126,7 +126,7 @@ class Dataset { virtual void DestroyPreLoadReaders() = 0; // set preload thread num virtual void SetPreLoadThreadNum(int thread_num) = 0; - // seperate train thread and dataset thread + // separate train thread and dataset thread virtual void DynamicAdjustChannelNum(int channel_num) = 0; virtual void DynamicAdjustReadersNum(int thread_num) = 0; // set fleet send sleep seconds diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 738bbf51115..5388df6bc50 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -133,10 +133,10 @@ struct BuildStrategy { // The picture is here: // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396 bool use_hierarchical_allreduce_{false}; - // Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu + // Nccl ranks in a node when use hierarchical allreduce, it's set to gpu // cards' number in most cases. size_t hierarchical_allreduce_inter_nranks_{0}; - // Nccl ranks bewteen nodes when use hierarchical allreduce, it's setted to + // Nccl ranks between nodes when use hierarchical allreduce, it's set to // nodes number. size_t hierarchical_allreduce_exter_nranks_{0}; diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index b4d6f683ce7..c3c02c30b9d 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -33,7 +33,7 @@ namespace ir { GET_IR_NODE(act_op); \ GET_IR_NODE(act_out); -// Inherient the basic infomation from `base_desc`, and modify some fields. +// Inherit the basic information from `base_desc`, and modify some fields. framework::proto::OpDesc PrepareOpDesc( const framework::proto::OpDesc& base_desc, const std::string& bias, const std::string& bias1, const std::string& activation, diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index ba0a2fb9645..b15871ef03f 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -31,7 +31,7 @@ namespace ir { GET_IR_NODE(act_op); \ GET_IR_NODE(act_out); -// Inherient the basic infomation from `base_desc`, and modify some fields. +// Inherit the basic information from `base_desc`, and modify some fields. 
framework::proto::OpDesc PrepareOpDesc( const framework::proto::OpDesc& base_desc, const std::string& bias, const std::string& activation, const std::string& output) { diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index b55bbbe5aea..35bdfde96bc 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -382,7 +382,7 @@ const VarDesc *FuseOptimizerOpPass::GetVarDescFromVarsInfo( const std::string &var_name) const { auto grad_iter = vars_info.find(var_name); PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, - "The gradient varibale %s is not found.", var_name); + "The gradient variable %s is not found.", var_name); PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, "The gradient var node %s is not found.", var_name); PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var(), diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 919364541e4..e0b7a4d3378 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -131,7 +131,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { } // The intermediate Nodes can only link to the nodes inside the pattern, or this -// subgraph will be droped. +// subgraph will be dropped. void GraphPatternDetector::ValidateByNodeRole( std::vector *subgraphs) { std::vector result; diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index a8720ff4bfb..b075cde3212 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -179,7 +179,7 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const { ir::Node* var = nullptr; auto updated_var = UpdateGradVarDesc(in_node->Var(), i, grad_names, bn_vars_need_rename); - // should be initialized by startup, how to initilize tensor in the + // should be initialized by startup, how to initialize tensor in the // scope? if (node->Name() == "batch_norm" && bn_vars_need_rename.find(in_node->Name()) != diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 79b50993556..935931b8150 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -1041,7 +1041,7 @@ void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { // There are 4 conditions: // 1. GPU && Reduce: Reduce gradient then broadcast gradient to other GPUS. // Need to broadcast received parameters to other GPU. - // 2. GPU && AllReduce: AllReduce all graident to each GPU. Need to + // 2. GPU && AllReduce: AllReduce all gradient to each GPU. Need to // broadcast received parameters to other GPU. // 3. CPU && AllReduce: AllReduce all gradient to each thread. Need to // broadcast received parameters to other scope. 
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 87a99afc9ae..cbb2c79c5c4 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -80,7 +80,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { PADDLE_ENFORCE_EQ( in_var_names.size(), out_var_names.size(), platform::errors::PreconditionNotMet( - "Op [%s]: Input var number shoule be equal with output var number", + "Op [%s]: Input var number should be equal with output var number", op_.Type())); for (size_t i = 0; i < in_var_names.size(); ++i) { @@ -663,7 +663,7 @@ void OpDesc::Flush() { void OpDesc::CheckAttrs() { PADDLE_ENFORCE(!Type().empty(), - "CheckAttr() can not be called before type is setted."); + "CheckAttr() can not be called before type is set."); auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); if (checker == nullptr) { // checker is not configured. That operator could be generated by Paddle, @@ -706,7 +706,7 @@ void OpDesc::InferShape(const BlockDesc &block) const { void OpDesc::InferVarType(BlockDesc *block) const { // There are a few places that var type can be set. // When VarDesc is created, default set to LOD_TENSOR. - // When output variable is created, default is defaut set to LOD_TENSOR. + // When output variable is created, it is set to LOD_TENSOR by default. // We limit here to be the only place that operator defines its customized // var type inference. Hence, we don't do any "default" setting here. auto &info = OpInfoMap::Instance().Get(this->Type()); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index df773c044bf..6ffe3d87136 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -654,7 +654,7 @@ class RuntimeInferShapeContext : public InferShapeContext { PADDLE_ENFORCE_EQ( in_var_list.size(), out_var_list.size(), platform::errors::PreconditionNotMet( - "Op [%s]: Input var size should be equal with ouput var size", + "Op [%s]: Input var size should be equal with output var size", op_.Type())); auto& out_var_names = op_.Outputs(out); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index f30620a0a7f..97d2dad06c8 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -53,8 +53,8 @@ constexpr char kEmptyVarName[] = "@EMPTY@"; constexpr char kTempVarName[] = "@TEMP@"; /// If a variable's name has a certain suffix, it means that the -/// variable is the gradient of another varibale. -/// e.g. Variable "x@GRAD" is the gradient of varibale "x". +/// variable is the gradient of another variable. +/// e.g. Variable "x@GRAD" is the gradient of variable "x". 
constexpr char kGradVarSuffix[] = "@GRAD"; constexpr size_t kGradVarSuffixSize = 5U; diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index 7bed06b0a3d..77c98a08cf0 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -340,7 +340,7 @@ class IndicateLoDTensorDataTypeTestProtoMaker : public OpProtoAndCheckerMaker { public: void Make() { AddInput("LoDTensor", "Input of Tensor type Variable."); - AddComment("This Op is only for IndicateVarDataType inferface test."); + AddComment("This Op is only for IndicateVarDataType interface test."); } }; @@ -362,7 +362,7 @@ class IndicateSelectedRowsDataTypeTestProtoMaker public: void Make() { AddInput("SelectedRows", "Input of SelectedRows type Variable."); - AddComment("This Op is only for IndicateVarDataType inferface test."); + AddComment("This Op is only for IndicateVarDataType interface test."); } }; @@ -382,7 +382,7 @@ class IndicateOtherDataTypeTestProtoMaker : public OpProtoAndCheckerMaker { public: void Make() { AddInput("Other", "Input of Other type Variable"); - AddComment("This Op is only for IndicateVarDataType inferface test."); + AddComment("This Op is only for IndicateVarDataType interface test."); } }; @@ -572,7 +572,7 @@ class GetSetLoDLevelTestMaker : public OpProtoAndCheckerMaker { void Make() { AddInput("X", "(LoDTensor) Input Variable."); AddOutput("Out", "(LoDTensor) Output Variable."); - AddComment("This Op is only for Get/SetLoDLevel inferface test."); + AddComment("This Op is only for Get/SetLoDLevel interface test."); } }; diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index 699e9eb01de..22f17d440e5 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -112,7 +112,7 @@ void RenameAndGetOutputs( std::unordered_map *output_name_map, const std::unordered_map &graph_var_map, bool trt_and_not_int8) { - //// In the normal case, the paddle-trt exists bug when runing the googlenet. + //// In the normal case, the paddle-trt exists bug when running the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the // paddle-tensorrt will do the merging optimization, which fuse those conv // into one conv, and then trigger bug. So, We should use strategy to avoid diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 397411ccf87..2b6418bbf8a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -223,7 +223,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( auto use_static_engine = Get("use_static_engine"); // TODO(NHZlX) // There are models with the same structure but the different parameters, - // when runing in the 'use_serialize' mode, there is a bug. + // when running in the 'use_serialize' mode, there is a bug. 
auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, std::to_string(0)); auto predictor_id = Get("predictor_id"); diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 8b379457a2d..c497ab384b5 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -137,7 +137,7 @@ std::unique_ptr Load(framework::Executor* executor, "model version %ld is not supported.", main_program->Version()); - // model_from_memory is false in seperate parameters. + // model_from_memory is false in separate parameters. LoadPersistables(executor, scope, *main_program, dirname, "", false /* model_from_memory */); return main_program; diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 97affafb4bf..3c48c8192f6 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -101,7 +101,7 @@ class TRTConvertValidation { DeclVar(name, dim_vec); } - // Declare a parameter varaible in the scope. + // Declare a parameter variable in the scope. void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) { DeclVar(name, dims, true); } diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 771ad702448..e7f7a842cf5 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -104,10 +104,9 @@ void TensorRTEngine::FreezeNetwork() { for (auto &t : all_t) { if (!quant_dynamic_range_.count(t)) { - VLOG(3) - << "We are in trt int8 mode(not calibration), scale not setted" - << " for tensor " << t->getName() - << ", this might be ok when trt does not need this range"; + VLOG(3) << "We are in trt int8 mode(not calibration), scale not set" + << " for tensor " << t->getName() + << ", this might be ok when trt does not need this range"; } } std::unordered_set all_out_t_name; diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 4f656ab165f..402815c7e63 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -172,7 +172,7 @@ class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { "(std::vector) A vector of tensors that is going to " "be casted to a big LoDTensor."); AddInput("RankTable", - "(LoDRankTable) RankTable provides the coarse lod infomation to " + "(LoDRankTable) RankTable provides the coarse lod information to " "build the output LoDTensor. 
See " "'paddle/framework/lod_rank_table.h' for more details."); AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array."); diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index 5b3dbcd65ec..e2cde218db3 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -132,7 +132,7 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), The accumulating times of previous window with " "shape [1]."); AddInput("in_num_updates", - "(Tensor), The total number of batches used by trainning " + "(Tensor), The total number of batches used by training " "before this batch with shape [1]."); AddOutput("out_sum_1", @@ -155,10 +155,9 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { "out_old_num_accumulates", "(Tensor) The accumulating times of previous window with " "shape [1]."); - AddOutput( - "out_num_updates", - "(Tensor), The total number of batches used by trainning " - "before this batch with shape [1]."); + AddOutput("out_num_updates", + "(Tensor), The total number of batches used by training " + "before this batch with shape [1]."); AddAttr("average_window", "(float, default 0) " diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h index 8e2f25dfcf5..8f6c9b60dca 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.h +++ b/paddle/fluid/operators/bilinear_tensor_product_op.h @@ -49,7 +49,7 @@ class BilinearTensorProductKernel : public framework::OpKernel { auto& place = *ctx.template device_context().eigen_device(); auto& dev_ctx = ctx.template device_context(); - // Create the intermediate variable to caculate the result of + // Create the intermediate variable to calculate the result of // Input(X) multiplied by Input(Weight_i), the formula is: // left_mul = X Weight_i. Tensor left_mul; diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 81f099d7c1c..39ba1054740 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -267,7 +267,7 @@ void Conv2DTransposeOpMaker::Make() { "workspace is a section of GPU memory which will be " "allocated/freed each time the operator runs, larger " "workspace size can increase performance but also requires " - "better hardward. This size should be carefully setted.") + "better hardward. This size should be carefully set.") .SetDefault(platform::GetDefaultConvWorkspaceSizeLimitMB()); AddComment(R"DOC( Convolution2D Transpose Operator. @@ -368,7 +368,7 @@ void Conv3DTransposeOpMaker::Make() { "workspace is a section of GPU memory which will be " "allocated/freed each time the operator runs, larger " "workspace size can increase performance but also requires " - "better hardward. This size should be carefully setted.") + "better hardward. This size should be carefully set.") .SetDefault(platform::GetDefaultConvWorkspaceSizeLimitMB()); AddComment(R"DOC( Convolution3D Transpose Operator. 
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index fc73f938a9b..5626d2bf655 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -36,7 +36,7 @@ class CropOp : public framework::OperatorWithKernel { auto shape = ctx->Attrs().Get>("shape"); PADDLE_ENFORCE_EQ( int64_t(shape.size()), x_dim.size(), - "Shape size should be equal to dimention size of input tensor."); + "Shape size should be equal to dimension size of input tensor."); std::vector tensor_shape(shape.size()); for (size_t i = 0; i < shape.size(); ++i) { tensor_shape[i] = static_cast(shape[i]); diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc index 83047ac8850..5a06c50c89f 100644 --- a/paddle/fluid/operators/crop_tensor_op.cc +++ b/paddle/fluid/operators/crop_tensor_op.cc @@ -82,7 +82,7 @@ class CropTensorOp : public framework::OperatorWithKernel { } PADDLE_ENFORCE_EQ(int64_t(shape.size()), x_dim.size(), "Attr(shape)'size of Op(crop_tensor) should be equal to " - "dimention size of input tensor."); + "dimension size of input tensor."); std::vector out_shape(shape.size(), -1); for (size_t i = 0; i < shape.size(); ++i) { if (shape[i] > 0) { diff --git a/paddle/fluid/operators/crop_tensor_op.h b/paddle/fluid/operators/crop_tensor_op.h index 5337510b4f0..4c6b70d889a 100644 --- a/paddle/fluid/operators/crop_tensor_op.h +++ b/paddle/fluid/operators/crop_tensor_op.h @@ -157,7 +157,7 @@ void CropTensorFunction(const framework::ExecutionContext& context) { // get shape from Input(ShapeTensor) of Input(Shape) std::vector shape = GetShape(context); - // out_dims setted by arrt(shape) + // out_dims set by attr(shape) if (shape.size() == 0) { for (int i = 0; i < out_dims.size(); ++i) { shape.push_back(out_dims[i]); diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index ab5d45b800d..0553619a8b4 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -203,7 +203,7 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "represents the cross entropy loss."); AddAttr("soft_label", "(bool, default false), a flag indicating whether to " - "interpretate the given labels as soft labels.") + "interpret the given labels as soft labels.") .SetDefault(false); AddAttr("ignore_index", "(int, default -100), Specifies a target value that is" diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc index be386a8eb84..8af29133f1a 100644 --- a/paddle/fluid/operators/ctc_align_op.cc +++ b/paddle/fluid/operators/ctc_align_op.cc @@ -63,7 +63,7 @@ class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker { "sequence in Output.") .AsDispensable(); AddAttr("blank", - "(int, default: 0), the blank label setted in Connectionist " + "(int, default: 0), the blank label set in Connectionist " "Temporal Classification (CTC) op.") .SetDefault(0); AddAttr("merge_repeated", diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 89cd5c69774..835bfcc484d 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -33,8 +33,8 @@ class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of cumsum operator"); AddOutput("Out", "Output of cumsum operator"); AddAttr("axis", - "The dimenstion to accumulate along. -1 means the last " - "dimenstion [default -1].") + "The dimension to accumulate along. 
-1 means the last " + "dimension [default -1].") .SetDefault(-1) .EqualGreaterThan(-1); AddAttr("exclusive", diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc index e76649e8283..e95fb0d45cc 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc @@ -67,7 +67,7 @@ class DeformablePSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "the number of groups which input channels are divided." "(eg.number of input channels is k1*k2*(C+1), which k1 and k2 " "are group width and height and C+1 is number of output " - "chanels. eg.(4, 6), which 4 is height of group and 6 is " + "channels. eg.(4, 6), which 4 is height of group and 6 is " "width of group"); AddAttr("pooled_height", "(int), " diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 070c822a7ed..28705a7a2bf 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -117,7 +117,7 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { .InEnum({"encode_center_size", "decode_center_size"}); AddAttr("box_normalized", "(bool, default true) " - "whether treat the priorbox as a noramlized box") + "whether treat the priorbox as a normalized box") .SetDefault(true); AddAttr("axis", "(int, default 0)" @@ -140,7 +140,7 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { "box_coder_op with shape [N, M, 4] representing the result of N " "target boxes encoded with M Prior boxes and variances. When " "code_type is 'decode_center_size', N represents the batch size " - "and M represents the number of deocded boxes."); + "and M represents the number of decoded boxes."); AddComment(R"DOC( diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index db69cf0301c..16753c42945 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -403,7 +403,7 @@ class GenerateMaskLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "each element is a bounding box with (xmin, ymin, xmax, ymax) format."); AddInput("LabelsInt32", "(LoDTensor), This intput is a 2D LoDTensor with shape [R, 1], " - "each element repersents a class label of a roi"); + "each element represents a class label of a roi"); AddOutput( "MaskRois", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. 
" @@ -411,7 +411,7 @@ class GenerateMaskLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); AddOutput("RoiHasMaskInt32", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], " - "each element repersents the output mask rois index with regard " + "each element represents the output mask rois index with regard " "to input rois"); AddOutput("MaskInt32", "(LoDTensor), This output is a 4D LoDTensor with shape [P, Q], " diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index b8195fbcc03..79780e0d4ee 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -521,11 +521,11 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); AddOutput("LabelsInt32", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], " - "each element repersents a class label of a roi"); + "each element represents a class label of a roi"); AddOutput("BboxTargets", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " "class_nums], " - "each element repersents a box label of a roi"); + "each element represents a box label of a roi"); AddOutput( "BboxInsideWeights", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc index 55012556e23..6f8a8b0a085 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op.cc @@ -63,7 +63,7 @@ class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker { "bottom coordinate of the box."); AddAttr("box_normalized", "(bool, default true) " - "whether treat the priorbox as a noramlized box") + "whether treat the priorbox as a normalized box") .SetDefault(true); AddOutput("Out", "(LoDTensor, the lod is same as input X) The output of " diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index ee0708312dd..36e9d602801 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ -393,7 +393,7 @@ class LocalityAwareNMSOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nms_top_k", "(int64_t) " "Maximum number of detections to be kept according to the " - "confidences aftern the filtering detections based on " + "confidences after the filtering detections based on " "score_threshold"); AddAttr("nms_threshold", "(float, default: 0.3) " diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 62d6bb3ac15..9cdc46b4a26 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -424,7 +424,7 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nms_top_k", "(int64_t) " "Maximum number of detections to be kept according to the " - "confidences aftern the filtering detections based on " + "confidences after the filtering detections based on " "score_threshold"); AddAttr("nms_threshold", "(float, default: 0.3) " diff --git a/paddle/fluid/operators/detection/target_assign_op.cc 
b/paddle/fluid/operators/detection/target_assign_op.cc index 7c187066c66..3c02796de01 100644 --- a/paddle/fluid/operators/detection/target_assign_op.cc +++ b/paddle/fluid/operators/detection/target_assign_op.cc @@ -44,7 +44,7 @@ class TargetAssignOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(neg_dims.size(), 2, "The rank of Input(NegIndices) must be 2."); PADDLE_ENFORCE_EQ(neg_dims[1], 1, - "The last dimenstion of Out(NegIndices) must be 1."); + "The last dimension of Out(NegIndices) must be 1."); } auto n = mi_dims[0]; diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 5ffcfc0458f..dc7c465aae7 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -111,15 +111,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of YOLOv3 loss operator, " "This is a 4-D tensor with shape of [N, C, H, W]." - "H and W should be same, and the second dimention(C) stores" + "H and W should be same, and the second dimension(C) stores" "box locations, confidence score and classification one-hot" "keys of each anchor box"); AddInput("GTBox", "The input tensor of ground truth boxes, " "This is a 3-D tensor with shape of [N, max_box_num, 5], " "max_box_num is the max number of boxes in each image, " - "In the third dimention, stores x, y, w, h coordinates, " - "x, y is the center cordinate of boxes and w, h is the " + "In the third dimension, stores x, y, w, h coordinates, " + "x, y is the center coordinate of boxes and w, h is the " "width and height and x, y, w, h should be divided by " "input image height to scale to [0, 1]."); AddInput("GTLabel", diff --git a/paddle/fluid/operators/elementwise/test_elementwise_mul_op_dim.cc b/paddle/fluid/operators/elementwise/test_elementwise_mul_op_dim.cc index 4477aa0a3f8..7443c142d0f 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_mul_op_dim.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_mul_op_dim.cc @@ -79,7 +79,7 @@ TEST(ElementwiseMulOpTester, correct_dims) { MainTest(test_data); } -// Checks if AreDimsAndFormatCorrect fails when channel_num is not divisable by +// Checks if AreDimsAndFormatCorrect fails when channel_num is not divisible by // 16 TEST(ElementwiseMulOpTester, incorrect_channel_num) { TestData test_data; diff --git a/paddle/fluid/operators/fused/fusion_group_op.cc b/paddle/fluid/operators/fused/fusion_group_op.cc index 5880c3b317e..503c0355855 100644 --- a/paddle/fluid/operators/fused/fusion_group_op.cc +++ b/paddle/fluid/operators/fused/fusion_group_op.cc @@ -76,7 +76,7 @@ class FusionGroupOpMaker : public framework::OpProtoAndCheckerMaker { fusion_group Operator. It is used to execute a generated CUDA kernel which fuse the computation of -multiple operators into one. It supports serveral types: +multiple operators into one. It supports several types: 0, fused computation of elementwise operations in which all the dims of inputs and outputs should be exactly the same. 
)DOC"); diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 71202b26443..17cb4556d45 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -76,7 +76,7 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { } } - // Since concat is aftern flatten, the output is 2D tensor. + // Since concat is after flatten, the output is 2D tensor. // If concat_axis is 0, each input's permutated tensor is continuous. // If concat_axis is 1, the stride of 0-th dim of each input's // permutated tensor is odims()[1]. diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 244de0e3552..58476efa976 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -84,7 +84,7 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { "Grid", "(Tensor) The input grid of GridSampleOp generated by AffineGridOp, " "This is a 4-D tensor with shape of [N, H, W, 2] is the concatenation " - "of x and y coordinates with shape [N, H, W] in last dimention"); + "of x and y coordinates with shape [N, H, W] in last dimension"); AddOutput("Output", "(Tensor) Output tensor with shape [N, C, H, W]"); AddAttr( "use_cudnn", @@ -93,11 +93,11 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operation samples input X by using bilinear interpolation based on - flow field grid, which is usually gennerated by affine_grid. The grid of + flow field grid, which is usually generated by affine_grid. The grid of shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates with shape [N, H, W] each, where grid_x is indexing the 4th dimension - (in width dimension) of input data x and grid_y is indexng the 3rd - dimention (in height dimension), finally results is the bilinear + (in width dimension) of input data x and grid_y is indexing the 3rd + dimension (in height dimension), finally results is the bilinear interpolation value of 4 nearest corner points. Step 1: diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index da413dba646..114cc64edde 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -113,7 +113,7 @@ class GRUOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddOutput( "BatchResetHiddenPrev", - "(LoDTensor) The reseted hidden state LoDTensor organized in batches. " + "(LoDTensor) The reset hidden state LoDTensor organized in batches. 
" "This LoDTensor is a matrix with shape (T X D) and has the same LoD " "with `BatchGate`.") .AsIntermediate(); diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc index c5f7f7b3ff4..038ce5a7ae8 100644 --- a/paddle/fluid/operators/gru_unit_op.cc +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -97,7 +97,7 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddOutput("ResetHiddenPrev", "(Tensor) Matrix with shape [batch_size, frame_size] for the " - "reseted hidden state of previous time step.") + "reset hidden state of previous time step.") .AsIntermediate(); AddOutput("Hidden", "(Tensor) The GRU hidden state of the current time step " diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 8028b20e06d..bed9b815879 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -144,7 +144,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddOutput( "W_Out", - "(LoDTensor, optinal) using input 'W' as Output to make it mutable" + "(LoDTensor, optional) using input 'W' as Output to make it mutable" "When we are using prefetch") .AsIntermediate(); AddAttr("num_classes", "(int, optional), The number of classes") diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index cc753b1f91f..c68e97cf98a 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -285,7 +285,7 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { interpolation. Nearest neighbor interpolation is to perform nearest neighbor interpolation - in both the 3rd dimention(in height direction) and the 4th dimention(in width + in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. Bilinear interpolation is an extension of linear interpolation for @@ -299,7 +299,7 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. - Align_corners and align_mode are optinal parameters,the calculation method + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. Example: diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 099003586d7..f9be26a6ee8 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -296,7 +296,7 @@ $$ Function implementation: -Inputs and outpus are in NCHW or NHWC format, while input.shape.ndims() equals 4. +Inputs and outputs are in NCHW or NHWC format, while input.shape.ndims() equals 4. If NCHW, the dimensions 0 ~ 3 represent batch size, feature maps, rows, and columns, respectively. 
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index c399cb5d44a..410adc7b283 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -105,7 +105,7 @@ class SimpleCode { SimpleCode(size_t code, size_t num_classes, const int64_t* ids) : c_(static_cast(ids[code]) + num_classes) {} /** - * Here the id of root shoud be 1 rather than 0, thus the encoding of class c + * Here the id of root should be 1 rather than 0, thus the encoding of class c * is `c + num_classes` and all siblings can get the same weight indice using * prefixes. * Weight index is the prefixes of encoding, thus leave out the right most diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 4fafe439edc..c805d541979 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -129,19 +129,19 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { "CustomDistProbs", "(Tensor) It is used in 'CostumDist' sampler. " "It is a tensor with shape [num_total_classes]." - "The i-th element is the probsbility of the i-th class being sampled.") + "The i-th element is the probability of the i-th class being sampled.") .AsDispensable(); AddInput( "CustomDistAlias", "(Tensor) It is used in 'CostumDist' sampler. " "It is a tensor with shape [num_total_classes]." - "The i-th element is the probsbility of the i-th class being sampled.") + "The i-th element is the probability of the i-th class being sampled.") .AsDispensable(); AddInput( "CustomDistAliasProbs", "(Tensor) It is used in 'CostumDist' sampler. " "It is a tensor with shape [num_total_classes]." - "The i-th element is the probsbility of the i-th class being sampled.") + "The i-th element is the probability of the i-th class being sampled.") .AsDispensable(); AddOutput("Cost", diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 4fb4f7a022f..160905834ae 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -36,7 +36,7 @@ class PadConstantLikeOp : public framework::OperatorWithKernel { auto y_dim = ctx->GetInputDim("Y"); PADDLE_ENFORCE_EQ(x_dim.size(), y_dim.size(), - "The dimention of X and Y should be the same."); + "The dimension of X and Y should be the same."); for (int i = 0; i < x_dim.size(); ++i) { if ((!ctx->IsRuntime()) && ((x_dim[i] == -1) || (y_dim[i] == -1))) { @@ -164,7 +164,7 @@ class PadConstantLikeOpGrad : public framework::OperatorWithKernel { auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ(dout_dim.size(), y_dim.size(), - "The dimention of X and Y should be the same."); + "The dimension of X and Y should be the same."); auto y_grad_name = framework::GradVarName("Y"); if (ctx->HasOutput(y_grad_name)) { diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index caf6892a987..e6cafb2584e 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -325,7 +325,7 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_EQ(rois->lod().empty(), false, platform::errors::InvalidArgument( - "the lod of Input ROIs shoule not be empty when " + "the lod of Input ROIs should not be empty when " "BatchRoINums is None!")); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; diff --git 
a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index 25f45d0b2c9..5ec846c1473 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -293,7 +293,7 @@ class CPUPRROIPoolOpKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_EQ(rois->lod().empty(), false, platform::errors::InvalidArgument( - "the lod of Input ROIs shoule not be empty when " + "the lod of Input ROIs should not be empty when " "BatchRoINums is None!")); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 3da7c88f1fe..8a06f011a02 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -24,8 +24,8 @@ namespace operators { // Returns true if the two dimensions are compatible. // A dimension is compatible with the other if: // 1. The length of the dimensions are same. -// 2. Each non-negative number of the two dimentions are same. -// 3. For negative number in a dimention, it means unknown so it is compatible +// 2. Each non-negative number of the two dimensions are same. +// 3. For negative number in a dimension, it means unknown so it is compatible // with any number. bool DimensionIsCompatibleWith(const framework::DDim& first, const framework::DDim& second) { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index de021c8b455..d17e6b65cdf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -174,7 +174,7 @@ class ReduceOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( dims.size(), 0, "ShapeError: The input dim dimensions of Reduce " - "shoud be greater than 0. But received the dim dimesions of Reduce " + "should be greater than 0. But received the dim dimensions of Reduce " " = %d", dims.size()); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 7516909f451..cc4635fefdd 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -162,7 +162,7 @@ class ReshapeOp : public framework::OperatorWithKernel { shape[i], 0, platform::errors::InvalidArgument( "Each dimension value of 'shape' in ReshapeOp must not " - "be negtive except one unknown dimension. " + "be negative except one unknown dimension. " "But received shape = [%s], shape[%d] = %d.", framework::make_ddim(shape), i, shape[i])); } @@ -234,7 +234,7 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor, optional). Target shape of reshape operator. " "It has a higher priority than Attr(shape) but a lower priority " "than Input(ShapeTensor). The Attr(shape) still should be " - "set correctly to gurantee shape inference in compile time.") + "set correctly to guarantee shape inference in compile time.") .AsDispensable(); AddInput( "ShapeTensor", @@ -288,7 +288,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. 3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while -Attr(shape) still should be set correctly to gurantee shape inference in +Attr(shape) still should be set correctly to guarantee shape inference in compile-time. 
)DOC"); diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 12dadc4eb92..9d82017bfd2 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -86,7 +86,7 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Updates", "The updated value of scatter op"); AddOutput("Out", "The output of scatter op"); AddAttr("overwrite", - "(bool, defalut: True) " + "(bool, default: True) " "The mode that updating the output when has same index," "If True, use the overwrite mode to update the output" "of the same index, if False, use the accumulate mode to" diff --git a/paddle/fluid/operators/select_input_op.cc b/paddle/fluid/operators/select_input_op.cc index 33a5ff99a5d..be0d8a13849 100644 --- a/paddle/fluid/operators/select_input_op.cc +++ b/paddle/fluid/operators/select_input_op.cc @@ -67,7 +67,7 @@ class SelectInputOpProtoMaker : public framework::OpProtoAndCheckerMaker { // Because this op is blocking whole control flow. I am implementing MVP // (minimal viable product) here. AddComment(R"DOC( -Merge branches of LoDTensor into a single Output with a mask interger +Merge branches of LoDTensor into a single Output with a mask integer specifying the output branchi. )DOC"); } diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index 3c6d36a0a61..df2176429bb 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -118,7 +118,7 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { "sequences before padding."); AddAttr( "padded_length", - "The length of padded sequences. It can be setted to -1 or " + "The length of padded sequences. It can be set to -1 or " "any positive int. When it is -1, all sequences will be padded up to " "the length of the longest one among them; when it a certain positive " "value, it must be greater than the length of the longest original " diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 09dba540282..10f382c9f06 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -54,7 +54,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp"); AddOutput("Out", "(Tensor) The output of SequencePoolOp does not contain LoD " - "infomation."); + "information."); AddOutput("MaxIndex", "(Tensor) This tensor is used for the sequence max-pooling " "to record the max indexes.") @@ -93,7 +93,7 @@ Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2. Besides, for the sake of simplicity, we assume M=1 and N=1, and the value of X = [[1, 3], [2, 4, 6], [5, 1]]. -Thus, Out is a [3,1,1] Tensor without LoD infomation. +Thus, Out is a [3,1,1] Tensor without LoD information. 
And for different pooltype, the value of Out is as follows: - AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2 diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc index b442c41eed1..38b70d07c54 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc @@ -63,7 +63,7 @@ class SequenceTopkAvgPoolingOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput( "Out", "(Tensor) The output of SequenceTopkPoolingOp does not contain LoD " - "infomation."); + "information."); AddOutput("pos", "(Tensor) store the topk index ").AsIntermediate(); AddAttr>("topks", "topks"); AddAttr("channel_num", "channel number"); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 2b3c5a09406..d7ae82c783b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -96,7 +96,7 @@ class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { [ 6.0, 7.0, 8.0, 9.0, 10.0], [11.0, 12.0, 13.0, 14.0, 15.0]], ` - in which there are 3 sequences padded to length 5, and the acutal length + in which there are 3 sequences padded to length 5, and the actual length specified by Input(Length): Length.data = [2, 3, 4], diff --git a/paddle/fluid/operators/shard_index_op.cc b/paddle/fluid/operators/shard_index_op.cc index a02d0367159..3c1de753acf 100644 --- a/paddle/fluid/operators/shard_index_op.cc +++ b/paddle/fluid/operators/shard_index_op.cc @@ -63,7 +63,7 @@ class ShardIndexOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nshards", "A positive integer to specify the number of shards."); AddAttr("shard_id", "The current shard id"); - AddAttr("ignore_value", "An ingeter value out of sharded range") + AddAttr("ignore_value", "An integer value out of sharded range") .SetDefault(-1); AddComment(R"DOC( This layer creates the sharded index for input. This layers is used in @@ -80,7 +80,7 @@ to y = x % shard_size if x / shard_size == shard_id else ignore_value We take the distributed one-hot representation to show what this layer is -used for. The distributed one-hot representation is seperated into multiple +used for. The distributed one-hot representation is separated into multiple shards, and each shard is filling zeros except the one with the index inside. In order to create these sharded representation in each trainer, the original index should be recalculated (i.e. sharded) before. diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index c9b0795ddb0..16e1e17ca53 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -73,12 +73,12 @@ class ShrinkRNNMemoryOp : public ArrayOp { class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "(LoDTensor) The RNN step memory to be shrinked."); + AddInput("X", "(LoDTensor) The RNN step memory to be shrunk."); AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN."); AddInput("I", "(LoDTensor) The step index. 
The RNN step memory 'X' will be " - "shrinked to match the size of the input of the index'th step."); - AddOutput("Out", "(LoDTensor) The shrinked RNN step memory."); + "shrunk to match the size of the input of the index'th step."); + AddOutput("Out", "(LoDTensor) The shrunk RNN step memory."); AddComment(R"DOC( This operator is used to shrink output batch of memory defined in dynamic RNN. diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index ebb299ba1f3..6e91ea44699 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -31,7 +31,7 @@ class SoftmaxWithCrossEntropyOpMaker "by softmax."); AddInput( "Label", - "(Tensor) The input tesnor of groud truth label. If :attr:`soft_label` " + "(Tensor) The input tensor of ground truth label. If :attr:`soft_label` " "is set to false, Label is a Tensor in same shape with " "Input(Logits) except the shape in dimension :attr:`axis` as 1. If " "soft_label is set to true, Label is a Tensor in same " @@ -50,7 +50,7 @@ class SoftmaxWithCrossEntropyOpMaker "entropy loss."); AddAttr( "soft_label", - "(bool, default: false), A flag to indicate whether to interpretate " + "(bool, default: false), A flag to indicate whether to interpret " "the given labels as soft labels.") .SetDefault(false); AddAttr( diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 8f725be665b..dbda4b9b7e0 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -100,7 +100,7 @@ where: Therefore, the calculation can be separated into 3 steps: Step 1: row-wise operation to calculate max_i Step 2: row-wise operation to calculate logDiffMaxSum_i -Step 3: caculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i +Step 3: calculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i To save memory, we can share memory among max_i, logDiffMaxSum_i and cross\_entropy_i. In this way, the 3 steps should be changed to: diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc index 2cdcaeaf09d..7527424aac5 100644 --- a/paddle/fluid/operators/spectral_norm_op.cc +++ b/paddle/fluid/operators/spectral_norm_op.cc @@ -93,7 +93,7 @@ class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("U", "The weight_u tensor of spectral_norm operator, " "This can be a 1-D tensor in shape [H, 1]," - "H is the 1st dimentions of Weight after reshape" + "H is the 1st dimension of Weight after reshape" "corresponding by Attr(dim). As for Attr(dim) = 1" "in conv2d layer with weight shape [M, C, K1, K2]" "Weight will be reshape to [C, M*K1*K2], U will" @@ -101,7 +101,7 @@ class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("V", "The weight_v tensor of spectral_norm operator, " "This can be a 1-D tensor in shape [W, 1], " - "W is the 2nd dimentions of Weight after reshape " + "W is the 2nd dimension of Weight after reshape " "corresponding by Attr(dim). 
As for Attr(dim) = 1 " "in conv2d layer with weight shape [M, C, K1, K2] " "Weight will be reshape to [C, M*K1*K2], V will " diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 0e6781aa1c9..3a33c8be101 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -276,7 +276,7 @@ class TensorRTEngineOp : public framework::OperatorBase { "size(%d).\n" "There are two possible causes for this problem: \n" "1. Check whether the runtime batch is larger than the max_batch " - "setted by EnableTensorrtEngine()\n" + "set by EnableTensorrtEngine()\n" "2. Check whether the model you are running has multiple trt " "subgraphs: \n " "\tIf there are multiple trt subgraphs, you need to ensure that " diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 6f0f5230019..394a89a0c07 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -51,7 +51,7 @@ class UnfoldOpMaker : public framework::OpProtoAndCheckerMaker { This Operator is used to extract sliding local blocks from a batched input tensor, also known as im2col when operated on batched 2D image tensor. For each block under the convolution filter, -all element will be rearranged as a column. While the convolution filter silding over the input +all element will be rearranged as a column. While the convolution filter sliding over the input feature map, a series of such columns will be formed. )DOC"); } diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 566fe662156..df617742317 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -177,7 +177,7 @@ class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { "according to " "this given shape. It means that it has a higher priority than " "the shape attribute, while the shape attribute still should be " - "set correctly to gurantee shape inference in compile time.") + "set correctly to guarantee shape inference in compile time.") .AsDispensable(); AddInput("ShapeTensorList", "(vector> or vector>, optional). " diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index 0a0f7af6d9e..543a6fb73d6 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -153,7 +153,7 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_LT(static_cast(axes.size()), 6, "Invalid dimensions, dynamic dimensions should be " "within [1, 6] dimensions (Eigen limit)."); - // Validity Check: the range of unsqueeze aixs. + // Validity Check: the range of unsqueeze axis. for (int axis : axes) { PADDLE_ENFORCE_LT(axis, 6, "Invalid dimensions, input axis should be" diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index 04217e0ff20..d52889e0e54 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -123,10 +123,10 @@ An operator integrating the open-source https://arxiv.org/pdf/1512.02595v1.pdf), to compute Connectionist Temporal Classification (CTC) loss. It can be aliased as softmax with ctc, since a native softmax activation is -interated to the warp-ctc library, to to normlize values for each row of the +interated to the warp-ctc library, to to normalize values for each row of the input tensor. 
-More detail of CTC loss can be found by refering to +More detail of CTC loss can be found by referring to [Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks]( http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf). diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 74ab56c07cf..609bc4245e9 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -50,7 +50,7 @@ void PrintCuptiHint() { static bool showed = false; if (showed) return; showed = true; - LOG(WARNING) << "Invalid timestamp occured. Please try increasing the " + LOG(WARNING) << "Invalid timestamp occurred. Please try increasing the " "FLAGS_multiple_of_cupti_buffer_size."; } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index dc1a1041839..beeae143f5f 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -226,7 +226,7 @@ void BindImperative(py::module *m_ptr) { BackwardStrategy is a descriptor of how to run the backward process. **Note**: - **This API is only avaliable in** `Dygraph <../../user_guides/howto/dygraph/DyGraph.html>`_ **Mode** + **This API is only available in** `Dygraph <../../user_guides/howto/dygraph/DyGraph.html>`_ **Mode** Attribute: **sort_sum_gradient**: @@ -339,7 +339,7 @@ void BindImperative(py::module *m_ptr) { }, R"DOC( **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Returns a numpy array shows the value of current :ref:`api_guide_Variable_en` @@ -375,7 +375,7 @@ void BindImperative(py::module *m_ptr) { }, py::return_value_policy::copy, R"DOC( **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Returns a new Variable, detached from the current graph. @@ -402,7 +402,7 @@ void BindImperative(py::module *m_ptr) { .def("clear_gradient", &imperative::VarBase::ClearGradient, R"DOC( **Notes**: - **1. This API is ONLY avaliable in Dygraph mode** + **1. This API is ONLY available in Dygraph mode** **2. Use it only Variable has gradient, normally we use this for Parameters since other temporal Variable will be deleted by Python's GC** diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py index f58491243be..22ecfac953f 100644 --- a/python/paddle/dataset/movielens.py +++ b/python/paddle/dataset/movielens.py @@ -224,7 +224,7 @@ def max_job_id(): def movie_categories(): """ - Get movie categoriges dictionary. + Get movie categories dictionary. 
""" __initialize_meta_info__() return CATEGORIES_DICT diff --git a/python/paddle/dataset/mq2007.py b/python/paddle/dataset/mq2007.py index d5740f30c89..cfabd09705b 100644 --- a/python/paddle/dataset/mq2007.py +++ b/python/paddle/dataset/mq2007.py @@ -150,7 +150,7 @@ def gen_plain_txt(querylist): gen plain text in list for other usage Paramters: -------- - querylist : querylist, one query match many docment pairs in list, see QueryList + querylist : querylist, one query match many document pairs in list, see QueryList return : ------ @@ -171,7 +171,7 @@ def gen_point(querylist): gen item in list for point-wise learning to rank algorithm Paramters: -------- - querylist : querylist, one query match many docment pairs in list, see QueryList + querylist : querylist, one query match many document pairs in list, see QueryList return : ------ @@ -190,9 +190,9 @@ def gen_pair(querylist, partial_order="full"): gen pair for pair-wise learning to rank algorithm Paramters: -------- - querylist : querylist, one query match many docment pairs in list, see QueryList + querylist : querylist, one query match many document pairs in list, see QueryList pairtial_order : "full" or "neighbour" - there is redudant in all possiable pair combinations, which can be simplifed + there is redundant in all possible pair combinations, which can be simplified gen pairs for neighbour items or the full partial order pairs return : @@ -233,7 +233,7 @@ def gen_list(querylist): gen item in list for list-wise learning to rank algorithm Paramters: -------- - querylist : querylist, one query match many docment pairs in list, see QueryList + querylist : querylist, one query match many document pairs in list, see QueryList return : ------ @@ -268,7 +268,7 @@ def query_filter(querylists): def load_from_text(filepath, shuffle=False, fill_missing=-1): """ - parse data file into querys + parse data file into queries """ prev_query_id = -1 querylists = [] diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index b9d46f74fd9..c4be745d524 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -13,18 +13,18 @@ # limitations under the License. """ paddle.distributed.launch is a module that spawns multiple distributed -process on each trainning node for gpu trainning. +process on each training node for gpu training. Usage: In both of single node training or multiple node training, this module launch a process on each of the given gpu card. - 1. for single node trainning with all visible gpu cards: + 1. for single node training with all visible gpu cards: python -m paddle.distributed.launch \ your_training_py (arg1 arg2 and all others) - 2. for single node trainning with [0,4) cards + 2. for single node training with [0,4) cards python -m paddle.distributed.launch --selected_gpus="0,1,2,3" \ your_training_py (arg1 arg2 and all others) - 3. for mulitple node training such as two node:192.168.0.16, 192.168.0.17 + 3. for multiple node training such as two node:192.168.0.16, 192.168.0.17 on 192.168.0.16: python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \ --node_ip=192.168.0.16 \ @@ -114,14 +114,14 @@ POD_IP (current node ip address, not needed for local training) "--selected_gpus", type=str, default=None, - help="It's for gpu trainning and the trainning process will run on the selected_gpus," - "each process is bound to a single GPU. And if it's not setted, this module will use all the gpu cards for training." 
+ help="It's for gpu training and the training process will run on the selected_gpus," + "each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training." ) parser.add_argument( "--log_dir", type=str, - help="The path for each process's log.If it's not setted, the log will printed to default pipe." + help="The path for each process's log.If it's not set, the log will printed to default pipe." ) #positional diff --git a/python/paddle/distributed/launch_ps.py b/python/paddle/distributed/launch_ps.py index f8489965e71..49b6dccc98e 100644 --- a/python/paddle/distributed/launch_ps.py +++ b/python/paddle/distributed/launch_ps.py @@ -61,7 +61,7 @@ def parse_args(): "--log_dir", default="logs", type=str, - help="The path for each process's log.If it's not setted, the log will printed to default pipe." + help="The path for each process's log.If it's not set, the log will printed to default pipe." ) # positional diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index bbe83493d7e..7a2f2016908 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -832,7 +832,7 @@ def _append_backward_ops_(block, target_block(Block): the block which is going to hold new generated grad ops no_grad_dict(dict): key(int) block index - val(set) a set of varibale names. These varibales have no gradient + val(set) a set of variable names. These variables have no gradient grad_to_var(dict)(output argument): key(str): grad variable name val(str): corresponding forward variable name diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 2687690978d..d89b1cb41d8 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -116,7 +116,7 @@ def var_conv_2d(input, """ The var_conv_2d layer calculates the output base on the :attr:`input` with variable length, row, col, input channel, filter size and strides. Both :attr:`input`, :attr:`row`, - and :attr:`col` are 1-level LodTensor. The covolution operation is same as conv2d layer with + and :attr:`col` are 1-level LodTensor. The convolution operation is same as conv2d layer with padding. Besides, input.dims[1] should be 1. .. code-block:: text @@ -133,9 +133,9 @@ def var_conv_2d(input, output.dims = [174, 1] # where 174 = 90 + 84 Args: - input (Variable): The input shoud be 1-level LodTensor with dims[1] equals 1. - row (Variable): The row shoud be 1-level LodTensor to provide height information. - col (Variable): The col shoud be 1-level LodTensor to provide width information. + input (Variable): The input should be 1-level LodTensor with dims[1] equals 1. + row (Variable): The row should be 1-level LodTensor to provide height information. + col (Variable): The col should be 1-level LodTensor to provide width information. input_channel (int): The number of input channel. output_channel (int): The number of output channel. filter_size (int|tuple|None): The filter size. If filter_size is a tuple, @@ -325,9 +325,9 @@ def sequence_topk_avg_pooling(input, row, col, topks, channel_num): Args: input (Variable): The input should be 2D LodTensor with dims[1] equals 1. - row (Variable): The row shoud be 1-level LodTensor to provide the height information + row (Variable): The row should be 1-level LodTensor to provide the height information of the input tensor data. 
- col (Variable): The col shoud be 1-level LodTensor to provide the width information + col (Variable): The col should be 1-level LodTensor to provide the width information of the input tensor data. topks (list): A list of incremental value to average the topk feature. channel_num (int): The number of input channel. @@ -555,7 +555,7 @@ def multiclass_nms2(bboxes, low confidence score. If not provided, consider all boxes. nms_top_k (int): Maximum number of detections to be kept according to - the confidences aftern the filtering detections based + the confidences after the filtering detections based on score_threshold. nms_threshold (float): The threshold to be used in NMS. Default: 0.3 nms_eta (float): The threshold to be used in NMS. Default: 1.0 diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py index 17fe99ca4e6..603aa72a5a5 100644 --- a/python/paddle/fluid/contrib/layers/rnn_impl.py +++ b/python/paddle/fluid/contrib/layers/rnn_impl.py @@ -181,7 +181,7 @@ def basic_gru(input, sequence_length (Variabe|None): A Tensor (shape [batch_size]) stores each real length of each instance, This tensor will be convert to a mask to mask the padding ids If it's None means NO padding ids - dropout_prob(float|0.0): Dropout prob, dropout ONLY works after rnn output of earch layers, + dropout_prob(float|0.0): Dropout prob, dropout ONLY works after rnn output of each layers, NOT between time steps bidirectional (bool|False): If it is bidirectional batch_first (bool|True): The shape format of the input and output tensors. If true, @@ -411,7 +411,7 @@ def basic_lstm(input, sequence_length (Variabe|None): A tensor (shape [batch_size]) stores each real length of each instance, This tensor will be convert to a mask to mask the padding ids If it's None means NO padding ids - dropout_prob(float|0.0): Dropout prob, dropout ONLY work after rnn output of earch layers, + dropout_prob(float|0.0): Dropout prob, dropout ONLY work after rnn output of each layers, NOT between time steps bidirectional (bool|False): If it is bidirectional batch_first (bool|True): The shape format of the input and output tensors. If true, diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py index 1f7ec69dd75..b5d85616cf0 100644 --- a/python/paddle/fluid/contrib/memory_usage_calc.py +++ b/python/paddle/fluid/contrib/memory_usage_calc.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -This module privides a memory usage calculate function for user. +This module provides a memory usage calculate function for user. The purpose of this API is to allow users to estimate memory usage of a program under a special batch size, then user can set appropriate batch size to fully utilize a GPU. @@ -91,8 +91,9 @@ def memory_usage(program, batch_size): for x in var.shape: if x < 0: if neg_dim_count >= 1: - raise ValueError("Var %s has more than one negtive dim." - % (var_name)) + raise ValueError( + "Var %s has more than one negative dim." 
% + (var_name)) neg_dim_count += 1 data_count *= batch_size * (-x) else: diff --git a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py index 471a796eb3e..807d3c6a430 100644 --- a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py +++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py @@ -147,7 +147,7 @@ class QuantizeTranspiler(object): """Rewrites a training input program in place for simulated quantization. Insert fake quantization and de-quantization ops into program to simulate the error introduced by quantization. And change - the graident ops' input by using the faked quantization weights and + the gradient ops' input by using the faked quantization weights and activation. Since the program is transformed in place, the graph connection will change. diff --git a/python/paddle/fluid/contrib/slim/core/compressor.py b/python/paddle/fluid/contrib/slim/core/compressor.py index 0faac37b493..6d87a871ed2 100644 --- a/python/paddle/fluid/contrib/slim/core/compressor.py +++ b/python/paddle/fluid/contrib/slim/core/compressor.py @@ -302,7 +302,7 @@ class Compressor(object): this optimizer is used to minimize the combined loss of student-net and teacher-net while train_optimizer is used to minimize loss of student-net in fine-tune stage. - search_space(slim.nas.SearchSpace): The instance that define the searching space. It must inherite + search_space(slim.nas.SearchSpace): The instance that define the searching space. It must inherit slim.nas.SearchSpace class and overwrite the abstract methods. log_period(int): The period of print log of training. @@ -551,7 +551,7 @@ class Compressor(object): def run(self): """ - Execute compressiong pass. + Execute compressing pass. """ context = Context( place=self.place, diff --git a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py index 1c01eb82d7f..4a0e8ef005a 100644 --- a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py +++ b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py @@ -63,7 +63,7 @@ class VarWrapper(object): def shape(self): """ - Get the shape of the varibale. + Get the shape of the variable. """ return self._var.shape @@ -152,13 +152,13 @@ class OpWrapper(object): def inputs(self, name): """ - Get all the varibales by the input name. + Get all the variables by the input name. """ return [self._graph.var(var_name) for var_name in self._op.input(name)] def outputs(self, name): """ - Get all the varibales by the output name. + Get all the variables by the output name. """ return [self._graph.var(var_name) for var_name in self._op.output(name)] @@ -233,7 +233,7 @@ class GraphWrapper(object): """ Whether the given variable is parameter. Args: - var(VarWrapper): The given varibale. + var(VarWrapper): The given variable. """ return isinstance(var._var, Parameter) @@ -241,7 +241,7 @@ class GraphWrapper(object): """ Whether the given variable is persistable. Args: - var(VarWrapper): The given varibale. + var(VarWrapper): The given variable. """ return var._var.persistable @@ -397,7 +397,7 @@ class GraphWrapper(object): """ Get a new graph for training by appending some backward operators and optimization operators. Args: - optimizer: The optimzier used to generate training graph. + optimizer: The optimizer used to generate training graph. place: The place to run the graph. scope: The scope used to run the graph. Some new variable will be added into this scope. 
no_grad_var_names(list): Names of variables that should be ignored while computing gradients. default: []. diff --git a/python/paddle/fluid/contrib/slim/nas/controller_server.py b/python/paddle/fluid/contrib/slim/nas/controller_server.py index 65cfbd7d86f..3b5323a3ca4 100644 --- a/python/paddle/fluid/contrib/slim/nas/controller_server.py +++ b/python/paddle/fluid/contrib/slim/nas/controller_server.py @@ -27,7 +27,7 @@ _logger = get_logger( class ControllerServer(object): """ - The controller wrapper with a socket server to handle the request of search agentt. + The controller wrapper with a socket server to handle the request of search agent. """ def __init__(self, diff --git a/python/paddle/fluid/contrib/slim/prune/auto_prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/auto_prune_strategy.py index f9dce622da2..c758c2b3da1 100644 --- a/python/paddle/fluid/contrib/slim/prune/auto_prune_strategy.py +++ b/python/paddle/fluid/contrib/slim/prune/auto_prune_strategy.py @@ -53,7 +53,7 @@ class AutoPruneStrategy(PruneStrategy): metric_name(str): The metric used to evaluate the model. It should be one of keys in out_nodes of graph wrapper. Default: 'top1_acc' pruned_params(str): The pattern str to match the parameter names to be pruned. Default: 'conv.*_weights' - retrain_epoch(int): The training epochs in each seaching step. Default: 0 + retrain_epoch(int): The training epochs in each searching step. Default: 0 uniform_range(int): The token range in each position of tokens generated by controller. None means getting the range automatically. Default: None. init_tokens(list): The initial tokens. None means getting the initial tokens automatically. Default: None. """ diff --git a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py index bbdebf3e538..8d9020dd95e 100644 --- a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py +++ b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py @@ -741,7 +741,7 @@ class SensitivePruneStrategy(PruneStrategy): def _format_sensitivities(self, sensitivities): """ - Print formated sensitivities in debug log level. + Print formatted sensitivities in debug log level. """ tb = pt.PrettyTable() tb.field_names = ["parameter", "size"] + [ diff --git a/python/paddle/fluid/contrib/slim/prune/pruner.py b/python/paddle/fluid/contrib/slim/prune/pruner.py index 506b8fbe1de..368e7831b3d 100644 --- a/python/paddle/fluid/contrib/slim/prune/pruner.py +++ b/python/paddle/fluid/contrib/slim/prune/pruner.py @@ -42,7 +42,7 @@ class StructurePruner(Pruner): pruning_axis(dict): The key is the name of parameter to be pruned, '*' means all the parameters. The value is the axis to be used. Given a parameter - with shape [3, 4], the result of pruning 50% on aixs 1 + with shape [3, 4], the result of pruning 50% on axis 1 is a parameter with shape [3, 2]. criterions(dict): The key is the name of parameter to be pruned, '*' means all the parameters. 
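The pruner.py hunk above describes pruning_axis with a concrete case: pruning 50% of a [3, 4] parameter along axis 1 leaves a [3, 2] parameter. A minimal numpy sketch of that shape arithmetic follows; the L1-norm ranking used to decide which slices survive is assumed purely for this illustration and is not the criterion StructurePruner actually applies:

    import numpy as np

    def prune_along_axis(param, ratio, axis):
        # Keep (1 - ratio) of the slices along `axis`, drop the rest.
        num_keep = int(param.shape[axis] * (1.0 - ratio))
        # Score each slice by its L1 norm (illustrative criterion only).
        reduce_axes = tuple(i for i in range(param.ndim) if i != axis)
        scores = np.abs(param).sum(axis=reduce_axes)
        keep = np.sort(np.argsort(scores)[::-1][:num_keep])
        return np.take(param, keep, axis=axis)

    w = np.random.rand(3, 4).astype("float32")
    print(prune_along_axis(w, 0.5, axis=1).shape)  # (3, 2)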
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 9edf473546f..fa6a6e60ae3 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -666,10 +666,10 @@ class QuantizationFreezePass(object): quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']): """ The freeze pass is used to adjust the quantize operator order, for example: - 1) `activation -> quant -> dequant -> conv2d` will be freezed into + 1) `activation -> quant -> dequant -> conv2d` will be frozen into `activation -> quant -> conv2d -> dequant` - 2) `weight -> quant -> dequant -> conv2d` will be freezed into `weight -> conv2d`, - and weight will be sacled offline. + 2) `weight -> quant -> dequant -> conv2d` will be frozen into `weight -> conv2d`, + and weight will be scaled offline. Args: scope(fluid.Scope): scope is used to get the weight tensor values. @@ -994,8 +994,8 @@ class ConvertToInt8Pass(object): def apply(self, graph): """ - Convert weights' tpye of the graph. After that, the data type of the - graph weigths is int8_t. + Convert weights' type of the graph. After that, the data type of the + graph weights is int8_t. Args: graph(IrGraph): the applied graph. @@ -1065,7 +1065,7 @@ class ConvertToInt8Pass(object): class TransformForMobilePass(object): def __init__(self): """ - This pass is used to convert the freezed graph for paddle-mobile execution. + This pass is used to convert the frozen graph for paddle-mobile execution. """ self._fake_quant_op_names = _fake_quant_op_list self._fake_dequant_op_names = _fake_dequant_op_list diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py index d27b808438d..7c7ed0972c7 100644 --- a/python/paddle/fluid/contrib/trainer.py +++ b/python/paddle/fluid/contrib/trainer.py @@ -673,11 +673,11 @@ def save_checkpoint(executor, main_program and then saves these variables to the `checkpoint_dir` directory. - In the training precess, we generally save a checkpoint in each + In the training process, we generally save a checkpoint in each iteration. So there might be a lot of checkpoints in the `checkpoint_dir`. To avoid them taking too much disk space, the `max_num_checkpoints` are introduced to limit the total number of - checkpoints. If the number of existing checkpints is greater than + checkpoints. If the number of existing checkpoints is greater than the `max_num_checkpoints`, oldest ones will be scroll deleted. A variable is a checkpoint variable and will be saved if it meets @@ -689,7 +689,7 @@ def save_checkpoint(executor, Args: executor(Executor): The executor to run for save checkpoint. checkpoint_dir(str): The folder where to save checkpoints. - trainer_id(int): currect trainer id, if id is equal to 0, the trainer + trainer_id(int): current trainer id, if id is equal to 0, the trainer is chief. trainer_args(dict|None): Current training arguments. Such as 'epoch_id' and 'step_id'. @@ -772,7 +772,7 @@ def load_checkpoint(executor, main_program and then try to load these variables from the `checkpoint_dir` directory. - In the training precess, we generally save a checkpoint in each + In the training process, we generally save a checkpoint in each iteration. 
So there are more than one checkpoint in the `checkpoint_dir` (each checkpoint has its own sub folder), use `serial` to specify which serial of checkpoint you would like to @@ -867,7 +867,7 @@ def _load_persist_vars_without_grad(executor, has_model_dir=False): """ This function filters out all checkpoint variables from the give - program and then trys to load these variables from the given directory. + program and then tries to load these variables from the given directory. A variable is a checkpoint variable if it meets all following conditions: @@ -898,7 +898,7 @@ def _load_persist_vars_without_grad(executor, # In this example, `_load_persist_vars_without_grad` function # will first filters out all checkpoint variables in the default - # main program, and then trys to load these variables form the + # main program, and then tries to load these variables form the # folder "./my_paddle_model/__model__". """ @@ -1135,12 +1135,12 @@ def _is_checkpoint_var(var): def _make_chekcpoint_dirs(dirs): """ - _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it. + _make_chekcpoint_dirs will makedir local directory directly, when the directory is exist, it will ignore it. """ assert dirs is not None if os.path.isfile(dirs): - raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) + raise OSError(errno.ENOTDIR, "dirs path should be a Directory.", dirs) if not os.path.isdir(dirs): try: diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py index 962a5653f61..2de4f82bd14 100644 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -312,9 +312,9 @@ class HDFSClient(object): @staticmethod def make_local_dirs(local_path): """ - create a directiory local, is same to mkdir + create a directory local, is same to mkdir Args: - local_path: local path that wants to create a directiory. + local_path: local path that wants to create a directory. """ try: os.makedirs(local_path) diff --git a/python/paddle/fluid/contrib/utils/lookup_table_utils.py b/python/paddle/fluid/contrib/utils/lookup_table_utils.py index 2d18a9a8620..8552bc8fc10 100644 --- a/python/paddle/fluid/contrib/utils/lookup_table_utils.py +++ b/python/paddle/fluid/contrib/utils/lookup_table_utils.py @@ -137,7 +137,7 @@ def load_persistables_for_increment(dirname, executor, program, lookup_table_var, lookup_table_var_path): """ WARNING: this function will only be used for distributed training with distributed lookup table. - for increment trainning, the pserver will not only load dense variables, + for increment training, the pserver will not only load dense variables, but also load the suitable lookup table var. Because of sliced lookup table var with HASH, we must load the correct sliced var. @@ -417,7 +417,7 @@ def get_inference_model(main_program, feeded_var_names, target_vars): Args: main_program(Program|None): The original program, which will be pruned to - build the inference model. If is setted None, + build the inference model. If is set None, the default main program will be used. Default: None. feeded_var_names(list[str]): Names of variables that need to be feeded data diff --git a/python/paddle/fluid/data.py b/python/paddle/fluid/data.py index 6376b4f7749..179c3b07dbe 100644 --- a/python/paddle/fluid/data.py +++ b/python/paddle/fluid/data.py @@ -54,7 +54,7 @@ def data(name, shape, dtype='float32', lod_level=0): for more details. 
shape (list|tuple): List|Tuple of integers declaring the shape. You can set "None" at a dimension to indicate the dimension can be of any - size. For example, it is useful to set changable batch size as "None" + size. For example, it is useful to set changeable batch size as "None" dtype (np.dtype|VarType|str, optional): The type of the data. Supported dtype: bool, float16, float32, float64, int8, int16, int32, int64, uint8. Default: float32 @@ -75,7 +75,7 @@ def data(name, shape, dtype='float32', lod_level=0): # User can only feed data of the same shape to x x = fluid.data(name='x', shape=[3, 2, 1], dtype='float32') - # Creates a variable with changable batch size. + # Creates a variable with changeable batch size. # Users can feed data of any batch size into y, # but size of each data sample has to be [2, 1] y = fluid.data(name='y', shape=[None, 2, 1], dtype='float32') diff --git a/python/paddle/fluid/data_feed_desc.py b/python/paddle/fluid/data_feed_desc.py index eca9543c60a..4878c25fde5 100644 --- a/python/paddle/fluid/data_feed_desc.py +++ b/python/paddle/fluid/data_feed_desc.py @@ -53,7 +53,7 @@ class DataFeedDesc(object): data_feed = fluid.DataFeedDesc('data.proto') However, users usually shouldn't care about the message format; instead, - they are encouragd to use :code:`Data Generator` as a tool to generate a + they are encouraged to use :code:`Data Generator` as a tool to generate a valid data description, in the process of converting their raw log files to training files acceptable to AsyncExecutor. diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 6b49f7a8b4a..70a429f65c5 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -339,10 +339,10 @@ class DataFeeder(object): """ Similar with feed function, feed_parallel is used with multiple devices (CPU|GPU). Here :code:`iterable` is a list of python generators. The data return by each - generator in the list will be fed into a seperate device. + generator in the list will be fed into a separate device. Parameters: - iterable (list|tuple): list of user-defined python geneators. The element + iterable (list|tuple): list of user-defined python generators. The element number should match the :code:`num_places`. num_places (int, optional): the number of devices. If not provided (None), all available devices on the machine will be used. Default None. @@ -379,7 +379,7 @@ class DataFeeder(object): exe.run(fluid.default_startup_program()) program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(places=places) - # print sample feed_parallel r resultt + # print sample feed_parallel r result # for item in list(feeder.feed_parallel([generate_reader(5, 0, 1), generate_reader(3, 10, 2)], 2)): # print(item['x']) # print(item['y']) @@ -433,7 +433,7 @@ class DataFeeder(object): Parameters: reader(generator): a user defined python generator used to get :code:`mini-batch` of data. - A :code:`mini-batch` can be regarded as a python generator that returns batchs of input + A :code:`mini-batch` can be regarded as a python generator that returns batches of input entities, just like the below :code:`_mini_batch` in the code example. multi_devices(bool): indicate whether to use multiple devices or not. 
num_places(int, optional): if :code:`multi_devices` is True, you can specify the number diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index b10ebcaa47e..60dd4eb3831 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -100,7 +100,7 @@ class DatasetBase(object): Args: record_candidate_size(int): size of instances candidate to shuffle one slot - fea_eval(bool): wheather enable fea eval mode to enable slots shuffle. + fea_eval(bool): whether enable fea eval mode to enable slots shuffle. default is True. Examples: @@ -822,7 +822,7 @@ class BoxPSDataset(InMemoryDataset): def wait_preload_done(self): """ - Wait async proload done + Wait async preload done Wait Until Feed Pass Done Examples: .. code-block:: python diff --git a/python/paddle/fluid/debugger.py b/python/paddle/fluid/debugger.py index ef07dcebcab..9110b8daf38 100644 --- a/python/paddle/fluid/debugger.py +++ b/python/paddle/fluid/debugger.py @@ -338,7 +338,7 @@ def run_fast_nan_inf_debug(executor, use_program_cache=False, dump_core=True): """ - Run a program by the given executor. Catch the exception of NAN and INF, and save persistbales into the dumped core. + Run a program by the given executor. Catch the exception of NAN and INF, and save persistables into the dumped core. """ assert (executor is not None) diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 902daf1a4ac..61e508ea72e 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -59,7 +59,7 @@ class DownpourSGD(object): """ DownpounSGD is a distributed optimizer so that user can call minimize to generate backward - operators and optimization operators within minmize function + operators and optimization operators within minimize function Args: loss(Variable): loss variable defined by user startup_program(Program): startup program that defined by user diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index 19d661c660e..e89a1b71dd5 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -110,7 +110,7 @@ class PaddlePSInstance(object): def gather_ips(self): """ - Return all servers and workers ip throught mpi allgather + Return all servers and workers ip through mpi allgather """ self._ips = self.dh.comm.allgather(self._ip) return self._ips diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 45e8959ef82..91bd7836e19 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -88,9 +88,9 @@ class PiecewiseDecay(LearningRateDecay): boundaries(list): A list of steps numbers. The type of element in the list is python int. values(list): A list of learning rate values that will be picked during different step boundaries. The type of element in the list is python float. - begin(int): The begin step to initilize the global_step in the description above. + begin(int): The begin step to initialize the global_step in the description above. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. 
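The PiecewiseDecay hunk above documents the boundaries/values pair: the learning rate picked for a step depends on which boundary interval the step has reached. A small pure-Python sketch of that lookup, assuming the usual convention that values carries one more entry than boundaries:

    def piecewise_lr(global_step, boundaries, values):
        # Return values[i] for the first boundary the step has not reached,
        # and the last value once every boundary has been passed.
        for i, boundary in enumerate(boundaries):
            if global_step < boundary:
                return values[i]
        return values[-1]

    boundaries = [10000, 20000]
    values = [1.0, 0.5, 0.1]
    print(piecewise_lr(500, boundaries, values))    # 1.0  (step < 10000)
    print(piecewise_lr(12000, boundaries, values))  # 0.5  (10000 <= step < 20000)
    print(piecewise_lr(25000, boundaries, values))  # 0.1  (step >= 20000)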
@@ -158,7 +158,7 @@ class NaturalExpDecay(LearningRateDecay): default value is False. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. @@ -238,7 +238,7 @@ class ExponentialDecay(LearningRateDecay): default value is False. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. @@ -312,7 +312,7 @@ class InverseTimeDecay(LearningRateDecay): default value is False. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be 'float32', 'float64'. The default value is 'float32'. @@ -393,7 +393,7 @@ class PolynomialDecay(LearningRateDecay): cycle(bool, optional): If set true, decay the learning rate every decay_steps. The default value is False. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. @@ -471,7 +471,7 @@ class CosineDecay(LearningRateDecay): epochs(int): The number of epochs. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. @@ -528,7 +528,7 @@ class NoamDecay(LearningRateDecay): it's a tensor with shape [1] and the data type can be int32 or int64. The type can also be python int. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. @@ -592,7 +592,7 @@ class LinearLrWarmup(LearningRateDecay): end_lr (float): Final learning rate of warm up. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. 
- The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 64119a89ec5..c9ede3bdefe 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -50,7 +50,7 @@ class Conv2D(layers.Layer): C will equal the number of input feature map divided by the groups. Please refer to UFLDL's `convolution `_ - for more detials. + for more details. If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. @@ -1003,7 +1003,7 @@ class BatchNorm(layers.Layer): Parameters: num_channels(int): Indicate the number of channels of the input ``Tensor``. - act(str, optional): Activation to be applied to the output of batch normalizaiton. Default: None. + act(str, optional): Activation to be applied to the output of batch normalization. Default: None. is_test (bool, optional): A flag indicating whether it is in test phrase or not. Default: False. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. @@ -1242,7 +1242,7 @@ class Embedding(layers.Layer): default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition, user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs to be transformed into numpy format, and the shape of local word - vector shoud be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` + vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` is used to load custom or pre-trained word vectors. See code example 2 for details. dtype(np.dtype|core.VarDesc.VarType|str): It refers to the data type of output Tensor. It must be "float32" or "float64". Default: "float32". @@ -1382,7 +1382,7 @@ class LayerNorm(layers.Layer): omitted. If :attr:`shift` is True and :attr:`param_attr` is None, a default :code:`ParamAttr` would be added as bias. The :attr:`bias_attr` is initialized as 0 if it is added. Default: None. - act(str, optional): Activation to be applied to the output of layer normalizaiton. + act(str, optional): Activation to be applied to the output of layer normalization. Default: None. dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". @@ -1435,7 +1435,7 @@ class LayerNorm(layers.Layer): default_initializer=Constant(1.0)) else: if self._param_attr: - logging.warn("param_attr are only avaliable with scale is True") + logging.warn("param_attr are only available with scale is True") if self._shift: assert self._bias_attr is not False @@ -1446,7 +1446,7 @@ class LayerNorm(layers.Layer): is_bias=True) else: if self._bias_attr: - logging.warn("bias_attr are only avaliable with shift is True") + logging.warn("bias_attr are only available with shift is True") def forward(self, input): input_shape = list(input.shape) @@ -1702,7 +1702,7 @@ class NCE(layers.Layer): will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. 
num_neg_samples (int, optional): The number of negative classes. The default value is 10. - sampler (str, optional): The sampler used to sample class from negtive classes. + sampler (str, optional): The sampler used to sample class from negative classes. It can be 'uniform', 'log_uniform' or 'custom_dist'. default: 'uniform'. custom_dist (float[], optional): A float[] with size=num_total_classes. @@ -2544,7 +2544,7 @@ class GroupNorm(layers.Layer): bias_attr(ParamAttr, optional): The parameter attribute for the learnable bias :math:`b`. If it is set to False, no bias will be added to the output units. If it is set to None, the bias is initialized zero. Default: None. - act(str, optional): Activation to be applied to the output of group normalizaiton. Default: None. + act(str, optional): Activation to be applied to the output of group normalization. Default: None. data_layout(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW. Returns: @@ -2640,7 +2640,7 @@ class SpectralNorm(layers.Layer): and W is the product result of remaining dimensions. Step 2: - :attr:`power_iters` shoule be a positive interger, do following + :attr:`power_iters` should be a positive integer, do following calculations with U and V for :attr:`power_iters` rounds. .. math:: diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 1390919151a..b7e87ab8b6b 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -27,7 +27,7 @@ def monkey_patch_varbase(): def set_value(self, value): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Set a new value for this Variable. @@ -76,7 +76,7 @@ def monkey_patch_varbase(): def backward(self, backward_strategy=None): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Run backward of current Graph which starts from current Variable @@ -116,13 +116,13 @@ def monkey_patch_varbase(): self._run_backward(backward_strategy, framework._dygraph_tracer()) else: raise ValueError( - "Variable.backward() is only avaliable in DyGraph mode") + "Variable.backward() is only available in DyGraph mode") @framework.dygraph_only def gradient(self): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Get the Gradient of Current Variable diff --git a/python/paddle/fluid/dygraph_grad_clip.py b/python/paddle/fluid/dygraph_grad_clip.py index db7a76615f8..c90795e09f9 100644 --- a/python/paddle/fluid/dygraph_grad_clip.py +++ b/python/paddle/fluid/dygraph_grad_clip.py @@ -55,7 +55,7 @@ class GradClipByValue(GradClipBase): Args: max_value (float): The maximum value to clip by. min (float, optional): The minimum value to clip by. if not set by user, \ - will be set to -max_value(max_value MUST be postive) by framework. + will be set to -max_value(max_value MUST be positive) by framework. Examples: .. code-block:: python diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index b2e0fc28ed7..90979c6b839 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -164,8 +164,8 @@ def dimension_is_compatible_with(first, second): A dimension is compatible with the other if: 1. The length of the dimensions are same. - 2. Each non-negative number of the two dimentions are same. - 3. 
For negative number or 'None' in a dimention, it means unknown so it + 2. Each non-negative number of the two dimensions are same. + 3. For negative number or 'None' in a dimension, it means unknown so it is compatible with any number. Args: @@ -200,8 +200,8 @@ def check_feed_shape_type(var, feed, num_places=1): A dimension is compatible with the other if: 1. The length of the dimensions are same. - 2. Each non-negative number of the two dimentions are same. - 3. For negative number or 'None' in a dimention, it means unknown so it + 2. Each non-negative number of the two dimensions are same. + 3. For negative number or 'None' in a dimension, it means unknown so it is compatible with any number. Args: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 7b9919d2ab7..17512989759 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -458,7 +458,7 @@ def name_scope(prefix=None): if in_dygraph_mode(): yield else: - assert prefix, "namescope prefix cannot be empty." + assert prefix, "namescope prefix canot be empty." global _name_scope _name_scope = _name_scope.child(prefix) yield @@ -816,7 +816,7 @@ class Variable(object): There are many kinds of variables. Each kind of them has its own attributes and usages. Please refer to the `framework.proto `_ for details. - Most of a Variable's member variables can be setted to be None. It mean + Most of a Variable's member variables can be set to be None. It mean it is not available or will be specified later. Examples: @@ -949,7 +949,7 @@ class Variable(object): def detach(self): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Returns a new Variable, detached from the current graph. @@ -979,7 +979,7 @@ class Variable(object): def numpy(self): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Returns a numpy array shows the value of current :ref:`api_guide_Variable_en` @@ -1011,7 +1011,7 @@ class Variable(object): def set_value(self, value): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Set a new value for this Variable. @@ -1042,7 +1042,7 @@ class Variable(object): def backward(self, backward_strategy=None): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Run backward of current Graph which starts from current Variable @@ -1080,7 +1080,7 @@ class Variable(object): def gradient(self): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Get the Gradient of Current Variable @@ -1128,7 +1128,7 @@ class Variable(object): def clear_gradient(self): """ **Notes**: - **1. This API is ONLY avaliable in Dygraph mode** + **1. This API is ONLY available in Dygraph mode** **2. Use it only Variable has gradient, normally we use this for Parameters since other temporal Variable will be deleted by Python's GC** @@ -1495,7 +1495,7 @@ class Variable(object): if length < 0: raise ValueError("length should not be negative") if step == 0: - raise ValueError("slice step cannot be zero") + raise ValueError("slice step canot be zero") # Find lower and upper bounds for start and stop. lower = -1 if step < 0 else 0 @@ -2965,7 +2965,7 @@ class IrVarNode(IrNode): shape(list): shape to be set. """ assert self.node.var() is not None, \ - "The node variable description cannot be None." 
+ "The node variable description canot be None." self.node.var().set_shape(shape) def persistable(self): @@ -2976,7 +2976,7 @@ class IrVarNode(IrNode): bool: indicate whether the variable is persistable. """ assert self.node.var() is not None, \ - "The node variable description cannot be None." + "The node variable description canot be None." return self.node.var().persistable() def type(self): @@ -2987,7 +2987,7 @@ class IrVarNode(IrNode): core.VarDesc.VarType: the variable type. """ assert self.node.var() is not None, \ - "The node variable description cannot be None." + "The node variable description canot be None." return self.node.var().type() def dtype(self): @@ -2998,7 +2998,7 @@ class IrVarNode(IrNode): core.VarDesc.VarType: the variable data type. """ assert self.node.var() is not None, \ - "The node variable description cannot be None." + "The node variable description canot be None." return self.node.var().dtype() def shape(self): @@ -3009,7 +3009,7 @@ class IrVarNode(IrNode): list: the variable shape. """ assert self.node.var() is not None, \ - "The node variable description cannot be None." + "The node variable description canot be None." return self.node.var().shape() @property @@ -3059,7 +3059,7 @@ class IrOpNode(IrNode): new_input_name(str): the new input name. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." self.node.op()._rename_input(old_input_name, new_input_name) def rename_output(self, old_output_name, new_output_name): @@ -3071,7 +3071,7 @@ class IrOpNode(IrNode): new_output_name(str): the new output name. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." print("op: {}, old: {}, new: {}\n".format(self.node.op().type( ), old_output_name, new_output_name)) self.node.op()._rename_output(old_output_name, new_output_name) @@ -3087,7 +3087,7 @@ class IrOpNode(IrNode): list(str): the argument name list. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." return self.node.op().input(name) def output(self, name): @@ -3101,7 +3101,7 @@ class IrOpNode(IrNode): list(str): the argument name list. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." return self.node.op().output(name) def set_type(self, new_type): @@ -3112,7 +3112,7 @@ class IrOpNode(IrNode): new_type(str): new operator type to be set. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." return self.node.op().set_type(new_type) def set_attr(self, name, val): @@ -3130,7 +3130,7 @@ class IrOpNode(IrNode): Update the value of the op desc's attribute by attribute's name. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." desc = self.node.op() if isinstance(val, Block): desc.set_block_attr(name, val.desc) @@ -3151,7 +3151,7 @@ class IrOpNode(IrNode): list(str): input arguments' names of this op node. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." 
return self.node.op().input_arg_names() def output_arg_names(self): @@ -3162,7 +3162,7 @@ class IrOpNode(IrNode): list(str): output arguments' names of this op node. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." return self.node.op().output_arg_names() @property @@ -3318,7 +3318,7 @@ class IrGraph(object): op_type(str): the type of the operator node. attrs(dict): the attributes of the operator node. inputs(dict): the inputs of the operator node. - outputs(dict): the outpus of the operator node. + outputs(dict): the outputs of the operator node. Returns: IrOpNode: the created operator node. @@ -3459,7 +3459,7 @@ class IrGraph(object): """ Perform the topology sort operation on the graph. - Notes: the `graph` cannot contain a circle. + Notes: the `graph` canot contain a circle. Returns: list(IrNode): nodes in topology order. @@ -3805,9 +3805,9 @@ class Program(object): prog = fluid.default_main_program() prog_string = prog.to_string(throw_on_error=True, with_details=False) - print("program string without detial: {}".format(prog_string)) + print("program string without detail: {}".format(prog_string)) prog_string_with_detail = prog.to_string(throw_on_error=True, with_details=True) - print("program string with detial: {}".format(prog_string_with_detail)) + print("program string with detail: {}".format(prog_string_with_detail)) """ assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) @@ -4606,7 +4606,7 @@ class Parameter(Variable): Default: {'learning_rate': 1.0} regularizer(WeightDecayRegularizer): The Regularizer which will be applied on the parameter. Default: None - gradient_clip_attr(BaseGradientClipAttr): The gradint clip strategy + gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy which will be applied on the parameter. Default: None do_model_average(bool): True if the model average strategy will be applied on this parameter. @@ -4712,7 +4712,7 @@ class ParamBase(core.VarBase): Default: {'learning_rate': 1.0} regularizer(WeightDecayRegularizer): The Regularizer which will be applied on the ParamBase. Default: None - gradient_clip_attr(BaseGradientClipAttr): The gradint clip strategy + gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy which will be applied on the ParamBase. Default: None do_model_average(bool): True if the model average strategy will be applied on this ParamBase. @@ -5022,7 +5022,7 @@ def load_op_library(lib_filename): Load a dynamic library, including custom operators and kernels. When library is loaded, ops and kernels registered in the library will be available in PaddlePaddle main process. - Please note, the type of custom operators cann't have the same type + Please note, the type of custom operators can't have the same type with the existing operators in the framework. Args: diff --git a/python/paddle/fluid/incubate/data_generator/__init__.py b/python/paddle/fluid/incubate/data_generator/__init__.py index 77c3fc6bf2d..8d31a68e808 100644 --- a/python/paddle/fluid/incubate/data_generator/__init__.py +++ b/python/paddle/fluid/incubate/data_generator/__init__.py @@ -143,7 +143,7 @@ class DataGenerator(object): ''' Further processing the output of the process() function rewritten by user, outputting data that can be directly read by the datafeed,and - updating proto_info infomation. + updating proto_info information. Args: line(str): the output of the process() function rewritten by user. 
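The executor.py hunk earlier in this patch spells out when two shapes are treated as compatible: same rank, equal non-negative entries, and a negative or None entry standing for an unknown size that matches anything. A short sketch of that rule, written only as an illustration rather than the actual dimension_is_compatible_with implementation:

    def shapes_compatible(first, second):
        # Rule 1: the ranks must match.
        if len(first) != len(second):
            return False
        for a, b in zip(first, second):
            # Rule 3: negative or None means unknown, compatible with anything.
            if a is None or b is None or a < 0 or b < 0:
                continue
            # Rule 2: known sizes must be equal.
            if a != b:
                return False
        return True

    print(shapes_compatible([None, 2, 1], [8, 2, 1]))  # True, None matches 8
    print(shapes_compatible([3, 2, 1], [3, 2]))        # False, ranks differ
    print(shapes_compatible([3, 2, 1], [3, 4, 1]))     # False, 2 != 4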
@@ -243,7 +243,7 @@ class MultiSlotStringDataGenerator(DataGenerator): ''' Further processing the output of the process() function rewritten by user, outputting data that can be directly read by the MultiSlotDataFeed, - and updating proto_info infomation. + and updating proto_info information. The input line will be in this format: >>> [(name, [str(feasign), ...]), ...] @@ -284,7 +284,7 @@ class MultiSlotDataGenerator(DataGenerator): ''' Further processing the output of the process() function rewritten by user, outputting data that can be directly read by the MultiSlotDataFeed, - and updating proto_info infomation. + and updating proto_info information. The input line will be in this format: >>> [(name, [feasign, ...]), ...] diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index 6600ed9aa4e..bada19abcc3 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -542,8 +542,8 @@ class PaddleCloudRoleMaker(RoleMakerBase): class GeneralRoleMaker(RoleMakerBase): """ This role maker is for general use, you can set os.environ to customize: - PADDLE_PSERVERS_IP_PORT_LIST : all pservers' ip:port, seperated by ',' - PADDLE_TRAINER_ENDPOINTS : all trainers' ip:port, seperated by ',' + PADDLE_PSERVERS_IP_PORT_LIST : all pservers' ip:port, separated by ',' + PADDLE_TRAINER_ENDPOINTS : all trainers' ip:port, separated by ',' TRAINING_ROLE : TRAINER or PSERVER PADDLE_TRAINER_ID : current trainer id (only for trainer), it is index in PADDLE_TRAINER_ENDPOINTS diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py index e33662cf082..5150d108479 100644 --- a/python/paddle/fluid/incubate/fleet/collective/__init__.py +++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py @@ -220,7 +220,7 @@ class CollectiveOptimizer(DistributedOptimizer): def _check_collective_mode(self, main_program, optimizer, strategy): """ - Check the conflict condtions. + Check the conflict conditions. """ if strategy.use_local_sgd: strategy.mode = "collective" @@ -392,7 +392,7 @@ class CollectiveOptimizer(DistributedOptimizer): tuple: (optimize_ops, params_grads) which are, list of operators appended; and list of (param, grad) Variables pair for optimization. Note that in parameter server mode, a worker will not get anything about optimize_os - Because optmizer algorithms run on pserver side. We will make this usable in pserver + Because optimizer algorithms run on pserver side. We will make this usable in pserver process, but currently the optimization part is written into Fleet(). A user does not need to care about how to startup a pserver node. """ diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index ec5f6de81c9..d6ea97fc57b 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -618,7 +618,7 @@ class DownpourOptimizer(DistributedOptimizer): """ minimize a program through loss, loss can be a list in DistributedOptimizer. Note that in parameter server mode, a worker will not get anything about optimize_os - Because optmizer algorithms run on pserver side. We will make this usable in pserver + Because optimizer algorithms run on pserver side. 
We will make this usable in pserver process, but currently the optimization part is written into Fleet(). A user does not need to care about how to startup a pserver node. diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index c5e105cc8d6..1d119039f12 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -156,7 +156,7 @@ class DistributedAdam(DistributedOptimizerImplBase): """ DownpounSGD is a distributed optimizer so that user can call minimize to generate backward - operators and optimization operators within minmize function + operators and optimization operators within minimize function Args: loss(Variable): loss variable defined by user startup_program(Program): startup program that defined by user diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index 4a1fd20afc0..2b46459280b 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -901,7 +901,7 @@ class FleetUtil(object): hadoop_fs_name(str): hadoop fs name hadoop_fs_ugi(str): hadoop fs ugi hadoop_home(str): hadoop home, default is "$HADOOP_HOME" - save_combine(bool): whether to save in a file or seperate files, + save_combine(bool): whether to save in a file or separate files, default is True Examples: @@ -990,7 +990,7 @@ class FleetUtil(object): hadoop_fs_ugi(str): hadoop fs ugi hadoop_home(str): hadoop home, default is "$HADOOP_HOME" var_names(list): save persistable var names, default is None - save_combine(bool): whether to save in a file or seperate files, + save_combine(bool): whether to save in a file or separate files, default is True Examples: @@ -1300,7 +1300,7 @@ class FleetUtil(object): from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil fleet_util = FleetUtil() metric_list = fleet_util.get_global_metrics(myscope, - stat_pos.nane, + stat_pos.name, stat_neg.name, local_sqrerr.name, local_abserr.name, @@ -1487,7 +1487,7 @@ class FleetUtil(object): from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil fleet_util = FleetUtil() fleet_util.print_global_metrics(myscope, - stat_pos.nane, + stat_pos.name, stat_neg.name, local_sqrerr.name, local_abserr.name, diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py index 23a22531a45..c16d7e3cc45 100644 --- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py +++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py @@ -274,10 +274,10 @@ class HDFSClient(object): @staticmethod def make_local_dirs(local_path): """ - create a directiory local, is same to mkdir + create a directory local, is same to mkdir Args: - local_path(str): local path that wants to create a directiory. + local_path(str): local path that wants to create a directory. 
""" try: os.makedirs(local_path) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index b14234b5998..dfea275d7b8 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -782,7 +782,7 @@ class BilinearInitializer(Initializer): super(BilinearInitializer, self).__init__() def __call__(self, var, block): - """Add biliear initialization ops for a variable + """Add bilinear initialization ops for a variable Args: var (Variable): Variable that needs to be initialized. diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py index 458f386919b..bf771d801e6 100644 --- a/python/paddle/fluid/input.py +++ b/python/paddle/fluid/input.py @@ -204,7 +204,7 @@ def embedding(input, default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition, user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs to be transformed into numpy format, and the shape of local word - vector shoud be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` + vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` is used to load custom or pre-trained word vectors. See code example 2 for details. dtype(str|core.VarDesc.VarType): It refers to the data type of output Tensor. It must be float32 or float64. Default: float32. @@ -219,7 +219,7 @@ def embedding(input, import numpy as np data = fluid.data(name='x', shape=[None, 10], dtype='int64') - # exampel 1 + # example 1 emb_1 = fluid.embedding(input=data, size=[128, 64]) # example 2: load custom or pre-trained word vectors diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 42366aad88e..201cc61e4d4 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -44,7 +44,7 @@ class SimpleLayer(Layer): def run_check(): - ''' intall check to verify if install is success + ''' install check to verify if install is success This func should not be called only if you need to verify installation ''' diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index dd312bce117..1830950866c 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -219,7 +219,7 @@ def save_vars(executor, variables that make `predicate(variable) == True`. The first way has a higher priority. The `dirname` is used to specify the folder where to save variables. - If you prefer to save variables in separate files in the `dirname` floder, + If you prefer to save variables in separate files in the `dirname` folder, do not set `filename`. If you prefer to save all variables in a single file, use `filename` to specify it. @@ -435,7 +435,7 @@ def _save_distributed_persistables(executor, dirname, main_program): def __save_remote_params(executor, dirname, remote_params_map): """ - recive params on pserver through rpc. + receive params on pserver through rpc. if the params are be sliced, will concat them to one, then save it. """ if not remote_params_map: @@ -571,7 +571,7 @@ def save_persistables(executor, dirname, main_program=None, filename=None): You can refer to :ref:`api_guide_executor_en` for more details. dirname(str): The saving directory path. - main_program(Program, optional): The program whose persistbale variables will + main_program(Program, optional): The program whose persistable variables will be saved. 
You can refer to :ref:`api_guide_Program_en` for more details. If it is None, the default main program will @@ -835,7 +835,7 @@ def load_persistables(executor, dirname, main_program=None, filename=None): """ This API filters out all variables with ``persistable==True`` from the given ``main_program`` and then tries to load these variables from the - directory ``dirnameme`` or the file ``filename``. + directory ``dirname`` or the file ``filename``. Use the ``dirname`` to specify the directory where persistable variables (refer to :ref:`api_guide_model_save_reader_en`) were saved. If variables @@ -846,7 +846,7 @@ def load_persistables(executor, dirname, main_program=None, filename=None): executor(Executor): The executor used for loading persistable variables. See :ref:`api_guide_executor_en` for more details about it. dirname(str): The directory path. - main_program(Program, optional): The program whose persistbale variables will + main_program(Program, optional): The program whose persistable variables will be loaded. If it is None, the ``default_main_program`` will be used automatically. See :ref:`api_guide_Program_en` for more about ``Program``. @@ -1050,14 +1050,14 @@ def save_inference_model(dirname, executor(Executor): The executor that saves the inference model. You can refer to :ref:`api_guide_executor_en` for more details. main_program(Program, optional): The original program, which will be pruned to - build the inference model. If is setted None, + build the inference model. If is set None, the global default :code:`_main_program_` will be used. Default: None. model_filename(str, optional): The name of file to save the inference program - itself. If is setted None, a default filename + itself. If is set None, a default filename :code:`__model__` will be used. params_filename(str, optional): The name of file to save all related parameters. - If it is setted None, parameters will be saved + If it is set None, parameters will be saved in separate files . export_for_deployment(bool): If True, programs are modified to only support direct inference deployment. Otherwise, @@ -1086,7 +1086,7 @@ def save_inference_model(dirname, path = "./infer_model" - # User defined network, here a softmax regresssion example + # User defined network, here a softmax regression example image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32') label = fluid.data(name='label', shape=[None, 1], dtype='int64') feeder = fluid.DataFeeder(feed_list=[image, label], place=fluid.CPUPlace()) @@ -1408,7 +1408,7 @@ def get_parameter_value_by_name(name, executor, program=None): Raises: TypeError: If given `name` is not an instance of basestring. TypeError: If the parameter with the given name doesn't exist. - AssertionError: If there is a varibale named `name` in the + AssertionError: If there is a variable named `name` in the given program but it is not a Parameter. Examples: diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 14dda7a0ea4..4385e64583b 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -215,7 +215,7 @@ def Print(input, Args: input (Variable): A Tensor to print. summarize (int): Number of elements in the tensor to be print. If it's - vaule is -1, then all elements in the tensor will be print. + value is -1, then all elements in the tensor will be print. message (str): A string message to print as a prefix. first_n (int): Only log `first_n` number of times.
print_tensor_name (bool, optional): Print the tensor name. Default: True. @@ -703,7 +703,7 @@ class StaticRNN(object): Args: mem(Variable): the memory variable. var(Variable): the plain variable generated in RNN block, used to update memory. - var and mem should hava same dims and data type. + var and mem should have same dims and data type. Returns: None @@ -1019,7 +1019,7 @@ def lod_rank_table(x, level=0): of LoD, this layer creates a LodRankTable object. A LoDRankTable object contains a list of bi-element tuples. Each tuple consists of an index and a length, both of which are int type. Refering to specified level of LoD, - the index is the sequence index number and the length representes the + the index is the sequence index number and the length represents the sequence length. Please note that the list is ranked in descending order by the length. The following is an example: @@ -1179,7 +1179,7 @@ def increment(x, value=1.0, in_place=True): Notice that the number of elements in :attr:`x` must be equal to 1. Parameters: - x (Variable): A tensor that must alway contain only one element, its data type supports + x (Variable): A tensor that must always contain only one element, its data type supports float32, float64, int32 and int64. value (float, optional): The amount to increment the data of :attr:`x`. Default: 1.0. in_place (bool, optional): Whether the OP should be performed in-place. Default: True. @@ -1668,7 +1668,7 @@ def array_length(array): """ This OP is used to get the length of the input array :ref:`api_fluid_LoDTensorArray` . It can be used together with :ref:`api_fluid_layers_array_read` , :ref:`api_fluid_layers_array_write` , - :ref:`api_fluid_layers_While` OP to traverse, read and wirte LoDTensorArray. + :ref:`api_fluid_layers_While` OP to traverse, read and write LoDTensorArray. Args: array (LoDTensorArray): The input array that will be used to compute the length. @@ -1749,7 +1749,7 @@ class ConditionalBlock(object): Args: inputs (Variable): bool conditions. - is_scalar_condition (bool): whether the branch is controled by a scalar. + is_scalar_condition (bool): whether the branch is controlled by a scalar. name(str): name of this ConditionalBlock. Examples: @@ -2539,7 +2539,7 @@ class DynamicRNN(object): The total number of time steps is determined by the longest sequence. DynamicRNN will not pad all sequences to the same length, instead it will sort the sequences internally by the sequence length in descending order. - The input sequences will be shrinked because only sequences of which the + The input sequences will be shrank because only sequences of which the length is larger than the time step will participate the remaining calculation. If defined :code:`drnn = DynamicRNN()`, then users can call :code:`drnn()` @@ -2827,7 +2827,7 @@ class DynamicRNN(object): Optional data types are: bool, float16, float32, float64, int8, int16, int32, int64, uint8. Returns: - Variable: The input LoDTensor after sorted and shrinked. If there are :code:`num_sequences` \ + Variable: The input LoDTensor after sorted and shrank. If there are :code:`num_sequences` \ sequences in RNN's input LoDTensor whose length is larger than :code:`step_idx` , \ the static input Tensor will be sorted to the same order as RNN's input and \ will only retain data corresponding to those :code:`num_sequences` sequences. \ @@ -2926,7 +2926,7 @@ class DynamicRNN(object): def __call__(self, *args, **kwargs): """ - This function is used to get the output sequneces of DynamicRNN. 
+ This function is used to get the output sequences of DynamicRNN. Args: None @@ -2968,10 +2968,10 @@ class DynamicRNN(object): If setting shape to :math:`\{D_1, D_2, ...\}` , the shape of memory Tensor will be :math:`\{batch\_size, D_1, D_2, ...\}` , where batch_size is determined by RNN's input sequences. The default value is None. - value (float, optional): When init is None, it is used as initalized value + value (float, optional): When init is None, it is used as initialized value of memory. The default value is 0.0. need_reorder (bool, optional): When init is not None, it determines whether - the memory needs to reorder like the RNN's input sequeneces. It should be + the memory needs to reorder like the RNN's input sequences. It should be set to True when the initialized memory depends on the order of input samples. The default value is False. dtype (str|numpy.dtype, optional): When init is None, it is used to set the @@ -2979,9 +2979,9 @@ class DynamicRNN(object): are: "float32", "float64", "int32", "int64". Returns: - Variable: The memory LoDTensor after shrinked. If there are :code:`num_sequences` \ + Variable: The memory LoDTensor after shrank. If there are :code:`num_sequences` \ sequences in RNN's input LoDTensor whose length is larger than :code:`step_idx` , \ - the memory Tensor also need to be shrinked and will only retain data \ + the memory Tensor also need to be shrank and will only retain data \ corresponding to those :code:`num_sequences` sequences. Raises: diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 1d0dbce39c2..a4fa34e8fb3 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -98,7 +98,7 @@ def retinanet_target_assign(bbox_pred, the training process. Retinanet predicts a :math:`C`-vector for classification and a 4-vector for box - regresion for each anchor, hence the target label for each positive(or negative) + regression for each anchor, hence the target label for each positive(or negative) sample is a :math:`C`-vector and the target locations for each positive sample is a 4-vector. As for a positive sample, if the category of its assigned ground-truth box is class :math:`i`, the corresponding entry in its length @@ -156,7 +156,7 @@ def retinanet_target_assign(bbox_pred, of :attr:`is_crowd` is int32. im_info(Variable): A 2-D Tensor with shape [N, 3] represents the size information of input images. :math:`N` is the batch size, the size - informarion of each image is a 3-vector which are the height and width + information of each image is a 3-vector which are the height and width of the network input along with the factor scaling the origin image to the network input. The data type of :attr:`im_info` is float32. num_classes(int32): The number of categories for classification, the default @@ -557,7 +557,7 @@ def detection_output(loc, categories will be considered. Default: 0. nms_threshold(float): The threshold to be used in NMS. Default: 0.3. nms_top_k(int): Maximum number of detections to be kept according - to the confidences aftern filtering detections based on + to the confidences after filtering detections based on score_threshold and before NMS. Default: 400. keep_top_k(int): Number of total bboxes to be kept per image after NMS step. -1 means keeping all bboxes after NMS step. Default: 200. @@ -660,7 +660,7 @@ def iou_similarity(x, y, box_normalized=True, name=None): Args: x (Variable): ${x_comment}.The data type is float32 or float64. 
y (Variable): ${y_comment}.The data type is float32 or float64. - box_normalized(bool): Whether treat the priorbox as a noramlized box. + box_normalized(bool): Whether treat the priorbox as a normalized box. Set true by default. Returns: Variable: ${out_comment}.The data type is same with x. @@ -775,7 +775,7 @@ def box_coder(prior_box, code_type(str): The code type used with the target box. It can be `encode_center_size` or `decode_center_size`. `encode_center_size` by default. - box_normalized(bool): Whether treat the priorbox as a noramlized box. + box_normalized(bool): Whether treat the priorbox as a normalized box. Set true by default. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and @@ -793,7 +793,7 @@ def box_coder(prior_box, output tensor of box_coder_op with shape [N, M, 4] representing the result of N target boxes encoded with M Prior boxes and variances. When code_type is 'decode_center_size', N represents the batch size - and M represents the number of deocded boxes. + and M represents the number of decoded boxes. Examples: @@ -908,13 +908,13 @@ def yolov3_loss(x, Args: x (Variable): ${x_comment}The data type is float32 or float64. gt_box (Variable): groud truth boxes, should be in shape of [N, B, 4], - in the third dimenstion, x, y, w, h should be stored. - x,y is the center cordinate of boxes, w, h are the + in the third dimension, x, y, w, h should be stored. + x,y is the center coordinate of boxes, w, h are the width and height, x, y, w, h should be divided by input image height to scale to [0, 1]. N is the batch number and B is the max box number in an image.The data type is float32 or float64. - gt_label (Variable): class id of ground truth boxes, shoud be in shape + gt_label (Variable): class id of ground truth boxes, should be in shape of [N, B].The data type is int32. anchors (list|tuple): ${anchors_comment} anchor_mask (list|tuple): ${anchor_mask_comment} @@ -924,7 +924,7 @@ def yolov3_loss(x, name (string): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` - gt_score (Variable): mixup score of ground truth boxes, shoud be in shape + gt_score (Variable): mixup score of ground truth boxes, should be in shape of [N, B]. Default None. use_label_smooth (bool): ${use_label_smooth_comment} @@ -1415,7 +1415,7 @@ def ssd_loss(location, 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. - 1.2 Compute matched boundding box by bipartite matching algorithm. + 1.2 Compute matched bounding box by bipartite matching algorithm. 2. Compute confidence for mining hard examples @@ -1525,10 +1525,10 @@ def ssd_loss(location, def __reshape_to_2d(var): return nn.flatten(x=var, axis=2) - # 1. Find matched boundding box by prior box. + # 1. Find matched bounding box by prior box. # 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. iou = iou_similarity(x=gt_box, y=prior_box) - # 1.2 Compute matched boundding box by bipartite matching algorithm. + # 1.2 Compute matched bounding box by bipartite matching algorithm. matched_indices, matched_dist = bipartite_match(iou, match_type, overlap_threshold) @@ -1653,7 +1653,7 @@ def prior_box(input, sequence according to the aspect_ratios. Parameters: - input(Variable): 4-D tenosr(NCHW), the data type should be float32 or float64. + input(Variable): 4-D tensor(NCHW), the data type should be float32 or float64. 
image(Variable): 4-D tensor(NCHW), the input image data of PriorBoxOp, the data type should be float32 or float64. min_sizes(list|tuple|float): the min sizes of generated prior boxes. @@ -2051,7 +2051,7 @@ def multi_box_head(inputs, min_max_aspect_ratios_order(bool): If set True, the output prior box is in order of [min, max, aspect_ratios], which is consistent with Caffe. Please note, this order affects the weights order of - convolution layer followed by and does not affect the fininal + convolution layer followed by and does not affect the final detection results. Default: False. Returns: @@ -2610,7 +2610,7 @@ def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, target_size / original_size, target_size is the size after resize, original_size is the original image size. gt_classes (Variable): A 2-D LoDTensor with shape [M, 1]. Data type - shoule be int. M is the total number of ground-truth, each + should be int. M is the total number of ground-truth, each element is a class label. is_crowd (Variable): A 2-D LoDTensor with same shape and same data type as gt_classes, each element is a flag indicating whether a @@ -2628,7 +2628,7 @@ def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, float32. R is the total number of RoIs, each element is a bounding box with (xmin, ymin, xmax, ymax) format in the range of original image. labels_int32 (Variable): A 2-D LoDTensor in shape of [R, 1] with type - of int32. R is the same as it in `rois`. Each element repersents + of int32. R is the same as it in `rois`. Each element represents a class label of a RoI. num_classes (int): Class number. resolution (int): Resolution of mask predictions. @@ -2637,15 +2637,15 @@ def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, mask_rois (Variable): A 2D LoDTensor with shape [P, 4] and same data type as `rois`. P is the total number of sampled RoIs. Each element is a bounding box with [xmin, ymin, xmax, ymax] format in range of - orignal image size. + original image size. mask_rois_has_mask_int32 (Variable): A 2D LoDTensor with shape [P, 1] - and int data type, each element repersents the output mask RoI + and int data type, each element represents the output mask RoI index with regard to input RoIs. mask_int32 (Variable): A 2D LoDTensor with shape [P, K * M * M] and int data type, K is the classes number and M is the resolution of mask - predictions. Each element repersents the binary mask targets. + predictions. Each element represents the binary mask targets. Examples: .. code-block:: python @@ -2745,7 +2745,7 @@ def generate_proposals(scores, N is batch size, A is number of anchors, H and W are height and width of the feature map. The data type must be float32. bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] - represents the differece between predicted box locatoin and + represents the difference between predicted box location and anchor location. The data type must be float32. im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Height and width are the input sizes @@ -2841,7 +2841,7 @@ def box_clip(input, im_info, name=None): input(Variable): The input Tensor with shape :math:`[N_1, N_2, ..., N_k, 4]`, the last dimension is 4 and data type is float32 or float64. im_info(Variable): The 2-D Tensor with shape [N, 3] with layout - (height, width, scale) represeting the information of image. + (height, width, scale) representing the information of image. 
Height and width are the input sizes and scale is the ratio of network input size and original size. The data type is float32 or float64. name(str, optional): For detailed information, please refer @@ -2851,7 +2851,7 @@ def box_clip(input, im_info, name=None): Returns: Variable: - output(Variable): The cliped tensor with data type float32 or float64. + output(Variable): The clipped tensor with data type float32 or float64. The shape is same as input. @@ -2919,7 +2919,7 @@ def retinanet_detection_output(bboxes, The data type of each element is float32 or float64. im_info(Variable): A 2-D Tensor with shape :math:`[N, 3]` represents the size information of input images. :math:`N` is the batch size, the size - informarion of each image is a 3-vector which are the height and width + information of each image is a 3-vector which are the height and width of the network input along with the factor scaling the origin image to the network input. The data type of :attr:`im_info` is float32. score_threshold(float): Threshold to filter out bounding boxes @@ -2946,7 +2946,7 @@ def retinanet_detection_output(bboxes, that there is no detection if :attr:`score_threshold` are used at all levels. Hence, this OP do not filter out anchors from the highest FPN level before NMS. And the last element in :attr:`bboxes`:, :attr:`scores` and - :attr:`anchors` is required to be from the hightest FPN level. + :attr:`anchors` is required to be from the highest FPN level. Returns: Variable(The data type is float32 or float64): @@ -3090,7 +3090,7 @@ def multiclass_nms(bboxes, low confidence score. If not provided, consider all boxes. nms_top_k (int): Maximum number of detections to be kept according to - the confidences aftern the filtering detections based + the confidences after the filtering detections based on score_threshold. nms_threshold (float): The threshold to be used in NMS. Default: 0.3 nms_eta (float): The threshold to be used in NMS. Default: 1.0 @@ -3201,7 +3201,7 @@ def locality_aware_nms(bboxes, low confidence score. If not provided, consider all boxes. nms_top_k (int): Maximum number of detections to be kept according to - the confidences aftern the filtering detections based + the confidences after the filtering detections based on score_threshold. nms_threshold (float): The threshold to be used in NMS. Default: 0.3 nms_eta (float): The threshold to be used in NMS. Default: 1.0 diff --git a/python/paddle/fluid/layers/distributions.py b/python/paddle/fluid/layers/distributions.py index 69b99bf577f..396ab443a4b 100644 --- a/python/paddle/fluid/layers/distributions.py +++ b/python/paddle/fluid/layers/distributions.py @@ -561,7 +561,7 @@ class MultivariateNormalDiag(Distribution): a.entropy() # [2.033158] with shape: [1] b.entropy() - # [1.7777451] with shaoe: [1] + # [1.7777451] with shape: [1] a.kl_divergence(b) # [0.06542051] with shape: [1] diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 7c6e5aa1859..180aec4d9b2 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -253,7 +253,7 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True): side when server have finished running server side program. 
Args: - endpoints (str): comma seperated IP:PORT pairs in the order + endpoints (str): comma separated IP:PORT pairs in the order of send_vars to send send_vars (list): variables to send to server sync (bool): whether to wait the request finish @@ -296,7 +296,7 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True): Receive variables from server side Args: - endpoints (str): comma seperated IP:PORT pairs in the order + endpoints (str): comma separated IP:PORT pairs in the order of send_vars to send get_vars (list): vars to get from server after send completes. sync (bool): whether to wait the request finish @@ -603,7 +603,7 @@ def py_reader(capacity, import paddle.dataset.mnist as mnist def network(image, label): - # user defined network, here a softmax regresssion example + # user defined network, here a softmax regression example predict = fluid.layers.fc(input=image, size=10, act='softmax') return fluid.layers.cross_entropy(input=predict, label=label) diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 6ca0769d997..eae8d43bfc4 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -109,7 +109,7 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): training progresses. By using this function, the learning rate will be decayed by 'decay_rate' every 'decay_steps' steps. - Decayed learning rate calcualtes as follows: + Decayed learning rate calculates as follows: >>> if staircase == True: >>> decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps) @@ -165,7 +165,7 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): training progresses. By using this function, the learning rate will be decayed by natural exponential power 'decay_rate' every 'decay_steps' steps. - Decayed learning rate calcualtes as follows: + Decayed learning rate calculates as follows: >>> if not staircase: >>> decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps)) @@ -178,7 +178,7 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): decay_steps(int): The learning rate decay steps. See the decay computation above. decay_rate(float): The learning rate decay rate. See the decay computation above. staircase(bool): If True, decay the learning rate at discrete intervals, which - means the learning rate will be decayed by natual exponential power + means the learning rate will be decayed by natural exponential power `decay_rate` every `decay_steps`. If False, learning rate will be decayed continuously and following the formula above. Default: False @@ -222,7 +222,7 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): training progresses. By using this function, an inverse decay function will be applied to the initial learning rate. - Decayed learning rate calcualtes as follows: + Decayed learning rate calculates as follows: >>> if staircase == True: >>> decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step)) diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 5b9a5f26b3a..23c062a419b 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -497,7 +497,7 @@ def warpctc(input, (https://github.com/baidu-research/warp-ctc) to compute Connectionist Temporal Classification (CTC) loss.
It can be aliased as softmax with CTC, since a native softmax activation is - interated to the Warp-CTC library to normlize values for each row of the + interated to the Warp-CTC library to normalize values for each row of the input tensor. Args: @@ -523,7 +523,7 @@ def warpctc(input, norm_by_times(bool, default false): Whether to normalize the gradients by the number of time-step, which is also the sequence's length. There is no need to normalize the gradients if warpctc layer was - follewed by a mean_op. + followed by a mean_op. input_length(Variable): The length for each input sequence if it is of Tensor type, it should have shape `[batch_size]` and dtype int64. label_length(Variable): The length for each label sequence if it is @@ -663,12 +663,12 @@ def nce(input, num_neg_samples (int): ${num_neg_samples_comment}. name(str|None): For detailed information, please refer to :ref:`api_guide_Name` . Usually name is no need to set and None by default. - sampler (str, optional): The sampler used to sample class from negtive classes. + sampler (str, optional): The sampler used to sample class from negative classes. It can be 'uniform', 'log_uniform' or 'custom_dist'. default: 'uniform'. custom_dist (nd.array|None): A numpy ndarray with size=num_total_classes. It is used when sampler is set to 'custom_dist'. - custom_dist[i] is the probsbility of i-th class to be sampled. + custom_dist[i] is the probability of i-th class to be sampled. default: None. seed (int, optional): The seed used in sampler. Default 0, means no random seed. is_sparse(bool, optional): The flag indicating whether to use sparse update, @@ -1194,7 +1194,7 @@ def softmax_with_cross_entropy(logits, Label is a ``Tensor`` in the same shape with :attr:`logits`. If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` in the same shape with :attr:`logits` expect shape in dimension :attr:`axis` as 1. - soft_label (bool, optional): A flag to indicate whether to interpretate the given + soft_label (bool, optional): A flag to indicate whether to interpret the given labels as soft labels. Default False. ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. Only valid @@ -1665,7 +1665,7 @@ def mse_loss(input, label): Parameters: input (Variable): Input tensor, the data type should be float32. - label (Variable): Label tensor, the data type shoulf be float32. + label (Variable): Label tensor, the data type should be float32. Returns: Variable: The tensor variable storing the mean square error difference of input and label. diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3136fca1095..b804984efbd 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -277,7 +277,7 @@ def fc(input, input (Variable|list of Variable): A Tensor(or LoDTensor) with shape :math:`[N_1, N_2,..., N_k]` or a list of Tensor(or LoDTensor). The dimensions of the input Tensor is at least 2 and the data type should be float32 or float64. - size(int): The number of output units in this layer, which also means the feature size of ouput + size(int): The number of output units in this layer, which also means the feature size of output Tensor(or LoDTensor). num_flatten_dims (int): The fc layer can accept an input Tensor with more than two dimensions. If this happens, the multidimensional tensor will first be flattened @@ -445,7 +445,7 @@ def embedding(input, default weight parameter property is used.
See usage for details in :ref:`api_fluid_ParamAttr` . In addition, user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs to be transformed into numpy format, and the shape of local word - vector shoud be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` + vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` is used to load custom or pre-trained word vectors. See code example 2 for details. dtype(str|core.VarDesc.VarType): It refers to the data type of output Tensor. It must be float32 or float64. Default: float32. @@ -460,7 +460,7 @@ def embedding(input, import numpy as np data = fluid.data(name='x', shape=[None, 1], dtype='int64') - # exampel 1 + # example 1 emb_1 = fluid.embedding(input=data, size=[128, 64]) # example 2: load custom or pre-trained word vectors @@ -819,7 +819,7 @@ def dropout(x, import paddle.fluid as fluid x = fluid.data(name="data", shape=[None, 32, 32], dtype="float32") - droped = fluid.layers.dropout(x, dropout_prob=0.5) + dropped = fluid.layers.dropout(x, dropout_prob=0.5) """ def get_attrs(prog, dropout_prob, is_test, seed): @@ -934,7 +934,7 @@ def chunk_eval(input, a LoDTensor, its shape would be `[N, 1]` where `N` stands for the total sequence lengths in this mini-batch. The data type should be int64. label (Variable): A Tensor or LoDTensor representing the ground-truth labels. - It shoud have the same shape, lod and data type as ``input`` . + It should have the same shape, lod and data type as ``input`` . chunk_scheme (str): Indicate the tagging schemes used here. The value must be IOB, IOE, IOBES or plain. num_chunk_types (int): The number of chunk types. @@ -1090,7 +1090,7 @@ def softmax(input, use_cudnn=False, name=None, axis=-1): Args: input (Variable): The input variable. A multi-dimension ``Tensor`` with type float32 or float64. use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn \ - library is installed. To improve numerical stablity, set use_cudnn to \ + library is installed. To improve numerical stability, set use_cudnn to \ False by default. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Default: None. will be named automatically. Default: None. @@ -1215,7 +1215,7 @@ def conv2d(input, If stride is a tuple, it must contain two integers, (stride_height, stride_width). Otherwise, stride_height = stride_width = stride. Default: stride = 1. padding (string|int|list|tuple): The padding size. It means the number of zero-paddings - on both sides for each dimention.If `padding` is a string, either 'VALID' or + on both sides for each dimension.If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_height, pad_width]` or `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when @@ -1483,7 +1483,7 @@ def conv3d(input, tuple, it must contain three integers, (stride_depth, stride_height, stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. padding (string|int|list|tuple): The padding size. It means the number of zero-paddings - on both sides for each dimention. If `padding` is a string, either 'VALID' or + on both sides for each dimension. 
If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, @@ -2171,7 +2171,7 @@ def adaptive_pool2d(input, # average adaptive pool2d # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], - # output shape is [N, C, m, n], adaptive pool divide H and W dimentions + # output shape is [N, C, m, n], adaptive pool divide H and W dimensions # of input data into m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: @@ -2193,7 +2193,7 @@ def adaptive_pool2d(input, # max adaptive pool2d # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], - # output shape is [N, C, m, n], adaptive pool divide H and W dimentions + # output shape is [N, C, m, n], adaptive pool divide H and W dimensions # of input data into m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: @@ -2312,7 +2312,7 @@ def adaptive_pool3d(input, # average adaptive pool3d # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n], - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimentions + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions # of input data into l * m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: @@ -2341,7 +2341,7 @@ def adaptive_pool3d(input, # max adaptive pool3d # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n], - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimentions + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions # of input data into l * m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: @@ -2985,7 +2985,7 @@ def layer_norm(input, omitted. If :attr:`shift` is True and :attr:`param_attr` is None, a default :code:`ParamAttr` would be added as bias. The :attr:`bias_attr` is initialized as 0 if it is added. Default: None. - act(str, optional): Activation to be applied to the output of layer normalizaiton. + act(str, optional): Activation to be applied to the output of layer normalization. Default: None. name(str): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . @@ -3026,7 +3026,7 @@ def layer_norm(input, inputs['Scale'] = scale else: if param_attr: - warnings.warn("param_attr is only avaliable with scale is True.") + warnings.warn("param_attr is only available with scale is True.") if shift: assert bias_attr is not False, "bias_attr should not be False when using shift." bias = helper.create_parameter( @@ -3034,7 +3034,7 @@ def layer_norm(input, inputs['Bias'] = bias else: if bias_attr: - warnings.warn("bias_attr is only avaliable with shift is True.") + warnings.warn("bias_attr is only available with shift is True.") # create output mean_out = helper.create_variable_for_type_inference( @@ -3085,7 +3085,7 @@ def group_norm(input, attribute. If a bool type, only False is supported, which means there is no bias parameter. Default: None, the default bias parameter attribute is used. 
For more information, please refer to :ref:`api_guide_ParamAttr` . - act(str, optional): Activation to be applied to the output of group normalizaiton. + act(str, optional): Activation to be applied to the output of group normalization. data_layout(str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -3174,7 +3174,7 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None): and W is the product result of remaining dimensions. Step 2: - :attr:`power_iters` shoule be a positive interger, do following + :attr:`power_iters` should be a positive integer, do following calculations with U and V for :attr:`power_iters` rounds. Calculations as follows: @@ -3944,7 +3944,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. x = fluid.data(name='x', shape=[2, 4], dtype='float32') fluid.layers.reduce_mean(x) # [0.4375] fluid.layers.reduce_mean(x, dim=0) # [0.15, 0.25, 0.55, 0.8] @@ -3954,7 +3954,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): # y is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1.0, 2.0], [3.0, 4.0]], # [[5.0, 6.0], [7.0, 8.0]]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32') fluid.layers.reduce_mean(y, dim=[1, 2]) # [2.5, 6.5] fluid.layers.reduce_mean(y, dim=[0, 1]) # [4.0, 5.0] @@ -4015,7 +4015,7 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. x = fluid.data(name='x', shape=[2, 4], dtype='float32') fluid.layers.reduce_max(x) # [0.9] fluid.layers.reduce_max(x, dim=0) # [0.2, 0.3, 0.6, 0.9] @@ -4025,7 +4025,7 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): # y is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1.0, 2.0], [3.0, 4.0]], # [[5.0, 6.0], [7.0, 8.0]]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32') fluid.layers.reduce_max(y, dim=[1, 2]) # [4.0, 8.0] fluid.layers.reduce_max(y, dim=[0, 1]) # [7.0, 8.0] @@ -4076,7 +4076,7 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. x = fluid.data(name='x', shape=[2, 4], dtype='float32') fluid.layers.reduce_min(x) # [0.1] fluid.layers.reduce_min(x, dim=0) # [0.1, 0.2, 0.5, 0.7] @@ -4086,7 +4086,7 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): # y is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1.0, 2.0], [3.0, 4.0]], # [[5.0, 6.0], [7.0, 8.0]]] - # Each example is followed by the correspending output tensor. 
+ # Each example is followed by the corresponding output tensor. y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32') fluid.layers.reduce_min(y, dim=[1, 2]) # [1.0, 5.0] fluid.layers.reduce_min(y, dim=[0, 1]) # [1.0, 2.0] @@ -4115,7 +4115,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): input (Variable): The input variable which is a Tensor, the data type is float32, float64, int32, int64. dim (list|int, optional): The dimensions along which the product is performed. If - :attr:`None`, multipy all elements of :attr:`input` and return a + :attr:`None`, multiply all elements of :attr:`input` and return a Tensor variable with a single element, otherwise must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. @@ -4137,7 +4137,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. x = fluid.data(name='x', shape=[2, 4], dtype='float32') fluid.layers.reduce_prod(x) # [0.0002268] fluid.layers.reduce_prod(x, dim=0) # [0.02, 0.06, 0.3, 0.63] @@ -4148,7 +4148,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): # y is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1.0, 2.0], [3.0, 4.0]], # [[5.0, 6.0], [7.0, 8.0]]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32') fluid.layers.reduce_prod(y, dim=[1, 2]) # [24.0, 1680.0] fluid.layers.reduce_prod(y, dim=[0, 1]) # [105.0, 384.0] @@ -4845,7 +4845,7 @@ def ctc_greedy_decoder(input, in result were empty, the result LoDTensor will be [-1] with empty \ LoD [[]]. - For padding mode, returns a tuple of (output, output_length), which was describled as below: + For padding mode, returns a tuple of (output, output_length), which was described as below: output, 2-D Tensor, shape is [batch_size, N], data type is int64. @@ -5039,7 +5039,7 @@ def im2sequence(input, is :math:`[batchsize, 2]` . It is just for batch inference when not None. Default is None. out_stride(int32 | List[int32]): The scaling of image through CNN. It is valid only when input_image_size is not None. - If out_stride is List, it must contain two intergers, + If out_stride is List, it must contain two integers, :math:`[out\_stride\_height, out\_stride\_W]` . Otherwise, the out_stride_height = out_stride_width = out_stride. Default is 1. @@ -5254,7 +5254,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`. It takes the first dimension of :attr:`x` and :attr:`y` as batch size. For each instance, it computes the smooth L1 loss element by element first - and then sums all the losses. So the shape of ouput Variable is + and then sums all the losses. So the shape of output Variable is [batch_size, 1]. Args: @@ -5479,7 +5479,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): When ``shape`` and ``actual_shape`` are set at the same time, ``actual_shape`` has a higher priority than ``shape`` but at this time ``shape`` can only be an integer list or tuple, and ``shape`` still should be set correctly to - gurantee shape inference in compile-time. 
+ guarantee shape inference in compile-time. Some tricks exist when specifying the target shape. @@ -5632,7 +5632,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): else: assert dim_size > 0, ( "Each dimension value of 'shape' in reshape must not " - "be negtive except one unknown dimension. " + "be negative except one unknown dimension. " "But received shape[%d] = %s." % (dim_idx, str(dim_size))) return attrs_shape @@ -6163,7 +6163,7 @@ def pad_constant_like(x, y, pad_value=0., name=None): Out.shape = (2, 3, 2, 3) Args: - x (Variable): Tensor, its shape spicifies the shape of output. + x (Variable): Tensor, its shape specifies the shape of output. y (Variable): Tensor, its rank is the same with :attr:`x`, and for each dimension :math:`i` , :math:`y\_shape[i] <= x\_shape[i]` . The data type can be float32 or float64. pad_value (float): The constant value used to pad. @@ -6498,7 +6498,7 @@ def image_resize(input, 'NEAREST' : Nearest neighbor interpolation Nearest neighbor interpolation is to perform nearest neighbor interpolation - in both the 3rd dimention(in height direction) and the 4th dimention(in width + in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. Bilinear interpolation is an extension of linear interpolation for @@ -6512,7 +6512,7 @@ def image_resize(input, H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. - Align_corners and align_mode are optinal parameters,the calculation method + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. Example: @@ -6629,7 +6629,7 @@ def image_resize(input, will be deprecated. When using actual_shape to specify output shape, one of :attr:`out_shape` and :attr:`scale` should also be set, otherwise - errors would be occured in graph constructing stage. + errors would be occurred in graph constructing stage. Default: None align_corners(bool) : An optional bool, If True, the centers of the 4 corner pixels of the input and output tensors are aligned, preserving the values at the @@ -6659,7 +6659,7 @@ def image_resize(input, ValueError: out_shape length should be 2 for input 4-D tensor. ValueError: out_shape length should be 3 for input 5-D tensor. ValueError: scale should be greater than zero. - TypeError: align_corners shoule be a bool value + TypeError: align_corners should be a bool value ValueError: align_mode can only be '0' or '1' ValueError: data_format can only be 'NCHW', 'NHWC', 'NCDHW' or 'NDHWC'. @@ -6897,7 +6897,7 @@ def resize_bilinear(input, For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation - Align_corners and align_mode are optinal parameters,the calculation + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. Example: @@ -6954,7 +6954,7 @@ def resize_bilinear(input, will be deprecated. When using actual_shape to specify output shape, one of :attr:`out_shape` and :attr:`scale` should also be set, otherwise - errors would be occured in graph constructing stage. + errors would be occurred in graph constructing stage. 
Default: None align_corners(bool): ${align_corners_comment} align_mode(bool): ${align_mode_comment} @@ -7059,7 +7059,7 @@ def resize_trilinear(input, For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation - Align_corners and align_mode are optinal parameters,the calculation + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. Example: @@ -7118,7 +7118,7 @@ def resize_trilinear(input, will be deprecated. When using actual_shape to specify output shape, one of :attr:`out_shape` and :attr:`scale` should also be set, otherwise - errors would be occured in graph constructing stage. + errors would be occurred in graph constructing stage. Default: None align_corners(bool): ${align_corners_comment} align_mode(bool): ${align_mode_comment} @@ -7272,7 +7272,7 @@ def resize_nearest(input, will be deprecated. When using actual_shape to specify output shape, one of :attr:`out_shape` and :attr:`scale` should also be set, otherwise - errors would be occured in graph constructing stage. + errors would be occurred in graph constructing stage. Default: None align_corners(bool): ${align_corners_comment} data_format (str, optional): Specify the data format of the input, and the data format of the output @@ -7581,7 +7581,7 @@ def scatter(input, index, updates, name=None, overwrite=True): Args: input (Variable): The input N-D Tensor with rank>=1. Data type can be float32. index (Variable): The index 1-D Tensor. Data type can be int32, int64. The length of index cannot exceed updates's length, and the value in index cannot exceed input's length. - updates (Variable): update input with updates parameter based on index. shape should be the same as input, and dim value with dim > 1 shoule be the same as input. + updates (Variable): update input with updates parameter based on index. shape should be the same as input, and dim value with dim > 1 should be the same as input. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . overwrite (bool): The mode that updating the output when there are same indices. If True, use the overwrite mode to update the output of the same index, @@ -8169,13 +8169,13 @@ def crop_tensor(x, shape=None, offsets=None, name=None): x (Variable): 1-D to 6-D Tensor, the data type is float32, float64, int32 or int64. shape (list|tuple|Variable): The output shape is specified by `shape`. Its data type is int32. If a list/tuple, it's length must be - the same as the dimension size of `x`. If a Variable, it shoule be a 1-D Tensor. + the same as the dimension size of `x`. If a Variable, it should be a 1-D Tensor. When it is a list, each element can be an integer or a Tensor of shape: [1]. If Variable contained, it is suitable for the case that the shape may be changed each iteration. offsets (list|tuple|Variable, optional): Specifies the cropping offsets at each dimension. Its data type is int32. If a list/tuple, it's length - must be the same as the dimension size of `x`. If a Variable, it shoule be a 1-D + must be the same as the dimension size of `x`. If a Variable, it should be a 1-D Tensor. When it is a list, each element can be an integer or a Tensor of shape: [1]. If Variable contained, it is suitable for the case that the offsets may be changed each iteration. Default: None, the offsets are 0 at each dimension. 
@@ -9357,7 +9357,7 @@ def expand(x, expand_times, name=None): else: attrs_expand_times.append(times) assert times > 0, ( - "Each element given in expand_times must not be negtive.") + "Each element given in expand_times must not be negative.") return attrs_expand_times def get_new_expand_times_tensor(list_expand_times): @@ -11198,7 +11198,7 @@ def logical_not(x, out=None, name=None): # Graph organizing x = fluid.layers.data(name='x', shape=[2], dtype='bool') res = fluid.layers.logical_not(x) - # The comment lists another availble method. + # The comment lists another available method. # res = fluid.layers.fill_constant(shape=[2], dtype='bool', value=0) # fluid.layers.logical_not(x, out=res) @@ -11495,7 +11495,7 @@ def space_to_depth(x, blocksize, name=None): dimension. The attr blocksize indicates the input block size. - space_to_depth will reorgnize the elements of input with shape[batch, channel, height, width] \ + space_to_depth will reorganize the elements of input with shape[batch, channel, height, width] \ according to blocksize to construct output with shape \ [batch, channel * blocksize * blocksize, height/blocksize, width/blocksize]: @@ -11848,11 +11848,11 @@ def hash(input, hash_size, num_hash=1, name=None): def grid_sampler(x, grid, name=None): """ This operation samples input X by using bilinear interpolation based on - flow field grid, which is usually gennerated by :code:`affine_grid` . The grid of + flow field grid, which is usually generated by :code:`affine_grid` . The grid of shape [N, H, W, 2] is the concatenation of (x, y) coordinates with shape [N, H, W] each, where x is indexing the 4th dimension - (in width dimension) of input data x and y is indexng the 3rd - dimention (in height dimension), finally results is the bilinear + (in width dimension) of input data x and y is indexing the 3rd + dimension (in height dimension), finally results is the bilinear interpolation value of 4 nearest corner points. The output tensor shape will be [N, C, H, W]. @@ -12223,7 +12223,7 @@ def shuffle_channel(x, group, name=None): Args: x(Variable): The input tensor variable. It should be a 4-D tensor with shape [N, C, H, W] - group(int): Indicating the conuts of subgroups, It should divide the number of channels. + group(int): Indicating the counts of subgroups, It should divide the number of channels. Returns: out(Variable): the channels shuffling result is a tensor variable with the @@ -12687,7 +12687,7 @@ def prroi_pool(input, pooled_height (integer): The pooled output height. Default: 1. pooled_width (integer): The pooled output width. Default: 1. batch_roi_nums (Variable): The number of roi for each image in batch. It - shoule be 1-D Tensor, with shape [N] and dtype int64, + should be 1-D Tensor, with shape [N] and dtype int64, where N is the batch size. Default: None. Be note: The lod of input should be empty when batch_roi_nums has values; name (str, default None): The name of this operation. @@ -12857,7 +12857,7 @@ def continuous_value_model(input, cvm, use_cvm=True): :attr:`input` is an embedding vector including show and click value, whose shape is :math:`[N, D]` (N is batch size. D is `2 + embedding dim` ). Show and click at first two dims of embedding vector D. - If :attr:`use_cvm` is True, it will caculate :math:`log(show)` and :math:`log(click)` , and output shape is :math:`[N, D]` . + If :attr:`use_cvm` is True, it will calculate :math:`log(show)` and :math:`log(click)` , and output shape is :math:`[N, D]` .
If :attr:`use_cvm` is False, it will remove show and click from :attr:`input` , and output shape is :math:`[N, D - 2]` . :attr:`cvm` is show_click info, whose shape is :math:`[N, 2]` . @@ -13019,7 +13019,7 @@ def unique(x, dtype='int32'): def unique_with_counts(x, dtype='int32'): """ - This OP return a unique tensor for `x` , and count tensor that the count of unqiue result in raw input, \ + This OP return a unique tensor for `x` , and count tensor that the count of unique result in raw input, \ and an index tensor pointing to this unique tensor. **NOTICE**: This op support the variable type of Tensor only. @@ -13032,7 +13032,7 @@ def unique_with_counts(x, dtype='int32'): tuple, the variable type in tuple is Tensor, the output :attr:`out` data type is the same as input :attr:`x`, \ and data type of output :attr:`index` and :attr:`count` will be int32 or int64.: The :attr:`out` is unique tensor for input :attr:`x`,\ the data shape is :math:`[K]`, the `K` may be different to the `N` in shape of :attr:`x`. :attr:`index` is an index tensor pointing\ - to :attr:`out`, the data shape is :math:`[N]` , the data shape is the same as input :attr:`x`. :attr:`count` is count of unqiue element in\ + to :attr:`out`, the data shape is :math:`[N]` , the data shape is the same as input :attr:`x`. :attr:`count` is count of unique element in\ the :attr:`x`, the data shape is :math:`[K]`, the data shape is the same as output :attr:`out`. Examples: @@ -13163,7 +13163,7 @@ def deformable_conv(input, deformable_groups (int): The number of deformable group partitions. Default: deformable_groups = 1. im2col_step (int): Maximum number of images per im2col computation; - The total batch size should be divisable by this value or smaller + The total batch size should be divisible by this value or smaller than this value; if you face out of memory problem, you can try to use a smaller value here. Default: im2col_step = 64. @@ -13298,7 +13298,7 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): This op returns a col buffer of sliding local blocks of input x, also known as im2col for batched 2D image tensors. For each block under the convolution filter, - all element will be rearranged as a column. While the convolution filter silding over + all element will be rearranged as a column. While the convolution filter sliding over the input feature map, a series of such columns will be formed. For each input :math:`x` with shape [N, C, H, W], the output shape [N, Cout, Lout] @@ -13335,7 +13335,7 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): padding was given, [padding, padding, padding, padding] will be used. For default, paddings will be [0, 0, 0, 0] dilations(int|list): the dilations of convolution kernel, shold be - [dilation_h, dilation_w], or an integer dialtion treated as + [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation]. For default, it will be [1, 1]. name(str, optional): The default value is None. Normally there is no need for user to set this property. @@ -13344,7 +13344,7 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): Returns: The tensor variable corresponding to the sliding local blocks. - The output shape is [N, Cout, Lout] as decribled above. + The output shape is [N, Cout, Lout] as described above. Cout is the total number of values within each block, and Lout is the total number of such blocks.
The data type of output is the same as the input :math:`x` @@ -13462,7 +13462,7 @@ def deformable_roi_pooling(input, Equals the reciprocal of total stride in convolutional layers, Default: 1.0. group_size (list|tuple): The number of groups which input channels are divided and the input is list or tuple, which value type is int32. (eg.number of input channels is k1 * k2 * (C + 1), which k1 and k2 are group width and height and C+1 is number of output - chanels.) eg.(4, 6), which 4 is height of group and 6 is width of group. Default: [1, 1]. + channels.) eg.(4, 6), which 4 is height of group and 6 is width of group. Default: [1, 1]. pooled_height (int): The pooled output height which value type is int32. Default: 1. pooled_width (int): The pooled output width which value type is int32. Default: 1. part_size (list|tuple): The height and width of offset which values in list or tuple is int32, eg.(4, 6), which height is 4 and width is 6, and values always equal to pooled_height \ @@ -13470,7 +13470,7 @@ def deformable_roi_pooling(input, sample_per_part (int): The number of samples in each bin which value type is int32. If value is bigger, it will consume more performance. Default: 1. trans_std (float): Coefficient of offset which value type is float32. It controls weight of offset. Default: 0.1. position_sensitive (bool): Whether to choose deformable psroi pooling mode or not, and value type is bool(True or False). If value is False, input dimension equals to output dimension. \ - If value is True, input dimension shoule be output dimension * pooled_height * pooled_width. Default: False. + If value is True, input dimension should be output dimension * pooled_height * pooled_width. Default: False. name (str|None): Name of layer. Default: None. Returns: Variable: Output of deformable roi pooling is that, if position sensitive is False, input dimension equals to output dimension. If position sensitive is True,\ @@ -13602,10 +13602,10 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): Args: - **input** (Variable): Input indices, last dimension must be 1. - - **index_num** (scalar): An interger defining the range of the index. + - **index_num** (scalar): An integer defining the range of the index. - **nshards** (scalar): The number of shards - **shard_id** (scalar): The index of the current shard - - **ignore_value** (scalar): An ingeter value out of sharded index range + - **ignore_value** (scalar): An integer value out of sharded index range Returns: Variable: The sharded index of input. @@ -13810,7 +13810,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0): Variable: A Tensor of the specified shape filled with uniform_random values. Raises: - TypeError: The shape type should be list or tupple or variable. + TypeError: The shape type should be list or tuple or variable. Examples: .. code-block:: python @@ -13864,7 +13864,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0): else: attrs_shape.append(dim_size) assert dim_size > 0, ( - "Each dimension size given in shape must not be negtive " + "Each dimension size given in shape must not be negative " "except one unknown dimension.") return attrs_shape diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 97d8a5bb3cd..5951d869981 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -142,7 +142,7 @@ The cumulative sum of the elements along a given axis. 
By default, the first ele Args: x (Variable): Input of cumsum operator, the Tensor/LoDTensor needed to be cumsumed. - axis (int, optional): The dimenstion to accumulate along. -1 means the last dimenstion. Default is -1. + axis (int, optional): The dimension to accumulate along. -1 means the last dimension. Default is -1. exclusive (bool, optional): Whether to perform exclusive cumsum. Default is False. reverse (bool, optional): If true, the cumsum is performed in the reversed direction. Default is False. diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 43409c80f3a..dd274233a6b 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -99,11 +99,11 @@ class RNNCell(object): batch_ref: A (possibly nested structure of) tensor variable[s]. The first dimension of the tensor will be used as batch size to initialize states. - shape: A (possiblely nested structure of) shape[s], where a shape is + shape: A (possibly nested structure of) shape[s], where a shape is represented as a list/tuple of integer). -1(for batch size) will beautomatically inserted if shape is not started with it. If None, property `state_shape` will be used. The default value is None. - dtype: A (possiblely nested structure of) data type[s]. The structure + dtype: A (possibly nested structure of) data type[s]. The structure must be same as that of `shape`, except when all tensors' in states has the same data type, a single data type can be used. If None and property `cell.state_shape` is not available, float32 will be used @@ -171,7 +171,7 @@ class RNNCell(object): """ Abstract method (property). Used to initialize states. - A (possiblely nested structure of) shape[s], where a shape is represented + A (possibly nested structure of) shape[s], where a shape is represented as a list/tuple of integers (-1 for batch size would be automatically inserted into a shape if shape is not started with it). Not necessary to be implemented if states are not initialized by @@ -186,9 +186,9 @@ class RNNCell(object): """ Abstract method (property). Used to initialize states. - A (possiblely nested structure of) data types[s]. The structure must be + A (possibly nested structure of) data types[s]. The structure must be same as that of `shape`, except when all tensors' in states has the same - data type, a signle data type can be used. + data type, a single data type can be used. Not necessary to be implemented if states are not initialized by `get_initial_states` or the `dtype` argument is provided when using `get_initial_states`. @@ -356,7 +356,7 @@ class LSTMCell(RNNCell): inputs(Variable): A tensor with shape `[batch_size, input_size]`, corresponding to :math:`x_t` in the formula. The data type should be float32. - states(Variable): A list of containing two tensers, each shaped + states(Variable): A list of containing two tensors, each shaped `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. The data type should be float32. @@ -391,7 +391,7 @@ def rnn(cell, **kwargs): """ rnn creates a recurrent neural network specified by RNNCell `cell`, - which performs :code:`cell.call()` repeatedly until reachs to the maximum + which performs :code:`cell.call()` repeatedly until reaches to the maximum length of `inputs`. Parameters: @@ -408,7 +408,7 @@ def rnn(cell, sequence_length(Variable, optional): A tensor with shape `[batch_size]`. 
It stores real length of each instance, thus enables users to extract the last valid state when past a batch element's sequence length for - correctness. If not provided, the padddings would be treated same as + correctness. If not provided, the paddings would be treated same as non-padding inputs. Default None. time_major(bool, optional): Indicate the data layout of Tensor included in `input` and `output` tensors. If `False`, the data layout would @@ -590,7 +590,7 @@ class Decoder(object): :math:`[time\_step, batch\_size, ...]` , which is done by the caller. final_states(Variable): A (possibly nested structure of) tensor variable[s]. It is the `next_states` returned by `decoder.step` at last decoding step, - thus has the same structrue, shape and data type with states at any time + thus has the same structure, shape and data type with states at any time step. Returns: @@ -664,7 +664,7 @@ class BeamSearchDecoder(Decoder): **Note that fluid.embedding should be used here rather than fluid.layers.embedding, since shape of ids is [batch_size, beam_size]. when using fluid.layers.embedding, must unsqueeze in embedding_fn.** - If not provided, the id to embedding transfomation must be built into + If not provided, the id to embedding transformation must be built into `cell.call`. Default None. output_fn(optional): A callable to apply to the cell's output prior to calculate scores and select candidate token ids. Default None. @@ -687,7 +687,7 @@ class BeamSearchDecoder(Decoder): `beam_size` times. Parameters: - x(Variable): A tenosr with shape `[batch_size, ...]`. The data type + x(Variable): A tensor with shape `[batch_size, ...]`. The data type should be float32, float64, int32, int64 or bool. beam_size(int): The beam width used in beam search. @@ -716,7 +716,7 @@ class BeamSearchDecoder(Decoder): tensor with shape `[batch_size, beam_size, ...]`. Parameters: - x(Variable): A tenosr with shape `[batch_size * beam_size, ...]`. The + x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The data type should be float32, float64, int32, int64 or bool. Returns: @@ -732,7 +732,7 @@ class BeamSearchDecoder(Decoder): tensor with shape `[batch_size * beam_size, ...]`. Parameters: - x(Variable): A tenosr with shape `[batch_size, beam_size, ...]`. The + x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The data type should be float32, float64, int32, int64 or bool. Returns: @@ -1030,7 +1030,7 @@ class BeamSearchDecoder(Decoder): `[time_step, batch_size, ...]`, which is done by the caller. final_states(Variable): A structure(namedtuple) of tensor variables. It is the `next_states` returned by `decoder.step` at last - decoding step, thus has the same structrue, shape and data type + decoding step, thus has the same structure, shape and data type with states at any time step. sequence_lengths(Variable): An `int64` tensor shaped `[batch_size, beam_size]`. It contains sequence lengths for each beam determined during @@ -1059,7 +1059,7 @@ def dynamic_decode(decoder, """ Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned Tensor indicating finished status contains all True values or the number of - decoding step reachs to :attr:`max_step_num`. + decoding step reaches to :attr:`max_step_num`. :code:`decoder.initialize()` would be called once before the decoding loop. 
If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()` @@ -1074,7 +1074,7 @@ def dynamic_decode(decoder, Tensor by :code:`decoder.step()` indicating finished status contains all True. Default `None`. output_time_major(bool, optional): Indicate the data layout of Tensor included - in the final outpus(the first returned value of this method). If + in the final outputs(the first returned value of this method). If attr:`False`, the data layout would be batch major with shape `[batch_size, seq_len, ...]`. If attr:`True`, the data layout would be time major with shape `[seq_len, batch_size, ...]`. Default: `False`. @@ -2080,7 +2080,7 @@ def lstm(input, name (str, optional): A name for this layer. If set None, the layer will be named automatically. Default: None. default_initializer(Initializer, optional): Where use initializer to initialize the Weight - If set None, defaule initializer will be used. Default: None. + If set None, default initializer will be used. Default: None. seed(int, optional): Seed for dropout in LSTM, If it's -1, dropout will use random seed. Default: 1. @@ -2365,9 +2365,9 @@ def dynamic_lstmp(input, inputs['C0'] = c_0 if cell_clip: - assert cell_clip >= 0, "cell_clip should not be negtive." + assert cell_clip >= 0, "cell_clip should not be negative." if proj_clip: - assert proj_clip >= 0, "proj_clip should not be negtive." + assert proj_clip >= 0, "proj_clip should not be negative." helper.append_op( type='lstmp', @@ -2628,7 +2628,7 @@ def gru_unit(input, Returns: tuple: The tuple contains three Tensor variables with the same data type \ as ``input`` . They represent the hidden state for next time step ( :math:`h_t` ), \ - reseted previous hidden state ( :math:`r_t \odot h_{t-1}` ), and the \ + reset previous hidden state ( :math:`r_t \odot h_{t-1}` ), and the \ concatenation of :math:`h_t, r_t, \\tilde{h_t}` . And they have shape \ :math:`[N, D]` , :math:`[N, D]` , :math:`[N, D \times 3]` separately. \ Usually only the hidden state for next time step ( :math:`h_t` ) is used \ @@ -2716,7 +2716,7 @@ def beam_search(pre_ids, scores calculation to perform beam search for one time step. Specifically, after ``ids`` and ``scores`` have been produced, it selects the top-K ( `k` is ``beam_size`` ) candidate word ids of current step from ``ids`` - according to the correspongding ``scores``. Additionally, ``pre_id`` and + according to the corresponding ``scores``. Additionally, ``pre_id`` and ``pre_scores`` are the output of `beam_search` at previous step, they are needed for special use to handle ended candidate translations. @@ -2750,7 +2750,7 @@ def beam_search(pre_ids, ids. scores(Variable): A LodTensor variable containing the accumulated scores corresponding to ``ids`` . Both its shape and lod are same as - thoes of ``ids`` . The data type should be float32. + those of ``ids`` . The data type should be float32. beam_size(int): The beam width used in beam search. end_id(int): The id of end token. level(int): **It can be ignored and mustn't change currently.** @@ -2883,7 +2883,7 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): Returns: tuple: The tuple contains two LodTensor variables. The two LodTensor, \ - containing the full sequences of ids and the correspongding accumulated \ + containing the full sequences of ids and the corresponding accumulated \ scores, have the same shape flattened to 1D and have the same 2 level \ lod. 
The lod can be used to get how many predicted sequences each sample \ has and how many ids each predicted sequence has. diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py index 2a4fe0b69f6..74823900cf5 100644 --- a/python/paddle/fluid/layers/sequence_lod.py +++ b/python/paddle/fluid/layers/sequence_lod.py @@ -109,7 +109,7 @@ def sequence_conv(input, the same as input whether :attr:`padding` is set true or false. Because the length of input sequence may be shorter than :attr:`filter\_size`, which will cause the convolution result to not be computed correctly. These padding data will not be trainable or updated - while trainnig. Default: True. + while training. Default: True. padding_start (int): It is used to indicate the start index for padding the input sequence, which can be negative. The negative number means to pad :attr:`|padding_start|` time-steps of all-zero data at the beginning of each instance. @@ -626,7 +626,7 @@ def sequence_expand(x, y, ref_level=-1, name=None): ref_level: 0 then output is a 1-level LoDTensor out: - out.lod = [[2, 2, 2, 2]] #lod based on offfset + out.lod = [[2, 2, 2, 2]] #lod based on offset out.data = [[a], [b], [a], [b], [c], [d], [c], [d]] out.dims = [8, 1] @@ -844,7 +844,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): to ``maxlen``). The padding value is defined by ``pad_value``, and will be \ appended to the tail of sequences. The result is a Python tuple ``(Out, Length)``: \ the LodTensor ``Out`` is the padded sequences, and LodTensor ``Length`` is \ - the length information of input sequences. For removing paddding data (unpadding \ + the length information of input sequences. For removing padding data (unpadding \ operation), See :ref:`api_fluid_layers_sequence_unpad` . Please note that the input ``x`` should be LodTensor. @@ -869,7 +869,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): x.data = [[a1,a2],[b1,b2],[c1,c2],[d1,d2],[e1,e2]] pad_value: pad_value.data = [0] - defualt maxlen = None, (the virtual value is 3, according to the shape of x) + default maxlen = None, (the virtual value is 3, according to the shape of x) the output tuple (Out, Length): Out.data = [[[a1,a2],[b1,b2],[0,0]],[[c1,c2],[d1,d2],[e1,e2]]] @@ -881,7 +881,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): x.data = [[a1,a2],[b1,b2],[c1,c2],[d1,d2],[e1,e2]] pad_value: pad_value.data = [p1,p2] - defualt maxlen = None, (the virtual value is 3) + default maxlen = None, (the virtual value is 3) get tuple (Out, Length): Out.data = [[[a1,a2],[b1,b2],[p1,p2]],[[c1,c2],[d1,d2],[e1,e2]]] @@ -891,7 +891,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): Args: x (Variable): Input 1-level LodTensor with dims ``[M, K]``. The batch \ - size is described by lod infor (the number of sequnces ). \ + size is described by lod infor (the number of sequences ). \ The data type should be float32, float64, int8, int32 or int64. pad_value (Variable): Padding value. It can be a scalar or a 1D tensor \ with length ``K``. 
If it's a scalar, it will be automatically broadcasted \ @@ -962,7 +962,7 @@ def sequence_unpad(x, length, name=None): [ 6.0, 7.0, 8.0, 9.0, 10.0], [11.0, 12.0, 13.0, 14.0, 15.0]], - in which there are 3 sequences padded to length 5, and the acutal length + in which there are 3 sequences padded to length 5, and the actual length specified by input Variable **length**: length.data = [2, 3, 4], @@ -1077,7 +1077,7 @@ def sequence_scatter(input, index, updates, name=None): **The index and updates parameters of the OP must be LoDTensor.** - Plus the updates data to the correspoding input according to the index. + Plus the updates data to the corresponding input according to the index. The updated algorithm is as follows: output[instance_index][index [pos]] = input[instance_index][index [pos]] + updates[pos], where instance_idx is the K sample corresponding to pos in batch. diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index c8b8e634137..5c467f2d36d 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -512,9 +512,9 @@ def assign(input, output=None): def fill_constant(shape, dtype, value, force_cpu=False, out=None): """ This OP creates a Tensor with specified `shape` and `dtype`, and - initializes it with a constant specifed by `value`. + initializes it with a constant specified by `value`. - The attribute `stop_gradient` of the created Tensor is setted to True. + The attribute `stop_gradient` of the created Tensor is set to True. Args: shape(list|tuple|Variable): Shape of the Tensor to be created. @@ -524,7 +524,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output tensor which can be float16, float32, float64, int32, int64. value(float): The constant value used to initialize the Tensor to be created. - force_cpu(True): data should be on CPU if it's true, defalut value is False. + force_cpu(True): data should be on CPU if it's true, default value is False. out(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of operation. if out is None, a new Varibale will be create to store the result. @@ -686,7 +686,7 @@ def fill_constant_batch_size_like(input, The default value is 0. output_dim_idx(int): Used to specify which dimension of Tensor is created to be set the value of batch_size of input Tensor. The default value is 0. - force_cpu(bool): data should be on CPU if it's true, defalut value is False. + force_cpu(bool): data should be on CPU if it's true, default value is False. Returns: Variable: Tensor which will be created according to dtype. @@ -1079,7 +1079,7 @@ def save_combine(x, file_path, overwrite=True): def load_combine(out, file_path): """ - Loads a list of vairables from a single file. + Loads a list of variables from a single file. Args: out(list): The list of variables to be read from the disk file. @@ -1288,7 +1288,7 @@ def zeros_like(x, out=None): x(Variable): The input tensor which specifies shape and dtype, the input data dtype could be bool, float32, float64, int32, int64. out(Variable, optional): If is :attr:`None` , the op will create the variable as output, the data type and shape of \ this variable will be same as input :attr:`x`. If is a tensor, the data type and shape need to be same as input :attr:`x`. - The defalut value is :attr:`None` .
Returns: Variable: The N-D tensor, the element in tensor is related to input data type, if the input data type is bool, \ diff --git a/python/paddle/fluid/log_helper.py b/python/paddle/fluid/log_helper.py index 0933d7b9048..ab20ed4c48c 100644 --- a/python/paddle/fluid/log_helper.py +++ b/python/paddle/fluid/log_helper.py @@ -31,7 +31,7 @@ def get_logger(name, level, fmt=None): fmt (str): Format of logger output Returns: - logging.Logger: logging logger with given setttings + logging.Logger: logging logger with given settings Examples: .. code-block:: python diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 43c83e6c68b..cc9d2603762 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -71,7 +71,7 @@ class MetricBase(object): 2. aggregate the existing evaluation results as the overall performance. The class Metric is the base class for all classes in paddle.fluid.metrics, it defines - the fundmental APIs for all metrics classes, including: + the fundamental APIs for all metrics classes, including: 1. update(preds, labels): given the prediction results (preds) and the labels (labels) of some mini-batch, compute the evaluation result of that mini-batch, and memorize the @@ -142,7 +142,7 @@ class MetricBase(object): None Returns: - a python dict, which costains the inner states of the metric instance + a python dict, which contains the inner states of the metric instance Return types: a python dict @@ -275,7 +275,7 @@ class Precision(MetricBase): relevant instances among the retrieved instances. Refer to https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers - Noted that this class mangages the precision score only for binary classification task. + Noted that this class manages the precision score only for binary classification task. Args: name (str, optional): Metric name. For details, please refer to :ref:`api_guide_Name`. Default is None. @@ -358,7 +358,7 @@ class Recall(MetricBase): Refer to: https://en.wikipedia.org/wiki/Precision_and_recall - Noted that this class mangages the recall score only for binary classification task. + Noted that this class manages the recall score only for binary classification task. Args: name (str, optional): Metric name. For details, please refer to :ref:`api_guide_Name`. Default is None. @@ -391,7 +391,7 @@ class Recall(MetricBase): def __init__(self, name=None): super(Recall, self).__init__(name) self.tp = 0 # true positive - self.fn = 0 # false negtive + self.fn = 0 # false negative def update(self, preds, labels): """ @@ -529,10 +529,10 @@ class ChunkEvaluator(MetricBase): .. code-block:: python import paddle.fluid as fluid - # init the chunck-level evaluation manager + # init the chunk-level evaluation manager metric = fluid.metrics.ChunkEvaluator() - # suppose the model predict 10 chuncks, while 8 ones are correct and the ground truth has 9 chuncks. + # suppose the model predicts 10 chunks, while 8 ones are correct and the ground truth has 9 chunks. num_infer_chunks = 10 num_label_chunks = 9 num_correct_chunks = 8 @@ -542,7 +542,7 @@ class ChunkEvaluator(MetricBase): print("precision: %.2f, recall: %.2f, f1: %.2f" % (numpy_precision, numpy_recall, numpy_f1)) - # the next batch, predicting 3 prefectly correct chuncks. + # the next batch, predicting 3 perfectly correct chunks.
num_infer_chunks = 3 num_label_chunks = 3 num_correct_chunks = 3 diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index f8ad54751fe..118b9d60e3b 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -397,7 +397,7 @@ def scaled_dot_product_attention(queries, where :math:`N` stands for batch size, :math:`L_k` for the sequence length of key, :math:`d_v \\times h` for the feature size of value, :math:`h` for head number. The data type should be the same as ``queries`` . - num_heads (int, optional): Indicate the number of head. If the numher + num_heads (int, optional): Indicate the number of head. If the number is 1, linear projection would not be performed on inputs. Default: 1. dropout_rate (float, optional): The rate to drop the attention weight. Default: 0.0, which means no dropout. @@ -410,7 +410,7 @@ def scaled_dot_product_attention(queries, Multi-Head Attention. Raises: - ValueError: Inputs quries, keys and values should all be 3-D tensors. + ValueError: Inputs queries, keys and values should all be 3-D tensors. ValueError: The hidden size of queries and keys should be the same. ValueError: The max sequence length in query batch and in key batch should be the same. ValueError: he hidden size of keys must be divisible by the number of attention heads. @@ -429,7 +429,7 @@ def scaled_dot_product_attention(queries, """ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): raise ValueError( - "Inputs quries, keys and values should all be 3-D tensors.") + "Inputs queries, keys and values should all be 3-D tensors.") if queries.shape[-1] != keys.shape[-1]: raise ValueError( @@ -474,7 +474,7 @@ def scaled_dot_product_attention(queries, def __split_heads(x, num_heads): """ - Reshape the last dimension of inpunt tensor x so that it becomes two + Reshape the last dimension of input tensor x so that it becomes two dimensions. Args: @@ -496,13 +496,13 @@ def scaled_dot_product_attention(queries, x=x, shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads]) - # permuate the dimensions into: + # permute the dimensions into: # [batch_size, num_heads, max_sequence_len, hidden_size_per_head] return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ - Reshape the last two dimensions of inpunt tensor x so that it becomes + Reshape the last two dimensions of input tensor x so that it becomes one dimension. Args: diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index d9c2cb702f6..dd7995c6f7f 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -112,12 +112,12 @@ class Optimizer(object): @framework.dygraph_only def state_dict(self): ''' - Get state dict information from optimizer. It contain all the variable used by optimizer. For Adam opimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict. - If the optimzier never be called(minimize function), the state_dict is empty. + Get state dict information from optimizer. It contain all the variable used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict. + If the optimizer never be called(minimize function), the state_dict is empty. Args: None Return: - state_dict(dict) : dict contains all the variablel used by optimizer + state_dict(dict) : dict contains all the variable used by optimizer Examples: .. 
code-block:: python @@ -153,7 +153,7 @@ class Optimizer(object): @framework.dygraph_only def set_dict(self, state_dict): ''' - Load optimizer state dict. For Adam opimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. + Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. Args: state_dict(dict) : Dict contains all the Variable needed by optimizer @@ -289,7 +289,7 @@ class Optimizer(object): def current_step_lr(self): """ .. note:: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Get current step learning rate. The return value is all the same When LearningRateDecay is not used, otherwise return the step learning rate. @@ -1613,7 +1613,7 @@ class AdagradOptimizer(Optimizer): class AdamOptimizer(Optimizer): """ - The Adam optimzier uses an optimization described at the end + The Adam optimizer uses an optimization described at the end of section 2 of `Adam paper `_ , it can dynamically adjusts the learning rate of each parameter using the 1st moment estimates and the 2nd moment estimates of the gradient. diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index a82a75e10bb..dae6d99ee77 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -196,7 +196,7 @@ class WeightNormParamAttr(ParamAttr): Args: dim(int): Dimension over which to compute the norm. Dim is a non-negative number which is less than the rank of weight Tensor. For Example, dim can - be choosed from 0, 1, 2, 3 for convolution whose weight shape is [cout, cin, kh, kw] + be chosen from 0, 1, 2, 3 for convolution whose weight shape is [cout, cin, kh, kw] and rank is 4. Default None, meaning that all elements will be normalized. name(str, optional): The parameter's name. Default None, meaning that the name would be created automatically. Please refer to :ref:`api_guide_Name` for more details. diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index c0b0c86ecd2..730e9c10a73 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -312,7 +312,7 @@ def profiler(state, #### Examples Results #### #### 1) sorted_key = 'total', 'calls', 'max', 'min', 'ave' #### - # The only difference in 5 sorted_key results is the following sentense: + # The only difference in 5 sorted_key results is the following sentence: # "Sorted by number of xxx in descending order in the same thread." # The reason is that in this example, above 5 columns are already sorted. 
-------------------------> Profiling Report <------------------------- diff --git a/python/paddle/fluid/tests/demo/pipeline_train.py b/python/paddle/fluid/tests/demo/pipeline_train.py index 54fa719e29d..bebc0761bf0 100644 --- a/python/paddle/fluid/tests/demo/pipeline_train.py +++ b/python/paddle/fluid/tests/demo/pipeline_train.py @@ -71,7 +71,7 @@ def parse_args(): parser.add_argument( '--emb_lr_rate', type=float, default=0.5, help='learning rate') parser.add_argument( - '--step', type=int, default=1, help='gnn propogation steps') + '--step', type=int, default=1, help='gnn propagation steps') parser.add_argument( '--lr_dc', type=float, default=0.1, help='learning rate decay rate') parser.add_argument( diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index b8d83323600..c3ff3c0feb7 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -223,7 +223,7 @@ input_descs = { # The actual data shape of label_word is: # [batch_size * max_trg_len_in_batch, 1] "lbl_word": [(batch_size * seq_len, long_type(1)), "int64"], - # This input is used to mask out the loss of paddding tokens. + # This input is used to mask out the loss of padding tokens. # The actual data shape of label_weight is: # [batch_size * max_trg_len_in_batch, 1] "lbl_weight": [(batch_size * seq_len, long_type(1)), "float32"], @@ -972,7 +972,7 @@ def multi_head_attention(queries, """ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): raise ValueError( - "Inputs: quries, keys and values should all be 3-D tensors.") + "Inputs: queries, keys and values should all be 3-D tensors.") def __compute_qkv(queries, keys, values, n_head, d_key, d_value): """ @@ -997,7 +997,7 @@ def multi_head_attention(queries, def __split_heads(x, n_head): """ - Reshape the last dimension of inpunt tensor x so that it becomes two + Reshape the last dimension of input tensor x so that it becomes two dimensions and then transpose. Specifically, input a tensor with shape [bs, max_sequence_length, n_head * hidden_dim] then output a tensor with shape [bs, n_head, max_sequence_length, hidden_dim]. @@ -1011,13 +1011,13 @@ def multi_head_attention(queries, reshaped = layers.reshape( x=x, shape=[0, 0, n_head, hidden_size // n_head]) - # permuate the dimensions into: + # permute the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ - Transpose and then reshape the last two dimensions of inpunt tensor x + Transpose and then reshape the last two dimensions of input tensor x so that it becomes one dimension, which is reverse to __split_heads. """ if len(x.shape) == 3: return x diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 48509067cde..db9e8d2c6bd 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -126,7 +126,7 @@ class TestSqrtDoubleGradCheck(unittest.TestCase): class TestSquareDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not include -1.
shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py index 52d44d69fae..e86f18a6216 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py @@ -28,7 +28,7 @@ from decorator_helper import prog_scope class TestElementwiseMulDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not include -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 @@ -55,7 +55,7 @@ class TestElementwiseMulDoubleGradCheck(unittest.TestCase): class TestElementwiseMulBroadcastDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not include -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 @@ -82,7 +82,7 @@ class TestElementwiseMulBroadcastDoubleGradCheck(unittest.TestCase): class TestElementwiseAddDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not include -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 @@ -109,7 +109,7 @@ class TestElementwiseAddDoubleGradCheck(unittest.TestCase): class TestElementwiseAddBroadcastDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not include -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 @@ -136,7 +136,7 @@ class TestElementwiseAddBroadcastDoubleGradCheck(unittest.TestCase): class TestElementwiseSubDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not include -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 @@ -163,7 +163,7 @@ class TestElementwiseSubDoubleGradCheck(unittest.TestCase): class TestElementwiseSubBroadcastDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not include -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 @@ -190,7 +190,7 @@ class TestElementwiseSubBroadcastDoubleGradCheck(unittest.TestCase): class TestElementwiseDivDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not include -1. shape = [2, 3, 7, 9] eps = 0.0001 dtype = np.float64 @@ -218,7 +218,7 @@ class TestElementwiseDivDoubleGradCheck(unittest.TestCase): class TestElementwiseDivBroadcastDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not include -1.
shape = [2, 3, 7, 9] eps = 0.0001 dtype = np.float64 diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 5ce405dccae..fceaa0c14c4 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -74,7 +74,7 @@ def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores, else: # Avoid sorting possibly large arrays; # First partition to get top K unsorted - # and then sort just thoes + # and then sort just those inds = np.argpartition(-scores.squeeze(), pre_nms_topN)[:pre_nms_topN] order = np.argsort(-scores[inds].squeeze()) order = inds[order] diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index c57d10a24aa..7e9dad69def 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -266,7 +266,7 @@ input_descs = { # The actual data shape of label_word is: # [batch_size * max_trg_len_in_batch, 1] "lbl_word": [(batch_size * seq_len, 1), "int64"], - # This input is used to mask out the loss of paddding tokens. + # This input is used to mask out the loss of padding tokens. # The actual data shape of label_weight is: # [batch_size * max_trg_len_in_batch, 1] "lbl_weight": [(batch_size * seq_len, 1), "float32"], diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py index 27ee3b08a4e..8a9204c73fc 100755 --- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py @@ -45,7 +45,7 @@ class LinearChainCrfForward(object): self.w_exps = transition_exps[2:, :] # The output of linear chain crf operator. - # alpha is a memo table in dynamic programming to caculate + # alpha is a memo table in dynamic programming to calculate # nomalization factor. self.alpha = np.zeros( (seq_start_positions[-1], self.tag_num), dtype="float64") diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py index ae11f23299c..b978e721746 100644 --- a/python/paddle/fluid/tests/unittests/test_nce.py +++ b/python/paddle/fluid/tests/unittests/test_nce.py @@ -131,12 +131,12 @@ class TestNCECase1SelectedRows(unittest.TestCase): @staticmethod def get_train_data(batch_size): - batchs = [] + batches = [] for i in range(batch_size): input = np.random.randn(batch_size, 10).astype(np.float32) labels = np.random.randint(0, 20, (batch_size, 1)) - batchs.append([input, labels]) - return batchs + batches.append([input, labels]) + return batches def get_optimizer(self): # SGD optimizer diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index e38028feea2..c6cfe01dce4 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -104,7 +104,7 @@ class TestReduceMeanWithDimDoubleGradCheck(unittest.TestCase): class TestMulDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not include -1.
x_shape = [7, 11] y_shape = [11, 9] eps = 0.005 diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 826737aeefa..3dfd9023f5a 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -329,7 +329,7 @@ class TestReshapeOpError(unittest.TestCase): self.assertRaises(AssertionError, test_shape_2) - # The argument shape have more than one negtive value. + # The argument shape have more than one negative value. def test_shape_3(): fluid.layers.reshape(x3, [-1, -2, 5]) diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index d4b92f9849a..ad141c96bbf 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -285,7 +285,7 @@ class TestSaveLoadBase(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -299,7 +299,7 @@ class TestSaveLoadBase(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.load(main_program, "./test_1.pdparams", exe) @@ -394,7 +394,7 @@ class TestSaveLoadPartial(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -408,7 +408,7 @@ class TestSaveLoadPartial(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.load(test_program, "./test_1.pdopt", None) @@ -496,7 +496,7 @@ class TestSaveLoadSetStateDict(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -510,7 +510,7 @@ class TestSaveLoadSetStateDict(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.load(main_program, "./test_1", exe) @@ -605,7 +605,7 @@ class TestProgramStatePartial(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been 
update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -619,7 +619,7 @@ class TestProgramStatePartial(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) #fluid.load(test_program, "./test_1", None ) @@ -652,7 +652,7 @@ class TestProgramStatePartial(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.set_program_state(test_program, program_state_1) @@ -672,7 +672,7 @@ class TestProgramStatePartial(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.set_program_state(test_program, program_state_2) @@ -692,7 +692,7 @@ class TestProgramStatePartial(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.set_program_state(test_program, program_state_3) @@ -777,7 +777,7 @@ class TestVariableInit(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update base_map[var.name] = t for var in program.list_vars(): @@ -868,7 +868,7 @@ class TestLoadFromOldInterface(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -883,7 +883,7 @@ class TestLoadFromOldInterface(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.load(main_program, "test_path", exe) @@ -984,7 +984,7 @@ class TestLoadFromOldInterfaceSingleFile(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -1000,7 +1000,7 @@ class TestLoadFromOldInterfaceSingleFile(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) file_model_path = os.path.join("test_path", 
"model_single") @@ -1136,7 +1136,7 @@ class TestProgramStateOldSave(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -1150,7 +1150,7 @@ class TestProgramStateOldSave(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) #fluid.load(test_program, "./test_1", None ) @@ -1247,7 +1247,7 @@ class TestProgramStateOldSaveSingleModel(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -1262,7 +1262,7 @@ class TestProgramStateOldSaveSingleModel(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) #fluid.load(test_program, "./test_1", None ) diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index 1782d432490..970eb2daea5 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -57,7 +57,7 @@ def multi_head_attention(queries, """ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): raise ValueError( - "Inputs: quries, keys and values should all be 3-D tensors.") + "Inputs: queries, keys and values should all be 3-D tensors.") def __compute_qkv(queries, keys, values, n_head, d_key, d_value): """ @@ -91,7 +91,7 @@ def multi_head_attention(queries, def __split_heads(x, n_head): """ - Reshape the last dimension of inpunt tensor x so that it becomes two + Reshape the last dimension of input tensor x so that it becomes two dimensions and then transpose. Specifically, input a tensor with shape [bs, max_sequence_length, n_head * hidden_dim] then output a tensor with shape [bs, n_head, max_sequence_length, hidden_dim]. @@ -104,13 +104,13 @@ def multi_head_attention(queries, reshaped = layers.reshape( x=x, shape=[batch_size, -1, n_head, hidden_size // n_head]) - # permuate the dimensions into: + # permute the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ - Transpose and then reshape the last two dimensions of inpunt tensor x + Transpose and then reshape the last two dimensions of input tensor x so that it becomes one dimension, which is reverse to __split_heads. 
""" if len(x.shape) == 3: return x diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py index dc78ffe70b3..08e4056487a 100644 --- a/python/paddle/fluid/transpiler/details/program_utils.py +++ b/python/paddle/fluid/transpiler/details/program_utils.py @@ -98,7 +98,7 @@ def op_to_code(op, skip_op_callstack=True): op: A fluid operator. Returns: - string: The foramtted string. + string: The formatted string. """ outputs_str = "{" diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index f6d754c3a77..31177cc2c61 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -160,7 +160,7 @@ class DistributeTranspilerConfig(object): Minimum number of splitted elements in block, default is 8192. According to : https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 - We can use bandwidth effiently when data size is larger than 2MB.If you + We can use bandwidth efficiently when data size is larger than 2MB.If you want to change it, please be sure you have read the slice_variable function. You can find the definition of slice_variable in https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -202,7 +202,7 @@ class DistributeTranspilerConfig(object): #The picture here illustrates the principle: #https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396 use_hierarchical_allreduce = False - #Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu cards' number in most cases. + #Nccl ranks in a node when use hierarchical allreduce, it's set to gpu cards' number in most cases. hierarchical_allreduce_inter_nranks = 0 # if mode is collective @@ -1460,7 +1460,7 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler endpoint (str): current pserver endpoint. pserver_program (Program): deprecated, call get_pserver_program first. startup_program (Program): deprecated, should pass startup_program - when initalizing + when initializing Returns: Program: parameter server side startup program. diff --git a/python/paddle/fluid/transpiler/geo_sgd_transpiler.py b/python/paddle/fluid/transpiler/geo_sgd_transpiler.py index 4c2172f8676..484f6aa5eb5 100644 --- a/python/paddle/fluid/transpiler/geo_sgd_transpiler.py +++ b/python/paddle/fluid/transpiler/geo_sgd_transpiler.py @@ -308,7 +308,7 @@ class GeoSgdTranspiler(DistributeTranspiler): }) for ep in self.pserver_endpoints ] - # step 5. Create delta var of Geo-Sgd & record vars infomation + # step 5. Create delta var of Geo-Sgd & record vars information for origin_name, splited_vars in self.param_var_mapping.items(): origin_var = self.origin_program.global_block().var(origin_name) self.vars_info[origin_name] = collections.OrderedDict() diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py index 3d2273be0c0..a91007c0d38 100644 --- a/python/paddle/fluid/transpiler/ps_dispatcher.py +++ b/python/paddle/fluid/transpiler/ps_dispatcher.py @@ -19,7 +19,7 @@ class PSDispatcher(object): """ PSDispatcher is the base class for dispatching vars into different pserver instance. - You need to implement the `dispatch` inferface. + You need to implement the `dispatch` interface. 
""" def __init__(self, pserver_endpoints): @@ -88,7 +88,7 @@ class HashName(PSDispatcher): class RoundRobin(PSDispatcher): """ - Distribute variables to serveral endpoints using + Distribute variables to several endpoints using RondRobin method. Args: diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index becf61934ed..a81746e4a2c 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -458,12 +458,12 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000): """ This API use python ``multiprocessing`` to read data from ``readers`` parallelly, and then ``multiprocess.Queue`` or ``multiprocess.Pipe`` is used to merge - these data. A seperate process will be created for each reader in the + these data. A separate process will be created for each reader in the ``readers`` list, please guarantee every reader can work independently to avoid conflicts in parallel environment. - ``Multiprocess.Queue`` require the rw access right to /dev/shm, and it's not suppported + ``Multiprocess.Queue`` require the rw access right to /dev/shm, and it's not supported in some platforms. Parameters: diff --git a/python/paddle/utils/image_util.py b/python/paddle/utils/image_util.py index 94029cff9fb..b113f574e9f 100644 --- a/python/paddle/utils/image_util.py +++ b/python/paddle/utils/image_util.py @@ -98,7 +98,7 @@ def preprocess_img(im, img_mean, crop_size, is_train, color=True): Does data augmentation for images. If is_train is false, cropping the center region from the image. If is_train is true, randomly crop a region from the image, - and randomy does flipping. + and random does flipping. im: (K x H x W) ndarrays """ im = im.astype('float32') diff --git a/python/paddle/utils/plotcurve.py b/python/paddle/utils/plotcurve.py index a95e5497e23..9c298acf01d 100644 --- a/python/paddle/utils/plotcurve.py +++ b/python/paddle/utils/plotcurve.py @@ -37,7 +37,7 @@ optional arguments: The keys must be in the order of paddle output(!!!). -For example, paddle.INFO contrains the following log +For example, paddle.INFO contains the following log I0406 21:26:21.325584 3832 Trainer.cpp:601] Pass=0 Batch=7771 AvgCost=0.624935 Eval: error=0.260972 To use this script to generate plot for AvgCost, error: diff --git a/python/paddle/utils/preprocess_img.py b/python/paddle/utils/preprocess_img.py index fc67949dfe0..e54393fa4a0 100644 --- a/python/paddle/utils/preprocess_img.py +++ b/python/paddle/utils/preprocess_img.py @@ -135,7 +135,7 @@ class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater): def create_dataset_from_dir(self, path): """ - Create a Dataset object for image classfication. + Create a Dataset object for image classification. Each folder in the path directory corresponds to a set of images of this label, and the name of the folder is the name of the path: the path of the image dataset. diff --git a/python/paddle/utils/preprocess_util.py b/python/paddle/utils/preprocess_util.py index 05b2067d01a..76fc83acdc0 100644 --- a/python/paddle/utils/preprocess_util.py +++ b/python/paddle/utils/preprocess_util.py @@ -39,7 +39,7 @@ def save_list(l, outfile): def exclude_pattern(f): """ - Return whether f is in the exlucde pattern. + Return whether f is in the exclude pattern. Exclude the files that starts with . or ends with ~. """ return f.startswith(".") or f.endswith("~") @@ -81,7 +81,7 @@ def list_files(path): def get_label_set_from_dir(path): """ Return a dictionary of the labels and label ids from a path. 
- Assume each direcotry in the path corresponds to a unique label. + Assume each directory in the path corresponds to a unique label. The keys of the dictionary is the label name. The values of the dictionary is the label id. """ @@ -198,7 +198,7 @@ class DataBatcher: def __init__(self, train_data, test_data, label_set): """ - train_data, test_data: Each one is a dataset object repesenting + train_data, test_data: Each one is a dataset object representing training and testing data, respectively. label_set: a dictionary storing the mapping from label name to label id. """ @@ -256,7 +256,7 @@ class DataBatcher: class DatasetCreater(object): """ A virtual class for creating datasets. - The derived clasas needs to implemnt the following methods: + The derived class needs to implement the following methods: - create_dataset() - create_meta_file() """ -- GitLab