diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 94424a5ffaf23067b86e66fe232dcdec6a189712..d82035c03ee10ba186c2a989c62e5a65c8dbba3b 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -126,7 +126,7 @@ class Dataset { virtual void DestroyPreLoadReaders() = 0; // set preload thread num virtual void SetPreLoadThreadNum(int thread_num) = 0; - // seperate train thread and dataset thread + // separate train thread and dataset thread virtual void DynamicAdjustChannelNum(int channel_num) = 0; virtual void DynamicAdjustReadersNum(int thread_num) = 0; // set fleet send sleep seconds diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 738bbf51115fc126e072a3ca3519e234b22c6b4b..5388df6bc504203abb57237f2d23a324367ce087 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -133,10 +133,10 @@ struct BuildStrategy { // The picture is here: // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396 bool use_hierarchical_allreduce_{false}; - // Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu + // Nccl ranks in a node when use hierarchical allreduce, it's set to gpu // cards' number in most cases. size_t hierarchical_allreduce_inter_nranks_{0}; - // Nccl ranks bewteen nodes when use hierarchical allreduce, it's setted to + // Nccl ranks between nodes when use hierarchical allreduce, it's set to // nodes number. size_t hierarchical_allreduce_exter_nranks_{0}; diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index b4d6f683ce747a35aea7b431165911d942bcf092..c3c02c30b9d6279161d7cbafd8d67462fc8bbe85 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -33,7 +33,7 @@ namespace ir { GET_IR_NODE(act_op); \ GET_IR_NODE(act_out); -// Inherient the basic infomation from `base_desc`, and modify some fields. +// Inherit the basic information from `base_desc`, and modify some fields. framework::proto::OpDesc PrepareOpDesc( const framework::proto::OpDesc& base_desc, const std::string& bias, const std::string& bias1, const std::string& activation, diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index ba0a2fb96458bd70105fa4d97114b609657b62f6..b15871ef03fbb3834160b0e118ecded6b568e1ca 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -31,7 +31,7 @@ namespace ir { GET_IR_NODE(act_op); \ GET_IR_NODE(act_out); -// Inherient the basic infomation from `base_desc`, and modify some fields. +// Inherit the basic information from `base_desc`, and modify some fields. 
framework::proto::OpDesc PrepareOpDesc( const framework::proto::OpDesc& base_desc, const std::string& bias, const std::string& activation, const std::string& output) { diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index b55bbbe5aea65490db7d437b7a0ef94a5715e8c5..35bdfde96bc3c8a0a9247378849730d9ef4f54aa 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -382,7 +382,7 @@ const VarDesc *FuseOptimizerOpPass::GetVarDescFromVarsInfo( const std::string &var_name) const { auto grad_iter = vars_info.find(var_name); PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, - "The gradient varibale %s is not found.", var_name); + "The gradient variable %s is not found.", var_name); PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, "The gradient var node %s is not found.", var_name); PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var(), diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 919364541e4eee27a5970da12ffd818124699d50..e0b7a4d3378401dd10117a46e01480e7a2a8fe3e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -131,7 +131,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { } // The intermediate Nodes can only link to the nodes inside the pattern, or this -// subgraph will be droped. +// subgraph will be dropped. void GraphPatternDetector::ValidateByNodeRole( std::vector *subgraphs) { std::vector result; diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index a8720ff4bfb5c7fa7aee6d23949b030c328b90e6..b075cde3212b0434e15ebda0ce45976e2e018c53 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -179,7 +179,7 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const { ir::Node* var = nullptr; auto updated_var = UpdateGradVarDesc(in_node->Var(), i, grad_names, bn_vars_need_rename); - // should be initialized by startup, how to initilize tensor in the + // should be initialized by startup, how to initialize tensor in the // scope? if (node->Name() == "batch_norm" && bn_vars_need_rename.find(in_node->Name()) != diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 79b50993556d40579714426b8cb333345dcd2fa9..935931b8150373c7cc26793252918bb098ed55df 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -1041,7 +1041,7 @@ void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { // There are 4 conditions: // 1. GPU && Reduce: Reduce gradient then broadcast gradient to other GPUS. // Need to broadcast received parameters to other GPU. - // 2. GPU && AllReduce: AllReduce all graident to each GPU. Need to + // 2. GPU && AllReduce: AllReduce all gradient to each GPU. Need to // broadcast received parameters to other GPU. // 3. CPU && AllReduce: AllReduce all gradient to each thread. Need to // broadcast received parameters to other scope. 
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 87a99afc9ae05050f5b2a66bfa86817ce4a07b9b..cbb2c79c5c47d82b576b18a1790fb8f68391acd6 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -80,7 +80,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { PADDLE_ENFORCE_EQ( in_var_names.size(), out_var_names.size(), platform::errors::PreconditionNotMet( - "Op [%s]: Input var number shoule be equal with output var number", + "Op [%s]: Input var number should be equal to output var number", op_.Type())); for (size_t i = 0; i < in_var_names.size(); ++i) { @@ -663,7 +663,7 @@ void OpDesc::Flush() { void OpDesc::CheckAttrs() { PADDLE_ENFORCE(!Type().empty(), - "CheckAttr() can not be called before type is setted."); + "CheckAttr() can not be called before type is set."); auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); if (checker == nullptr) { // checker is not configured. That operator could be generated by Paddle, @@ -706,7 +706,7 @@ void OpDesc::InferShape(const BlockDesc &block) const { void OpDesc::InferVarType(BlockDesc *block) const { // There are a few places that var type can be set. // When VarDesc is created, default set to LOD_TENSOR. - // When output variable is created, default is defaut set to LOD_TENSOR. + // When output variable is created, it is by default set to LOD_TENSOR. // We limit here to be the only place that operator defines its customized // var type inference. Hence, we don't do any "default" setting here. auto &info = OpInfoMap::Instance().Get(this->Type()); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index df773c044bf118e2b381c4281c8e5039360aeef9..6ffe3d87136c483d3930b45b01ed4388d09e118e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -654,7 +654,7 @@ class RuntimeInferShapeContext : public InferShapeContext { PADDLE_ENFORCE_EQ( in_var_list.size(), out_var_list.size(), platform::errors::PreconditionNotMet( - "Op [%s]: Input var size should be equal with ouput var size", + "Op [%s]: Input var size should be equal to output var size", op_.Type())); auto& out_var_names = op_.Outputs(out); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index f30620a0a7ff798b50eef23b2c2551f218f5a7fb..97d2dad06c8a0999d232ecaa2cc85b108b607c36 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -53,8 +53,8 @@ constexpr char kEmptyVarName[] = "@EMPTY@"; constexpr char kTempVarName[] = "@TEMP@"; /// If a variable's name has a certain suffix, it means that the -/// variable is the gradient of another varibale. -/// e.g. Variable "x@GRAD" is the gradient of varibale "x". +/// variable is the gradient of another variable. +/// e.g. Variable "x@GRAD" is the gradient of variable "x". 
constexpr char kGradVarSuffix[] = "@GRAD"; constexpr size_t kGradVarSuffixSize = 5U; diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index 7bed06b0a3d50a0e460565923e9faef835141f64..77c98a08cf03496b2d1375eb216560734e241858 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -340,7 +340,7 @@ class IndicateLoDTensorDataTypeTestProtoMaker : public OpProtoAndCheckerMaker { public: void Make() { AddInput("LoDTensor", "Input of Tensor type Variable."); - AddComment("This Op is only for IndicateVarDataType inferface test."); + AddComment("This Op is only for IndicateVarDataType interface test."); } }; @@ -362,7 +362,7 @@ class IndicateSelectedRowsDataTypeTestProtoMaker public: void Make() { AddInput("SelectedRows", "Input of SelectedRows type Variable."); - AddComment("This Op is only for IndicateVarDataType inferface test."); + AddComment("This Op is only for IndicateVarDataType interface test."); } }; @@ -382,7 +382,7 @@ class IndicateOtherDataTypeTestProtoMaker : public OpProtoAndCheckerMaker { public: void Make() { AddInput("Other", "Input of Other type Variable"); - AddComment("This Op is only for IndicateVarDataType inferface test."); + AddComment("This Op is only for IndicateVarDataType interface test."); } }; @@ -572,7 +572,7 @@ class GetSetLoDLevelTestMaker : public OpProtoAndCheckerMaker { void Make() { AddInput("X", "(LoDTensor) Input Variable."); AddOutput("Out", "(LoDTensor) Output Variable."); - AddComment("This Op is only for Get/SetLoDLevel inferface test."); + AddComment("This Op is only for Get/SetLoDLevel interface test."); } }; diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index 699e9eb01de1360373f08a309b07e27534a6b6db..22f17d440e5d68dc47d2aa9a638c8b01b45a2343 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -112,7 +112,7 @@ void RenameAndGetOutputs( std::unordered_map *output_name_map, const std::unordered_map &graph_var_map, bool trt_and_not_int8) { - //// In the normal case, the paddle-trt exists bug when runing the googlenet. + //// In the normal case, the paddle-trt exists bug when running the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the // paddle-tensorrt will do the merging optimization, which fuse those conv // into one conv, and then trigger bug. So, We should use strategy to avoid diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 397411ccf872a1d36ff817b1d7dada0cab91e00a..2b6418bbf8ab43e0c9b429e2a40bb8f53157e683 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -223,7 +223,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( auto use_static_engine = Get("use_static_engine"); // TODO(NHZlX) // There are models with the same structure but the different parameters, - // when runing in the 'use_serialize' mode, there is a bug. + // when running in the 'use_serialize' mode, there is a bug. 
auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, std::to_string(0)); auto predictor_id = Get("predictor_id"); diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 8b379457a2d031dbe859562c1a8dade0badc56c2..c497ab384b5fac74b5241d61517485fd8f2b40c4 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -137,7 +137,7 @@ std::unique_ptr Load(framework::Executor* executor, "model version %ld is not supported.", main_program->Version()); - // model_from_memory is false in seperate parameters. + // model_from_memory is false in separate parameters. LoadPersistables(executor, scope, *main_program, dirname, "", false /* model_from_memory */); return main_program; diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 97affafb4bffd20a52199bdd80affc235319f5f4..3c48c8192f6b06e5a0ba005738383b46bc550ecb 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -101,7 +101,7 @@ class TRTConvertValidation { DeclVar(name, dim_vec); } - // Declare a parameter varaible in the scope. + // Declare a parameter variable in the scope. void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) { DeclVar(name, dims, true); } diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 771ad702448a40b1e03430ae12398dcb86540e5a..e7f7a842cf5725d4c83f1c4b8205ba32515a79fd 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -104,10 +104,9 @@ void TensorRTEngine::FreezeNetwork() { for (auto &t : all_t) { if (!quant_dynamic_range_.count(t)) { - VLOG(3) - << "We are in trt int8 mode(not calibration), scale not setted" - << " for tensor " << t->getName() - << ", this might be ok when trt does not need this range"; + VLOG(3) << "We are in trt int8 mode(not calibration), scale not set" + << " for tensor " << t->getName() + << ", this might be ok when trt does not need this range"; } } std::unordered_set all_out_t_name; diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 4f656ab165fea82f0545700f462470550dec1d95..402815c7e6326aec872e1b086bd10db429b5a724 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -172,7 +172,7 @@ class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { "(std::vector) A vector of tensors that is going to " "be casted to a big LoDTensor."); AddInput("RankTable", - "(LoDRankTable) RankTable provides the coarse lod infomation to " + "(LoDRankTable) RankTable provides the coarse lod information to " "build the output LoDTensor. 
See " "'paddle/framework/lod_rank_table.h' for more details."); AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array."); diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index 5b3dbcd65ec58e45a964cc170aefbbadf258813a..e2cde218db3438df3e50bdf5cf78ec123d92367d 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -132,7 +132,7 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), The accumulating times of previous window with " "shape [1]."); AddInput("in_num_updates", - "(Tensor), The total number of batches used by trainning " + "(Tensor), The total number of batches used by training " "before this batch with shape [1]."); AddOutput("out_sum_1", @@ -155,10 +155,9 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { "out_old_num_accumulates", "(Tensor) The accumulating times of previous window with " "shape [1]."); - AddOutput( - "out_num_updates", - "(Tensor), The total number of batches used by trainning " - "before this batch with shape [1]."); + AddOutput("out_num_updates", + "(Tensor), The total number of batches used by training " + "before this batch with shape [1]."); AddAttr("average_window", "(float, default 0) " diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h index 8e2f25dfcf51e275e935de9b68b3313e395ae841..8f6c9b60dcad570094b53e49b10420dca456b90d 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.h +++ b/paddle/fluid/operators/bilinear_tensor_product_op.h @@ -49,7 +49,7 @@ class BilinearTensorProductKernel : public framework::OpKernel { auto& place = *ctx.template device_context().eigen_device(); auto& dev_ctx = ctx.template device_context(); - // Create the intermediate variable to caculate the result of + // Create the intermediate variable to calculate the result of // Input(X) multiplied by Input(Weight_i), the formula is: // left_mul = X Weight_i. Tensor left_mul; diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 81f099d7c1cb8f977e51c952c9a418842d23359c..39ba1054740a2dc59dc092cedd29df3c1af74628 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -267,7 +267,7 @@ void Conv2DTransposeOpMaker::Make() { "workspace is a section of GPU memory which will be " "allocated/freed each time the operator runs, larger " "workspace size can increase performance but also requires " - "better hardward. This size should be carefully setted.") + "better hardware. This size should be carefully set.") .SetDefault(platform::GetDefaultConvWorkspaceSizeLimitMB()); AddComment(R"DOC( Convolution2D Transpose Operator. @@ -368,7 +368,7 @@ void Conv3DTransposeOpMaker::Make() { "workspace is a section of GPU memory which will be " "allocated/freed each time the operator runs, larger " "workspace size can increase performance but also requires " - "better hardward. This size should be carefully setted.") + "better hardware. This size should be carefully set.") .SetDefault(platform::GetDefaultConvWorkspaceSizeLimitMB()); AddComment(R"DOC( Convolution3D Transpose Operator. 
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index fc73f938a9b2df5c2594c3e65a0ebb87b01cd459..5626d2bf655c2f376301c44f541c99a6257ca957 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -36,7 +36,7 @@ class CropOp : public framework::OperatorWithKernel { auto shape = ctx->Attrs().Get>("shape"); PADDLE_ENFORCE_EQ( int64_t(shape.size()), x_dim.size(), - "Shape size should be equal to dimention size of input tensor."); + "Shape size should be equal to dimension size of input tensor."); std::vector tensor_shape(shape.size()); for (size_t i = 0; i < shape.size(); ++i) { tensor_shape[i] = static_cast(shape[i]); } diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc index 83047ac8850a9880839f47ef353c6667b926a27d..5a06c50c89f0a2b35ec7057a06bf5188047f55d0 100644 --- a/paddle/fluid/operators/crop_tensor_op.cc +++ b/paddle/fluid/operators/crop_tensor_op.cc @@ -82,7 +82,7 @@ class CropTensorOp : public framework::OperatorWithKernel { } PADDLE_ENFORCE_EQ(int64_t(shape.size()), x_dim.size(), "Attr(shape)'size of Op(crop_tensor) should be equal to " - "dimention size of input tensor."); + "dimension size of input tensor."); std::vector out_shape(shape.size(), -1); for (size_t i = 0; i < shape.size(); ++i) { if (shape[i] > 0) { diff --git a/paddle/fluid/operators/crop_tensor_op.h b/paddle/fluid/operators/crop_tensor_op.h index 5337510b4f027f225fadaa3d0936e7bc5ba8b5fe..4c6b70d889a5fcc0f50a5ba8684fc222b25a3556 100644 --- a/paddle/fluid/operators/crop_tensor_op.h +++ b/paddle/fluid/operators/crop_tensor_op.h @@ -157,7 +157,7 @@ void CropTensorFunction(const framework::ExecutionContext& context) { // get shape from Input(ShapeTensor) of Input(Shape) std::vector shape = GetShape(context); - // out_dims setted by arrt(shape) + // out_dims set by Attr(shape) if (shape.size() == 0) { for (int i = 0; i < out_dims.size(); ++i) { shape.push_back(out_dims[i]); diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index ab5d45b800de1f268165a5505c8b7f3e637ecd91..0553619a8b42fd711a3d25f76666dcc8c7c64c4d 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -203,7 +203,7 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "represents the cross entropy loss."); AddAttr("soft_label", "(bool, default false), a flag indicating whether to " - "interpretate the given labels as soft labels.") + "interpret the given labels as soft labels.") .SetDefault(false); AddAttr("ignore_index", "(int, default -100), Specifies a target value that is" diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc index be386a8eb8403394a05f59e50a30788575be1d88..8af29133f1a2b1f790a451ac46932bb8f26b5989 100644 --- a/paddle/fluid/operators/ctc_align_op.cc +++ b/paddle/fluid/operators/ctc_align_op.cc @@ -63,7 +63,7 @@ class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker { "sequence in Output.") .AsDispensable(); AddAttr("blank", - "(int, default: 0), the blank label setted in Connectionist " + "(int, default: 0), the blank label set in Connectionist " "Temporal Classification (CTC) op.") .SetDefault(0); AddAttr("merge_repeated", diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 89cd5c697743e61fa945680cf84d81aea74ecfa9..835bfcc484db25377fe84db558226e2896dc3188 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ 
b/paddle/fluid/operators/cumsum_op.cc @@ -33,8 +33,8 @@ class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of cumsum operator"); AddOutput("Out", "Output of cumsum operator"); AddAttr("axis", - "The dimenstion to accumulate along. -1 means the last " - "dimenstion [default -1].") + "The dimension to accumulate along. -1 means the last " + "dimension [default -1].") .SetDefault(-1) .EqualGreaterThan(-1); AddAttr("exclusive", diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc index e76649e828369c66702d08aced51e7edf881579a..e95fb0d45cc835d36791f7cba9ae364aa743441d 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc @@ -67,7 +67,7 @@ class DeformablePSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "the number of groups which input channels are divided." "(eg.number of input channels is k1*k2*(C+1), which k1 and k2 " "are group width and height and C+1 is number of output " - "chanels. eg.(4, 6), which 4 is height of group and 6 is " + "channels. eg.(4, 6), which 4 is height of group and 6 is " "width of group"); AddAttr("pooled_height", "(int), " diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 070c822a7edbe346000e6f5f9b76f67686114b59..28705a7a2bf6c3a0a2606e13ab0abb8670c9ba49 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -117,7 +117,7 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { .InEnum({"encode_center_size", "decode_center_size"}); AddAttr("box_normalized", "(bool, default true) " - "whether treat the priorbox as a noramlized box") + "whether treat the priorbox as a normalized box") .SetDefault(true); AddAttr("axis", "(int, default 0)" @@ -140,7 +140,7 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { "box_coder_op with shape [N, M, 4] representing the result of N " "target boxes encoded with M Prior boxes and variances. When " "code_type is 'decode_center_size', N represents the batch size " - "and M represents the number of deocded boxes."); + "and M represents the number of decoded boxes."); AddComment(R"DOC( diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index db69cf0301c2eb940418a74f1332a35457e560fd..16753c429455a38e3a0ccd158ebd73559b8bf136 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -403,7 +403,7 @@ class GenerateMaskLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "each element is a bounding box with (xmin, ymin, xmax, ymax) format."); AddInput("LabelsInt32", "(LoDTensor), This intput is a 2D LoDTensor with shape [R, 1], " - "each element repersents a class label of a roi"); + "each element represents a class label of a roi"); AddOutput( "MaskRois", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. 
" @@ -411,7 +411,7 @@ class GenerateMaskLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); AddOutput("RoiHasMaskInt32", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], " - "each element repersents the output mask rois index with regard " + "each element represents the output mask rois index with regard " "to input rois"); AddOutput("MaskInt32", "(LoDTensor), This output is a 4D LoDTensor with shape [P, Q], " diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index b8195fbcc03be1a246a4a5099a7d7f49e17e9593..79780e0d4eeadb537dfe8975a8e61aba19adb7ca 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -521,11 +521,11 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); AddOutput("LabelsInt32", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], " - "each element repersents a class label of a roi"); + "each element represents a class label of a roi"); AddOutput("BboxTargets", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " "class_nums], " - "each element repersents a box label of a roi"); + "each element represents a box label of a roi"); AddOutput( "BboxInsideWeights", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc index 55012556e239b5a11adcc8393a20c180c49b1faa..6f8a8b0a08516a4df0b4064e2231094cb0f19051 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op.cc @@ -63,7 +63,7 @@ class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker { "bottom coordinate of the box."); AddAttr("box_normalized", "(bool, default true) " - "whether treat the priorbox as a noramlized box") + "whether treat the priorbox as a normalized box") .SetDefault(true); AddOutput("Out", "(LoDTensor, the lod is same as input X) The output of " diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index ee0708312ddbbe1b4390e3a96e533c2aa053e60a..36e9d6028015f9e3b54f455c07fdbabaad2e8cd4 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ -393,7 +393,7 @@ class LocalityAwareNMSOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nms_top_k", "(int64_t) " "Maximum number of detections to be kept according to the " - "confidences aftern the filtering detections based on " + "confidences after the filtering detections based on " "score_threshold"); AddAttr("nms_threshold", "(float, default: 0.3) " diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 62d6bb3ac15809919157f228ae058c68dd5355f2..9cdc46b4a26efe1830ebf7057cd23a1428c582be 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -424,7 +424,7 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nms_top_k", "(int64_t) " "Maximum number of detections to be kept according to the " - "confidences aftern the 
filtering detections based on " + "confidences after the filtering detections based on " "score_threshold"); AddAttr("nms_threshold", "(float, default: 0.3) " diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc index 7c187066c666f45ec5954d25280f716e32aa964b..3c02796de01e395659f5036d9f1311bcac68366d 100644 --- a/paddle/fluid/operators/detection/target_assign_op.cc +++ b/paddle/fluid/operators/detection/target_assign_op.cc @@ -44,7 +44,7 @@ class TargetAssignOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(neg_dims.size(), 2, "The rank of Input(NegIndices) must be 2."); PADDLE_ENFORCE_EQ(neg_dims[1], 1, - "The last dimenstion of Out(NegIndices) must be 1."); + "The last dimension of Out(NegIndices) must be 1."); } auto n = mi_dims[0]; diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 5ffcfc0458f30dea2987b3b6ee27a1aa91486e21..dc7c465aae7451a5c82fec352d19194257f74da1 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -111,15 +111,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of YOLOv3 loss operator, " "This is a 4-D tensor with shape of [N, C, H, W]." - "H and W should be same, and the second dimention(C) stores" + "H and W should be same, and the second dimension(C) stores" "box locations, confidence score and classification one-hot" "keys of each anchor box"); AddInput("GTBox", "The input tensor of ground truth boxes, " "This is a 3-D tensor with shape of [N, max_box_num, 5], " "max_box_num is the max number of boxes in each image, " - "In the third dimention, stores x, y, w, h coordinates, " - "x, y is the center cordinate of boxes and w, h is the " + "In the third dimension, stores x, y, w, h coordinates, " + "x, y is the center coordinate of boxes and w, h is the " "width and height and x, y, w, h should be divided by " "input image height to scale to [0, 1]."); AddInput("GTLabel", diff --git a/paddle/fluid/operators/elementwise/test_elementwise_mul_op_dim.cc b/paddle/fluid/operators/elementwise/test_elementwise_mul_op_dim.cc index 4477aa0a3f8acd7e7c1d186ef2c3648f71fa2bd0..7443c142d0f7cf8dc83680791474804dedf8c738 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_mul_op_dim.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_mul_op_dim.cc @@ -79,7 +79,7 @@ TEST(ElementwiseMulOpTester, correct_dims) { MainTest(test_data); } -// Checks if AreDimsAndFormatCorrect fails when channel_num is not divisable by +// Checks if AreDimsAndFormatCorrect fails when channel_num is not divisible by // 16 TEST(ElementwiseMulOpTester, incorrect_channel_num) { TestData test_data; diff --git a/paddle/fluid/operators/fused/fusion_group_op.cc b/paddle/fluid/operators/fused/fusion_group_op.cc index 5880c3b317e6d5ed2a5b5cef80186928755e746a..503c0355855e4d8e0b258dedd98f4422c3795e66 100644 --- a/paddle/fluid/operators/fused/fusion_group_op.cc +++ b/paddle/fluid/operators/fused/fusion_group_op.cc @@ -76,7 +76,7 @@ class FusionGroupOpMaker : public framework::OpProtoAndCheckerMaker { fusion_group Operator. It is used to execute a generated CUDA kernel which fuse the computation of -multiple operators into one. It supports serveral types: +multiple operators into one. It supports several types: 0, fused computation of elementwise operations in which all the dims of inputs and outputs should be exactly the same. 
)DOC"); diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 71202b26443fece8985e5d043b9c68d5760a08c2..17cb4556d45ef3adee2adc0d2f19ea048e096982 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -76,7 +76,7 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { } } - // Since concat is aftern flatten, the output is 2D tensor. + // Since concat is after flatten, the output is 2D tensor. // If concat_axis is 0, each input's permutated tensor is continuous. // If concat_axis is 1, the stride of 0-th dim of each input's // permutated tensor is odims()[1]. diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 244de0e355222d34a63e9f9c453552adcdc86080..58476efa976e1e74d153aa2d93bd066a41e8285e 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -84,7 +84,7 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { "Grid", "(Tensor) The input grid of GridSampleOp generated by AffineGridOp, " "This is a 4-D tensor with shape of [N, H, W, 2] is the concatenation " - "of x and y coordinates with shape [N, H, W] in last dimention"); + "of x and y coordinates with shape [N, H, W] in last dimension"); AddOutput("Output", "(Tensor) Output tensor with shape [N, C, H, W]"); AddAttr( "use_cudnn", @@ -93,11 +93,11 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operation samples input X by using bilinear interpolation based on - flow field grid, which is usually gennerated by affine_grid. The grid of + flow field grid, which is usually generated by affine_grid. The grid of shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates with shape [N, H, W] each, where grid_x is indexing the 4th dimension - (in width dimension) of input data x and grid_y is indexng the 3rd - dimention (in height dimension), finally results is the bilinear + (in width dimension) of input data x and grid_y is indexing the 3rd + dimension (in height dimension), finally results is the bilinear interpolation value of 4 nearest corner points. Step 1: diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index da413dba6460af1f2376f491594a430afae0b1b0..114cc64edde728e7041ad8487429da4350f75268 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -113,7 +113,7 @@ class GRUOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddOutput( "BatchResetHiddenPrev", - "(LoDTensor) The reseted hidden state LoDTensor organized in batches. " + "(LoDTensor) The reset hidden state LoDTensor organized in batches. 
" "This LoDTensor is a matrix with shape (T X D) and has the same LoD " "with `BatchGate`.") .AsIntermediate(); diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc index c5f7f7b3ff43f65ce62bb0f41408735d6f9399df..038ce5a7ae8c3033638ecd1bd848c4bc43ce44d1 100644 --- a/paddle/fluid/operators/gru_unit_op.cc +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -97,7 +97,7 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddOutput("ResetHiddenPrev", "(Tensor) Matrix with shape [batch_size, frame_size] for the " - "reseted hidden state of previous time step.") + "reset hidden state of previous time step.") .AsIntermediate(); AddOutput("Hidden", "(Tensor) The GRU hidden state of the current time step " diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 8028b20e06da18cacf008cd3aeaa712f9a5ee589..bed9b81587976d364309a42b4239fb2f15325257 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -144,7 +144,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddOutput( "W_Out", - "(LoDTensor, optinal) using input 'W' as Output to make it mutable" + "(LoDTensor, optional) using input 'W' as Output to make it mutable" "When we are using prefetch") .AsIntermediate(); AddAttr("num_classes", "(int, optional), The number of classes") diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index cc753b1f91fd851514b21bdb260a569fa4576af5..c68e97cf98a62f9079724616b709651165dbafe9 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -285,7 +285,7 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { interpolation. Nearest neighbor interpolation is to perform nearest neighbor interpolation - in both the 3rd dimention(in height direction) and the 4th dimention(in width + in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. Bilinear interpolation is an extension of linear interpolation for @@ -299,7 +299,7 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. - Align_corners and align_mode are optinal parameters,the calculation method + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. Example: diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 099003586d7cb6af311dd434c1920c038ee18480..f9be26a6ee88d8484ddb45ac623170261e51a4cc 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -296,7 +296,7 @@ $$ Function implementation: -Inputs and outpus are in NCHW or NHWC format, while input.shape.ndims() equals 4. +Inputs and outputs are in NCHW or NHWC format, while input.shape.ndims() equals 4. If NCHW, the dimensions 0 ~ 3 represent batch size, feature maps, rows, and columns, respectively. 
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index c399cb5d44aaa50fab00fd170c021c8c70eee990..410adc7b283a543dd2a4e2a357050d224fc15362 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -105,7 +105,7 @@ class SimpleCode { SimpleCode(size_t code, size_t num_classes, const int64_t* ids) : c_(static_cast(ids[code]) + num_classes) {} /** - * Here the id of root shoud be 1 rather than 0, thus the encoding of class c + * Here the id of root should be 1 rather than 0, thus the encoding of class c * is `c + num_classes` and all siblings can get the same weight indice using * prefixes. * Weight index is the prefixes of encoding, thus leave out the right most diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 4fafe439edc5b3a89cc78dc66b874175ccbcf607..c805d5419795c4809db0bf1718d57d0b75d5b769 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -129,19 +129,19 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { "CustomDistProbs", "(Tensor) It is used in 'CostumDist' sampler. " "It is a tensor with shape [num_total_classes]." - "The i-th element is the probsbility of the i-th class being sampled.") + "The i-th element is the probability of the i-th class being sampled.") .AsDispensable(); AddInput( "CustomDistAlias", "(Tensor) It is used in 'CostumDist' sampler. " "It is a tensor with shape [num_total_classes]." - "The i-th element is the probsbility of the i-th class being sampled.") + "The i-th element is the probability of the i-th class being sampled.") .AsDispensable(); AddInput( "CustomDistAliasProbs", "(Tensor) It is used in 'CostumDist' sampler. " "It is a tensor with shape [num_total_classes]." 
- "The i-th element is the probsbility of the i-th class being sampled.") + "The i-th element is the probability of the i-th class being sampled.") .AsDispensable(); AddOutput("Cost", diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 4fb4f7a022f6bd8ff0843b62d735a3535ae77d81..160905834ae7f31b1d14cc6056d388f05d99cc44 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -36,7 +36,7 @@ class PadConstantLikeOp : public framework::OperatorWithKernel { auto y_dim = ctx->GetInputDim("Y"); PADDLE_ENFORCE_EQ(x_dim.size(), y_dim.size(), - "The dimention of X and Y should be the same."); + "The dimension of X and Y should be the same."); for (int i = 0; i < x_dim.size(); ++i) { if ((!ctx->IsRuntime()) && ((x_dim[i] == -1) || (y_dim[i] == -1))) { @@ -164,7 +164,7 @@ class PadConstantLikeOpGrad : public framework::OperatorWithKernel { auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ(dout_dim.size(), y_dim.size(), - "The dimention of X and Y should be the same."); + "The dimension of X and Y should be the same."); auto y_grad_name = framework::GradVarName("Y"); if (ctx->HasOutput(y_grad_name)) { diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index caf6892a9879e0e9fe10610a57cbadef108f0f63..e6cafb2584e46216d3f11d4feefe695f8bae09b6 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -325,7 +325,7 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_EQ(rois->lod().empty(), false, platform::errors::InvalidArgument( - "the lod of Input ROIs shoule not be empty when " + "the lod of Input ROIs should not be empty when " "BatchRoINums is None!")); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index 25f45d0b2c978df8629b590974a606cd492802b2..5ec846c147373911281b04c79dc32a43171ddca0 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -293,7 +293,7 @@ class CPUPRROIPoolOpKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_EQ(rois->lod().empty(), false, platform::errors::InvalidArgument( - "the lod of Input ROIs shoule not be empty when " + "the lod of Input ROIs should not be empty when " "BatchRoINums is None!")); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 3da7c88f1fe001a8a2950edbedcf3582150f08ff..8a06f011a022e35a8d5c98be366587acdfa90a3c 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -24,8 +24,8 @@ namespace operators { // Returns true if the two dimensions are compatible. // A dimension is compatible with the other if: // 1. The length of the dimensions are same. -// 2. Each non-negative number of the two dimentions are same. -// 3. For negative number in a dimention, it means unknown so it is compatible +// 2. Each non-negative number of the two dimensions are same. +// 3. For negative number in a dimension, it means unknown so it is compatible // with any number. 
bool DimensionIsCompatibleWith(const framework::DDim& first, const framework::DDim& second) { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index de021c8b455bc5a31449662326b3e365c4dd014c..d17e6b65cdf6e0bee70ce02ae8103e135980b200 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -174,7 +174,7 @@ class ReduceOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT( dims.size(), 0, "ShapeError: The input dim dimensions of Reduce " - "shoud be greater than 0. But received the dim dimesions of Reduce " + "should be greater than 0. But received the dim dimensions of Reduce " " = %d", dims.size()); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 7516909f4516b711bd712a5f54489423ca746dbd..cc4635fefdda37947b79b3798f53379bb8f2e0df 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -162,7 +162,7 @@ class ReshapeOp : public framework::OperatorWithKernel { shape[i], 0, platform::errors::InvalidArgument( "Each dimension value of 'shape' in ReshapeOp must not " - "be negtive except one unknown dimension. " + "be negative except one unknown dimension. " "But received shape = [%s], shape[%d] = %d.", framework::make_ddim(shape), i, shape[i])); } @@ -234,7 +234,7 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor, optional). Target shape of reshape operator. " "It has a higher priority than Attr(shape) but a lower priority " "than Input(ShapeTensor). The Attr(shape) still should be " - "set correctly to gurantee shape inference in compile time.") + "set correctly to guarantee shape inference in compile time.") .AsDispensable(); AddInput( "ShapeTensor", @@ -288,7 +288,7 @@ dimension value will be copied from Input(X) at runtime. Note that the index of [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. 3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while -Attr(shape) still should be set correctly to gurantee shape inference in +Attr(shape) still should be set correctly to guarantee shape inference in compile-time. )DOC"); diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 12dadc4eb9288e334dfc90b83f4ccd5658abf565..9d82017bfd2c8a14ebe46dcab1d1b665bfa5044b 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -86,7 +86,7 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Updates", "The updated value of scatter op"); AddOutput("Out", "The output of scatter op"); AddAttr("overwrite", - "(bool, defalut: True) " + "(bool, default: True) " "The mode that updating the output when has same index," "If True, use the overwrite mode to update the output" "of the same index, if False, use the accumulate mode to" diff --git a/paddle/fluid/operators/select_input_op.cc b/paddle/fluid/operators/select_input_op.cc index 33a5ff99a5d984d2327aac6b05421891f6c05e14..be0d8a138490a3f6106f37264fe84a9b8a5e38e3 100644 --- a/paddle/fluid/operators/select_input_op.cc +++ b/paddle/fluid/operators/select_input_op.cc @@ -67,7 +67,7 @@ class SelectInputOpProtoMaker : public framework::OpProtoAndCheckerMaker { // Because this op is blocking whole control flow. I am implementing MVP // (minimal viable product) here. 
AddComment(R"DOC( -Merge branches of LoDTensor into a single Output with a mask interger +Merge branches of LoDTensor into a single Output with a mask integer specifying the output branchi. )DOC"); } diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index 3c6d36a0a613d275a1df9eecae500cb8a2cd0020..df2176429bba759e9040082d3efaeeb3ba1ca4d7 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -118,7 +118,7 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { "sequences before padding."); AddAttr( "padded_length", - "The length of padded sequences. It can be setted to -1 or " + "The length of padded sequences. It can be set to -1 or " "any positive int. When it is -1, all sequences will be padded up to " "the length of the longest one among them; when it a certain positive " "value, it must be greater than the length of the longest original " diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 09dba540282b1b8ab0dfcbfa1160386bcf48c860..10f382c9f062ea8d9004ecc9e0ca569e88c4a235 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -54,7 +54,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp"); AddOutput("Out", "(Tensor) The output of SequencePoolOp does not contain LoD " - "infomation."); + "information."); AddOutput("MaxIndex", "(Tensor) This tensor is used for the sequence max-pooling " "to record the max indexes.") @@ -93,7 +93,7 @@ Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2. Besides, for the sake of simplicity, we assume M=1 and N=1, and the value of X = [[1, 3], [2, 4, 6], [5, 1]]. -Thus, Out is a [3,1,1] Tensor without LoD infomation. +Thus, Out is a [3,1,1] Tensor without LoD information. 
And for different pooltype, the value of Out is as follows: - AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2 diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc index b442c41eed16c49641ef47dc48c3a50316813c2e..38b70d07c542b3ddd6198257284ef48772997fe5 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc @@ -63,7 +63,7 @@ class SequenceTopkAvgPoolingOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput( "Out", "(Tensor) The output of SequenceTopkPoolingOp does not contain LoD " - "infomation."); + "information."); AddOutput("pos", "(Tensor) store the topk index ").AsIntermediate(); AddAttr>("topks", "topks"); AddAttr("channel_num", "channel number"); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 2b3c5a094064048c58844d49d71705cf78aafb72..d7ae82c783b0e2e918011c55b3848b93552df8e3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -96,7 +96,7 @@ class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { [ 6.0, 7.0, 8.0, 9.0, 10.0], [11.0, 12.0, 13.0, 14.0, 15.0]], ` - in which there are 3 sequences padded to length 5, and the acutal length + in which there are 3 sequences padded to length 5, and the actual length specified by Input(Length): Length.data = [2, 3, 4], diff --git a/paddle/fluid/operators/shard_index_op.cc b/paddle/fluid/operators/shard_index_op.cc index a02d03671591d1a859d168a8195fcc6cda0e3434..3c1de753acfdaf3f728f6c2088a06f4a35c69a58 100644 --- a/paddle/fluid/operators/shard_index_op.cc +++ b/paddle/fluid/operators/shard_index_op.cc @@ -63,7 +63,7 @@ class ShardIndexOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nshards", "A positive integer to specify the number of shards."); AddAttr("shard_id", "The current shard id"); - AddAttr("ignore_value", "An ingeter value out of sharded range") + AddAttr("ignore_value", "An integer value out of sharded range") .SetDefault(-1); AddComment(R"DOC( This layer creates the sharded index for input. This layers is used in @@ -80,7 +80,7 @@ to y = x % shard_size if x / shard_size == shard_id else ignore_value We take the distributed one-hot representation to show what this layer is -used for. The distributed one-hot representation is seperated into multiple +used for. The distributed one-hot representation is separated into multiple shards, and each shard is filling zeros except the one with the index inside. In order to create these sharded representation in each trainer, the original index should be recalculated (i.e. sharded) before. 
diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index c9b0795ddb0bf5764a4ca20c5d26445affc257ec..16e1e17ca53a40e84994f9d3dd280777b775ee35 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -73,12 +73,12 @@ class ShrinkRNNMemoryOp : public ArrayOp { class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "(LoDTensor) The RNN step memory to be shrinked."); + AddInput("X", "(LoDTensor) The RNN step memory to be shrunk."); AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN."); AddInput("I", "(LoDTensor) The step index. The RNN step memory 'X' will be " - "shrinked to match the size of the input of the index'th step."); - AddOutput("Out", "(LoDTensor) The shrinked RNN step memory."); + "shrunk to match the size of the input of the index'th step."); + AddOutput("Out", "(LoDTensor) The shrunk RNN step memory."); AddComment(R"DOC( This operator is used to shrink output batch of memory defined in dynamic RNN. diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index ebb299ba1f3742a5d2d6d161e7163a8a73c14a1d..6e91ea44699931f41d6b4ed9faafe5ccb1bc1b54 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -31,7 +31,7 @@ class SoftmaxWithCrossEntropyOpMaker "by softmax."); AddInput( "Label", - "(Tensor) The input tesnor of groud truth label. If :attr:`soft_label` " + "(Tensor) The input tensor of ground truth label. If :attr:`soft_label` " "is set to false, Label is a Tensor in same shape with " "Input(Logits) except the shape in dimension :attr:`axis` as 1. If " "soft_label is set to true, Label is a Tensor in same " @@ -50,7 +50,7 @@ class SoftmaxWithCrossEntropyOpMaker "entropy loss."); AddAttr( "soft_label", - "(bool, default: false), A flag to indicate whether to interpretate " + "(bool, default: false), A flag to indicate whether to interpret " "the given labels as soft labels.") .SetDefault(false); AddAttr( diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 8f725be665b38ac75c6e0e427fef6403fe8cabd0..dbda4b9b7e03a41a9630722dfe82fbde62ee5437 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -100,7 +100,7 @@ where: Therefore, the calculation can be separated into 3 steps: Step 1: row-wise operation to calculate max_i Step 2: row-wise operation to calculate logDiffMaxSum_i -Step 3: caculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i +Step 3: calculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i To save memory, we can share memory among max_i, logDiffMaxSum_i and cross\_entropy_i. 
In this way, the 3 steps should be changed to: diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc index 2cdcaeaf09db55b1ca403e77991375d796707dee..7527424aac5ee13ef17c317b8c9de77021a723e6 100644 --- a/paddle/fluid/operators/spectral_norm_op.cc +++ b/paddle/fluid/operators/spectral_norm_op.cc @@ -93,7 +93,7 @@ class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("U", "The weight_u tensor of spectral_norm operator, " "This can be a 1-D tensor in shape [H, 1]," - "H is the 1st dimentions of Weight after reshape" + "H is the 1st dimension of Weight after reshape" "corresponding by Attr(dim). As for Attr(dim) = 1" "in conv2d layer with weight shape [M, C, K1, K2]" "Weight will be reshape to [C, M*K1*K2], U will" @@ -101,7 +101,7 @@ class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("V", "The weight_v tensor of spectral_norm operator, " "This can be a 1-D tensor in shape [W, 1], " - "W is the 2nd dimentions of Weight after reshape " + "W is the 2nd dimension of Weight after reshape " "corresponding by Attr(dim). As for Attr(dim) = 1 " "in conv2d layer with weight shape [M, C, K1, K2] " "Weight will be reshape to [C, M*K1*K2], V will " diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 0e6781aa1c9df0cf3b212f35994f2885f0f4edc3..3a33c8be101ecc4323e34d7aa2ed514beac90e92 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -276,7 +276,7 @@ class TensorRTEngineOp : public framework::OperatorBase { "size(%d).\n" "There are two possible causes for this problem: \n" "1. Check whether the runtime batch is larger than the max_batch " - "setted by EnableTensorrtEngine()\n" + "set by EnableTensorrtEngine()\n" "2. Check whether the model you are running has multiple trt " "subgraphs: \n " "\tIf there are multiple trt subgraphs, you need to ensure that " diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 6f0f523001905d53568bcd77c489003ebb75bf0c..394a89a0c07644f718141ee4d17bcd979874d758 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -51,7 +51,7 @@ class UnfoldOpMaker : public framework::OpProtoAndCheckerMaker { This Operator is used to extract sliding local blocks from a batched input tensor, also known as im2col when operated on batched 2D image tensor. For each block under the convolution filter, -all element will be rearranged as a column. While the convolution filter silding over the input +all element will be rearranged as a column. While the convolution filter sliding over the input feature map, a series of such columns will be formed. )DOC"); } diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 566fe662156dcef3ccd8c48dad6c7c85c1760bb6..df617742317b918e37b681dca7d9b6817ffe3d4c 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -177,7 +177,7 @@ class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { "according to " "this given shape. 
It means that it has a higher priority than " "the shape attribute, while the shape attribute still should be " - "set correctly to gurantee shape inference in compile time.") + "set correctly to guarantee shape inference in compile time.") .AsDispensable(); AddInput("ShapeTensorList", "(vector> or vector>, optional). " diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index 0a0f7af6d9eb41e8d98078e8efaa0ec980c55eeb..543a6fb73d647f2517d220f1619121774c15e973 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -153,7 +153,7 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_LT(static_cast(axes.size()), 6, "Invalid dimensions, dynamic dimensions should be " "within [1, 6] dimensions (Eigen limit)."); - // Validity Check: the range of unsqueeze aixs. + // Validity Check: the range of unsqueeze axis. for (int axis : axes) { PADDLE_ENFORCE_LT(axis, 6, "Invalid dimensions, input axis should be" diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index 04217e0ff200d9cc68e9aad1549d7c6693e9361a..d52889e0e54b9218c71c2a57f5d761c94b8a12a4 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -123,10 +123,10 @@ An operator integrating the open-source https://arxiv.org/pdf/1512.02595v1.pdf), to compute Connectionist Temporal Classification (CTC) loss. It can be aliased as softmax with ctc, since a native softmax activation is -interated to the warp-ctc library, to to normlize values for each row of the +integrated into the warp-ctc library, to normalize values for each row of the input tensor. -More detail of CTC loss can be found by refering to +More detail of CTC loss can be found by referring to [Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks]( http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf). diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 74ab56c07cf695effb678925f7c91c41b2640cd5..609bc4245e99751492254a95b9f7db9cf95a3572 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -50,7 +50,7 @@ void PrintCuptiHint() { static bool showed = false; if (showed) return; showed = true; - LOG(WARNING) << "Invalid timestamp occured. Please try increasing the " + LOG(WARNING) << "Invalid timestamp occurred. Please try increasing the " "FLAGS_multiple_of_cupti_buffer_size."; } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index dc1a10418392d51982b14f40e3789359a09da3a4..beeae143f5fc18ea76b40bf5115dd18e21dcbda2 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -226,7 +226,7 @@ void BindImperative(py::module *m_ptr) { BackwardStrategy is a descriptor of how to run the backward process.
**Note**: - **This API is only avaliable in** `Dygraph <../../user_guides/howto/dygraph/DyGraph.html>`_ **Mode** + **This API is only available in** `Dygraph <../../user_guides/howto/dygraph/DyGraph.html>`_ **Mode** Attribute: **sort_sum_gradient**: @@ -339,7 +339,7 @@ void BindImperative(py::module *m_ptr) { }, R"DOC( **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Returns a numpy array shows the value of current :ref:`api_guide_Variable_en` @@ -375,7 +375,7 @@ void BindImperative(py::module *m_ptr) { }, py::return_value_policy::copy, R"DOC( **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Returns a new Variable, detached from the current graph. @@ -402,7 +402,7 @@ void BindImperative(py::module *m_ptr) { .def("clear_gradient", &imperative::VarBase::ClearGradient, R"DOC( **Notes**: - **1. This API is ONLY avaliable in Dygraph mode** + **1. This API is ONLY available in Dygraph mode** **2. Use it only Variable has gradient, normally we use this for Parameters since other temporal Variable will be deleted by Python's GC** diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py index f58491243beab9e4425d7c64e72c3daf8f75324b..22ecfac953fde822b6e0f60036dfed10a2fd2ffe 100644 --- a/python/paddle/dataset/movielens.py +++ b/python/paddle/dataset/movielens.py @@ -224,7 +224,7 @@ def max_job_id(): def movie_categories(): """ - Get movie categoriges dictionary. + Get movie categories dictionary. """ __initialize_meta_info__() return CATEGORIES_DICT diff --git a/python/paddle/dataset/mq2007.py b/python/paddle/dataset/mq2007.py index d5740f30c898d5704636e1de9b2e1137d12e3c35..cfabd09705b37667d05d48ca81d80777257e3b8b 100644 --- a/python/paddle/dataset/mq2007.py +++ b/python/paddle/dataset/mq2007.py @@ -150,7 +150,7 @@ def gen_plain_txt(querylist): gen plain text in list for other usage Paramters: -------- - querylist : querylist, one query match many docment pairs in list, see QueryList + querylist : querylist, one query match many document pairs in list, see QueryList return : ------ @@ -171,7 +171,7 @@ def gen_point(querylist): gen item in list for point-wise learning to rank algorithm Paramters: -------- - querylist : querylist, one query match many docment pairs in list, see QueryList + querylist : querylist, one query match many document pairs in list, see QueryList return : ------ @@ -190,9 +190,9 @@ def gen_pair(querylist, partial_order="full"): gen pair for pair-wise learning to rank algorithm Paramters: -------- - querylist : querylist, one query match many docment pairs in list, see QueryList + querylist : querylist, one query match many document pairs in list, see QueryList pairtial_order : "full" or "neighbour" - there is redudant in all possiable pair combinations, which can be simplifed + there is redundant in all possible pair combinations, which can be simplified gen pairs for neighbour items or the full partial order pairs return : @@ -233,7 +233,7 @@ def gen_list(querylist): gen item in list for list-wise learning to rank algorithm Paramters: -------- - querylist : querylist, one query match many docment pairs in list, see QueryList + querylist : querylist, one query match many document pairs in list, see QueryList return : ------ @@ -268,7 +268,7 @@ def query_filter(querylists): def load_from_text(filepath, shuffle=False, fill_missing=-1): """ - parse data file into querys + parse data file into queries """ prev_query_id = -1 querylists = 
[] diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index b9d46f74fd9d18055886dec6bd70d2be20e37aed..c4be745d524702a635f17ad6126953220a33650e 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -13,18 +13,18 @@ # limitations under the License. """ paddle.distributed.launch is a module that spawns multiple distributed -process on each trainning node for gpu trainning. +process on each training node for gpu training. Usage: In both of single node training or multiple node training, this module launch a process on each of the given gpu card. - 1. for single node trainning with all visible gpu cards: + 1. for single node training with all visible gpu cards: python -m paddle.distributed.launch \ your_training_py (arg1 arg2 and all others) - 2. for single node trainning with [0,4) cards + 2. for single node training with [0,4) cards python -m paddle.distributed.launch --selected_gpus="0,1,2,3" \ your_training_py (arg1 arg2 and all others) - 3. for mulitple node training such as two node:192.168.0.16, 192.168.0.17 + 3. for multiple node training such as two node:192.168.0.16, 192.168.0.17 on 192.168.0.16: python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \ --node_ip=192.168.0.16 \ @@ -114,14 +114,14 @@ POD_IP (current node ip address, not needed for local training) "--selected_gpus", type=str, default=None, - help="It's for gpu trainning and the trainning process will run on the selected_gpus," - "each process is bound to a single GPU. And if it's not setted, this module will use all the gpu cards for training." + help="It's for gpu training and the training process will run on the selected_gpus," - "each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training." ) parser.add_argument( "--log_dir", type=str, - help="The path for each process's log.If it's not setted, the log will printed to default pipe." + help="The path for each process's log. If it's not set, the log will be printed to the default pipe." ) #positional diff --git a/python/paddle/distributed/launch_ps.py b/python/paddle/distributed/launch_ps.py index f8489965e71d4b1457a144568478fe2107e562d2..49b6dccc98e294bf22c6150d91f3647e93060325 100644 --- a/python/paddle/distributed/launch_ps.py +++ b/python/paddle/distributed/launch_ps.py @@ -61,7 +61,7 @@ def parse_args(): "--log_dir", default="logs", type=str, - help="The path for each process's log.If it's not setted, the log will printed to default pipe." + help="The path for each process's log. If it's not set, the log will be printed to the default pipe." ) # positional diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index bbe83493d7ef74c7172c758cdfbe9f3ef0b05a14..7a2f201690848e6a2becb7d4edb59fd753e9cb9b 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -832,7 +832,7 @@ def _append_backward_ops_(block, target_block(Block): the block which is going to hold new generated grad ops no_grad_dict(dict): key(int) block index - val(set) a set of varibale names. These varibales have no gradient + val(set) a set of variable names.
These variables have no gradient grad_to_var(dict)(output argument): key(str): grad variable name val(str): corresponding forward variable name diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 2687690978d7de31db12c0c464e355b195a10f9a..d89b1cb41d8b5056cdfd57010db9791ea0c1ea1e 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -116,7 +116,7 @@ def var_conv_2d(input, """ The var_conv_2d layer calculates the output base on the :attr:`input` with variable length, row, col, input channel, filter size and strides. Both :attr:`input`, :attr:`row`, - and :attr:`col` are 1-level LodTensor. The covolution operation is same as conv2d layer with + and :attr:`col` are 1-level LodTensor. The convolution operation is same as conv2d layer with padding. Besides, input.dims[1] should be 1. .. code-block:: text @@ -133,9 +133,9 @@ def var_conv_2d(input, output.dims = [174, 1] # where 174 = 90 + 84 Args: - input (Variable): The input shoud be 1-level LodTensor with dims[1] equals 1. - row (Variable): The row shoud be 1-level LodTensor to provide height information. - col (Variable): The col shoud be 1-level LodTensor to provide width information. + input (Variable): The input should be 1-level LodTensor with dims[1] equals 1. + row (Variable): The row should be 1-level LodTensor to provide height information. + col (Variable): The col should be 1-level LodTensor to provide width information. input_channel (int): The number of input channel. output_channel (int): The number of output channel. filter_size (int|tuple|None): The filter size. If filter_size is a tuple, @@ -325,9 +325,9 @@ def sequence_topk_avg_pooling(input, row, col, topks, channel_num): Args: input (Variable): The input should be 2D LodTensor with dims[1] equals 1. - row (Variable): The row shoud be 1-level LodTensor to provide the height information + row (Variable): The row should be 1-level LodTensor to provide the height information of the input tensor data. - col (Variable): The col shoud be 1-level LodTensor to provide the width information + col (Variable): The col should be 1-level LodTensor to provide the width information of the input tensor data. topks (list): A list of incremental value to average the topk feature. channel_num (int): The number of input channel. @@ -555,7 +555,7 @@ def multiclass_nms2(bboxes, low confidence score. If not provided, consider all boxes. nms_top_k (int): Maximum number of detections to be kept according to - the confidences aftern the filtering detections based + the confidences after the filtering detections based on score_threshold. nms_threshold (float): The threshold to be used in NMS. Default: 0.3 nms_eta (float): The threshold to be used in NMS. 
Default: 1.0 diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py index 17fe99ca4e6fa09dbb33b8cf2c0072fd71248661..603aa72a5a5f48c8b782bd23d1cee5dafc2d5bf1 100644 --- a/python/paddle/fluid/contrib/layers/rnn_impl.py +++ b/python/paddle/fluid/contrib/layers/rnn_impl.py @@ -181,7 +181,7 @@ def basic_gru(input, sequence_length (Variabe|None): A Tensor (shape [batch_size]) stores each real length of each instance, This tensor will be convert to a mask to mask the padding ids If it's None means NO padding ids - dropout_prob(float|0.0): Dropout prob, dropout ONLY works after rnn output of earch layers, + dropout_prob(float|0.0): Dropout prob, dropout ONLY works after rnn output of each layers, NOT between time steps bidirectional (bool|False): If it is bidirectional batch_first (bool|True): The shape format of the input and output tensors. If true, @@ -411,7 +411,7 @@ def basic_lstm(input, sequence_length (Variabe|None): A tensor (shape [batch_size]) stores each real length of each instance, This tensor will be convert to a mask to mask the padding ids If it's None means NO padding ids - dropout_prob(float|0.0): Dropout prob, dropout ONLY work after rnn output of earch layers, + dropout_prob(float|0.0): Dropout prob, dropout ONLY work after rnn output of each layers, NOT between time steps bidirectional (bool|False): If it is bidirectional batch_first (bool|True): The shape format of the input and output tensors. If true, diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py index 1f7ec69dd7544a2835b9e336491c9d0fa2c76925..b5d85616cf03c7ed56f1e6f03a359e32aacee36d 100644 --- a/python/paddle/fluid/contrib/memory_usage_calc.py +++ b/python/paddle/fluid/contrib/memory_usage_calc.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -This module privides a memory usage calculate function for user. +This module provides a memory usage calculate function for user. The purpose of this API is to allow users to estimate memory usage of a program under a special batch size, then user can set appropriate batch size to fully utilize a GPU. @@ -91,8 +91,9 @@ def memory_usage(program, batch_size): for x in var.shape: if x < 0: if neg_dim_count >= 1: - raise ValueError("Var %s has more than one negtive dim." - % (var_name)) + raise ValueError( + "Var %s has more than one negative dim." % + (var_name)) neg_dim_count += 1 data_count *= batch_size * (-x) else: diff --git a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py index 471a796eb3e0a75a1fa0a9eb28499c9b168a3ee3..807d3c6a43078e1a316a558c13e5e3d375e846db 100644 --- a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py +++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py @@ -147,7 +147,7 @@ class QuantizeTranspiler(object): """Rewrites a training input program in place for simulated quantization. Insert fake quantization and de-quantization ops into program to simulate the error introduced by quantization. And change - the graident ops' input by using the faked quantization weights and + the gradient ops' input by using the faked quantization weights and activation. Since the program is transformed in place, the graph connection will change. 
diff --git a/python/paddle/fluid/contrib/slim/core/compressor.py b/python/paddle/fluid/contrib/slim/core/compressor.py index 0faac37b4937590689e64275840831a88d0eb7c8..6d87a871ed281501ba1a3695c163ddfe5059463e 100644 --- a/python/paddle/fluid/contrib/slim/core/compressor.py +++ b/python/paddle/fluid/contrib/slim/core/compressor.py @@ -302,7 +302,7 @@ class Compressor(object): this optimizer is used to minimize the combined loss of student-net and teacher-net while train_optimizer is used to minimize loss of student-net in fine-tune stage. - search_space(slim.nas.SearchSpace): The instance that define the searching space. It must inherite + search_space(slim.nas.SearchSpace): The instance that define the searching space. It must inherit slim.nas.SearchSpace class and overwrite the abstract methods. log_period(int): The period of print log of training. @@ -551,7 +551,7 @@ class Compressor(object): def run(self): """ - Execute compressiong pass. + Execute compressing pass. """ context = Context( place=self.place, diff --git a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py index 1c01eb82d7f3c1b82615cbd78827ec4e9f7002c4..4a0e8ef005ac34abcab87222f7c3cefc22b75de1 100644 --- a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py +++ b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py @@ -63,7 +63,7 @@ class VarWrapper(object): def shape(self): """ - Get the shape of the varibale. + Get the shape of the variable. """ return self._var.shape @@ -152,13 +152,13 @@ class OpWrapper(object): def inputs(self, name): """ - Get all the varibales by the input name. + Get all the variables by the input name. """ return [self._graph.var(var_name) for var_name in self._op.input(name)] def outputs(self, name): """ - Get all the varibales by the output name. + Get all the variables by the output name. """ return [self._graph.var(var_name) for var_name in self._op.output(name)] @@ -233,7 +233,7 @@ class GraphWrapper(object): """ Whether the given variable is parameter. Args: - var(VarWrapper): The given varibale. + var(VarWrapper): The given variable. """ return isinstance(var._var, Parameter) @@ -241,7 +241,7 @@ class GraphWrapper(object): """ Whether the given variable is persistable. Args: - var(VarWrapper): The given varibale. + var(VarWrapper): The given variable. """ return var._var.persistable @@ -397,7 +397,7 @@ class GraphWrapper(object): """ Get a new graph for training by appending some backward operators and optimization operators. Args: - optimizer: The optimzier used to generate training graph. + optimizer: The optimizer used to generate training graph. place: The place to run the graph. scope: The scope used to run the graph. Some new variable will be added into this scope. no_grad_var_names(list): Names of variables that should be ignored while computing gradients. default: []. diff --git a/python/paddle/fluid/contrib/slim/nas/controller_server.py b/python/paddle/fluid/contrib/slim/nas/controller_server.py index 65cfbd7d86ff3783e358f73fff83d89fd98dc01a..3b5323a3ca42443461dacf1d4df0161ce85aa956 100644 --- a/python/paddle/fluid/contrib/slim/nas/controller_server.py +++ b/python/paddle/fluid/contrib/slim/nas/controller_server.py @@ -27,7 +27,7 @@ _logger = get_logger( class ControllerServer(object): """ - The controller wrapper with a socket server to handle the request of search agentt. + The controller wrapper with a socket server to handle the request of search agent. 
""" def __init__(self, diff --git a/python/paddle/fluid/contrib/slim/prune/auto_prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/auto_prune_strategy.py index f9dce622da2a3d40bd7aac6a13071856089d3b9a..c758c2b3da128f27a7b27b866963a18af7fe4a53 100644 --- a/python/paddle/fluid/contrib/slim/prune/auto_prune_strategy.py +++ b/python/paddle/fluid/contrib/slim/prune/auto_prune_strategy.py @@ -53,7 +53,7 @@ class AutoPruneStrategy(PruneStrategy): metric_name(str): The metric used to evaluate the model. It should be one of keys in out_nodes of graph wrapper. Default: 'top1_acc' pruned_params(str): The pattern str to match the parameter names to be pruned. Default: 'conv.*_weights' - retrain_epoch(int): The training epochs in each seaching step. Default: 0 + retrain_epoch(int): The training epochs in each searching step. Default: 0 uniform_range(int): The token range in each position of tokens generated by controller. None means getting the range automatically. Default: None. init_tokens(list): The initial tokens. None means getting the initial tokens automatically. Default: None. """ diff --git a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py index bbdebf3e5388af1e6f4bf6e50fd9d9aa2d4be1a2..8d9020dd95ede1ca0919e26c0398915e8e021f78 100644 --- a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py +++ b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py @@ -741,7 +741,7 @@ class SensitivePruneStrategy(PruneStrategy): def _format_sensitivities(self, sensitivities): """ - Print formated sensitivities in debug log level. + Print formatted sensitivities in debug log level. """ tb = pt.PrettyTable() tb.field_names = ["parameter", "size"] + [ diff --git a/python/paddle/fluid/contrib/slim/prune/pruner.py b/python/paddle/fluid/contrib/slim/prune/pruner.py index 506b8fbe1de2e0f8a036f591bd2baacd5759c9c8..368e7831b3d07b1e0b88b6996e70e3357288db2f 100644 --- a/python/paddle/fluid/contrib/slim/prune/pruner.py +++ b/python/paddle/fluid/contrib/slim/prune/pruner.py @@ -42,7 +42,7 @@ class StructurePruner(Pruner): pruning_axis(dict): The key is the name of parameter to be pruned, '*' means all the parameters. The value is the axis to be used. Given a parameter - with shape [3, 4], the result of pruning 50% on aixs 1 + with shape [3, 4], the result of pruning 50% on axis 1 is a parameter with shape [3, 2]. criterions(dict): The key is the name of parameter to be pruned, '*' means all the parameters. diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 9edf473546f274d5b8862fe148b45b3c6ed71fa1..fa6a6e60ae36c84f940cd36c68660a221fcbd75e 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -666,10 +666,10 @@ class QuantizationFreezePass(object): quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']): """ The freeze pass is used to adjust the quantize operator order, for example: - 1) `activation -> quant -> dequant -> conv2d` will be freezed into + 1) `activation -> quant -> dequant -> conv2d` will be frozen into `activation -> quant -> conv2d -> dequant` - 2) `weight -> quant -> dequant -> conv2d` will be freezed into `weight -> conv2d`, - and weight will be sacled offline. + 2) `weight -> quant -> dequant -> conv2d` will be frozen into `weight -> conv2d`, + and weight will be scaled offline. 
Args: scope(fluid.Scope): scope is used to get the weight tensor values. @@ -994,8 +994,8 @@ class ConvertToInt8Pass(object): def apply(self, graph): """ - Convert weights' tpye of the graph. After that, the data type of the - graph weigths is int8_t. + Convert weights' type of the graph. After that, the data type of the + graph weights is int8_t. Args: graph(IrGraph): the applied graph. @@ -1065,7 +1065,7 @@ class ConvertToInt8Pass(object): class TransformForMobilePass(object): def __init__(self): """ - This pass is used to convert the freezed graph for paddle-mobile execution. + This pass is used to convert the frozen graph for paddle-mobile execution. """ self._fake_quant_op_names = _fake_quant_op_list self._fake_dequant_op_names = _fake_dequant_op_list diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py index d27b808438d53a004db4e85345a68c35d00fff98..7c7ed0972c7a74fb514bd0e74ea0c9da80e11fda 100644 --- a/python/paddle/fluid/contrib/trainer.py +++ b/python/paddle/fluid/contrib/trainer.py @@ -673,11 +673,11 @@ def save_checkpoint(executor, main_program and then saves these variables to the `checkpoint_dir` directory. - In the training precess, we generally save a checkpoint in each + In the training process, we generally save a checkpoint in each iteration. So there might be a lot of checkpoints in the `checkpoint_dir`. To avoid them taking too much disk space, the `max_num_checkpoints` are introduced to limit the total number of - checkpoints. If the number of existing checkpints is greater than + checkpoints. If the number of existing checkpoints is greater than the `max_num_checkpoints`, oldest ones will be scroll deleted. A variable is a checkpoint variable and will be saved if it meets @@ -689,7 +689,7 @@ def save_checkpoint(executor, Args: executor(Executor): The executor to run for save checkpoint. checkpoint_dir(str): The folder where to save checkpoints. - trainer_id(int): currect trainer id, if id is equal to 0, the trainer + trainer_id(int): current trainer id, if id is equal to 0, the trainer is chief. trainer_args(dict|None): Current training arguments. Such as 'epoch_id' and 'step_id'. @@ -772,7 +772,7 @@ def load_checkpoint(executor, main_program and then try to load these variables from the `checkpoint_dir` directory. - In the training precess, we generally save a checkpoint in each + In the training process, we generally save a checkpoint in each iteration. So there are more than one checkpoint in the `checkpoint_dir` (each checkpoint has its own sub folder), use `serial` to specify which serial of checkpoint you would like to @@ -867,7 +867,7 @@ def _load_persist_vars_without_grad(executor, has_model_dir=False): """ This function filters out all checkpoint variables from the give - program and then trys to load these variables from the given directory. + program and then tries to load these variables from the given directory. A variable is a checkpoint variable if it meets all following conditions: @@ -898,7 +898,7 @@ def _load_persist_vars_without_grad(executor, # In this example, `_load_persist_vars_without_grad` function # will first filters out all checkpoint variables in the default - # main program, and then trys to load these variables form the + # main program, and then tries to load these variables form the # folder "./my_paddle_model/__model__". 
""" @@ -1135,12 +1135,12 @@ def _is_checkpoint_var(var): def _make_chekcpoint_dirs(dirs): """ - _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it. + _make_chekcpoint_dirs will makedir local directory directly, when the directory is exist, it will ignore it. """ assert dirs is not None if os.path.isfile(dirs): - raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) + raise OSError(errno.ENOTDIR, "dirs path should be a Directory.", dirs) if not os.path.isdir(dirs): try: diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py index 962a5653f6135209de4e82d73b39cd3e8f8c9499..2de4f82bd14559a99581c5716523b2a78c2d7998 100644 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -312,9 +312,9 @@ class HDFSClient(object): @staticmethod def make_local_dirs(local_path): """ - create a directiory local, is same to mkdir + create a directory local, is same to mkdir Args: - local_path: local path that wants to create a directiory. + local_path: local path that wants to create a directory. """ try: os.makedirs(local_path) diff --git a/python/paddle/fluid/contrib/utils/lookup_table_utils.py b/python/paddle/fluid/contrib/utils/lookup_table_utils.py index 2d18a9a8620e210d5b0f6fbb90c3b59e31ac8086..8552bc8fc105835ae408ae716f0172d032552cd8 100644 --- a/python/paddle/fluid/contrib/utils/lookup_table_utils.py +++ b/python/paddle/fluid/contrib/utils/lookup_table_utils.py @@ -137,7 +137,7 @@ def load_persistables_for_increment(dirname, executor, program, lookup_table_var, lookup_table_var_path): """ WARNING: this function will only be used for distributed training with distributed lookup table. - for increment trainning, the pserver will not only load dense variables, + for increment training, the pserver will not only load dense variables, but also load the suitable lookup table var. Because of sliced lookup table var with HASH, we must load the correct sliced var. @@ -417,7 +417,7 @@ def get_inference_model(main_program, feeded_var_names, target_vars): Args: main_program(Program|None): The original program, which will be pruned to - build the inference model. If is setted None, + build the inference model. If is set None, the default main program will be used. Default: None. feeded_var_names(list[str]): Names of variables that need to be feeded data diff --git a/python/paddle/fluid/data.py b/python/paddle/fluid/data.py index 6376b4f7749755d6ee59433937ee6f0bd3399896..179c3b07dbefc7c90ed2756d8b2ed98ec79764cf 100644 --- a/python/paddle/fluid/data.py +++ b/python/paddle/fluid/data.py @@ -54,7 +54,7 @@ def data(name, shape, dtype='float32', lod_level=0): for more details. shape (list|tuple): List|Tuple of integers declaring the shape. You can set "None" at a dimension to indicate the dimension can be of any - size. For example, it is useful to set changable batch size as "None" + size. For example, it is useful to set changeable batch size as "None" dtype (np.dtype|VarType|str, optional): The type of the data. Supported dtype: bool, float16, float32, float64, int8, int16, int32, int64, uint8. Default: float32 @@ -75,7 +75,7 @@ def data(name, shape, dtype='float32', lod_level=0): # User can only feed data of the same shape to x x = fluid.data(name='x', shape=[3, 2, 1], dtype='float32') - # Creates a variable with changable batch size. + # Creates a variable with changeable batch size. 
# Users can feed data of any batch size into y, # but size of each data sample has to be [2, 1] y = fluid.data(name='y', shape=[None, 2, 1], dtype='float32') diff --git a/python/paddle/fluid/data_feed_desc.py b/python/paddle/fluid/data_feed_desc.py index eca9543c60a3a4f676a7bffe91cf975efac49437..4878c25fde5f7f90bd893639a58dbc2664ecff5a 100644 --- a/python/paddle/fluid/data_feed_desc.py +++ b/python/paddle/fluid/data_feed_desc.py @@ -53,7 +53,7 @@ class DataFeedDesc(object): data_feed = fluid.DataFeedDesc('data.proto') However, users usually shouldn't care about the message format; instead, - they are encouragd to use :code:`Data Generator` as a tool to generate a + they are encouraged to use :code:`Data Generator` as a tool to generate a valid data description, in the process of converting their raw log files to training files acceptable to AsyncExecutor. diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 6b49f7a8b4a102cb4def98a514a4a3c11aabebe9..70a429f65c52baa80838df204621c6db6b9ba483 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -339,10 +339,10 @@ class DataFeeder(object): """ Similar with feed function, feed_parallel is used with multiple devices (CPU|GPU). Here :code:`iterable` is a list of python generators. The data return by each - generator in the list will be fed into a seperate device. + generator in the list will be fed into a separate device. Parameters: - iterable (list|tuple): list of user-defined python geneators. The element + iterable (list|tuple): list of user-defined python generators. The element number should match the :code:`num_places`. num_places (int, optional): the number of devices. If not provided (None), all available devices on the machine will be used. Default None. @@ -379,7 +379,7 @@ class DataFeeder(object): exe.run(fluid.default_startup_program()) program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(places=places) - # print sample feed_parallel r resultt + # print sample feed_parallel r result # for item in list(feeder.feed_parallel([generate_reader(5, 0, 1), generate_reader(3, 10, 2)], 2)): # print(item['x']) # print(item['y']) @@ -433,7 +433,7 @@ class DataFeeder(object): Parameters: reader(generator): a user defined python generator used to get :code:`mini-batch` of data. - A :code:`mini-batch` can be regarded as a python generator that returns batchs of input + A :code:`mini-batch` can be regarded as a python generator that returns batches of input entities, just like the below :code:`_mini_batch` in the code example. multi_devices(bool): indicate whether to use multiple devices or not. num_places(int, optional): if :code:`multi_devices` is True, you can specify the number diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index b10ebcaa47e62f5f96467e105f189376a9ffb2a7..60dd4eb383126108f5412619e4941ed5d5c8b2b8 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -100,7 +100,7 @@ class DatasetBase(object): Args: record_candidate_size(int): size of instances candidate to shuffle one slot - fea_eval(bool): wheather enable fea eval mode to enable slots shuffle. + fea_eval(bool): whether enable fea eval mode to enable slots shuffle. default is True. Examples: @@ -822,7 +822,7 @@ class BoxPSDataset(InMemoryDataset): def wait_preload_done(self): """ - Wait async proload done + Wait async preload done Wait Until Feed Pass Done Examples: .. 
code-block:: python diff --git a/python/paddle/fluid/debugger.py b/python/paddle/fluid/debugger.py index ef07dcebcabfe8aa3c0e3366597e40583a57db7c..9110b8daf38e16d61cb58ef173ed59d11d541aad 100644 --- a/python/paddle/fluid/debugger.py +++ b/python/paddle/fluid/debugger.py @@ -338,7 +338,7 @@ def run_fast_nan_inf_debug(executor, use_program_cache=False, dump_core=True): """ - Run a program by the given executor. Catch the exception of NAN and INF, and save persistbales into the dumped core. + Run a program by the given executor. Catch the exception of NAN and INF, and save persistables into the dumped core. """ assert (executor is not None) diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 902daf1a4ac754da1cc61cd00a89e3f12b4c2357..61e508ea72e8b529ce98d14c8ec3beff65148275 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -59,7 +59,7 @@ class DownpourSGD(object): """ DownpounSGD is a distributed optimizer so that user can call minimize to generate backward - operators and optimization operators within minmize function + operators and optimization operators within minimize function Args: loss(Variable): loss variable defined by user startup_program(Program): startup program that defined by user diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index 19d661c660efef8394bd2369f7759645ebbf3c5d..e89a1b71dd5ee625b4a07ee5c2b98f65f774047f 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -110,7 +110,7 @@ class PaddlePSInstance(object): def gather_ips(self): """ - Return all servers and workers ip throught mpi allgather + Return all servers and workers ip through mpi allgather """ self._ips = self.dh.comm.allgather(self._ip) return self._ips diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 45e8959ef82e5ba2b84ee75de388876383fa5782..91bd7836e19b0bbdf86f4d2d8b95847b1074ba3b 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -88,9 +88,9 @@ class PiecewiseDecay(LearningRateDecay): boundaries(list): A list of steps numbers. The type of element in the list is python int. values(list): A list of learning rate values that will be picked during different step boundaries. The type of element in the list is python float. - begin(int): The begin step to initilize the global_step in the description above. + begin(int): The begin step to initialize the global_step in the description above. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. @@ -158,7 +158,7 @@ class NaturalExpDecay(LearningRateDecay): default value is False. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. 
@@ -238,7 +238,7 @@ class ExponentialDecay(LearningRateDecay): default value is False. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. @@ -312,7 +312,7 @@ class InverseTimeDecay(LearningRateDecay): default value is False. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be 'float32', 'float64'. The default value is 'float32'. @@ -393,7 +393,7 @@ class PolynomialDecay(LearningRateDecay): cycle(bool, optional): If set true, decay the learning rate every decay_steps. The default value is False. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. @@ -471,7 +471,7 @@ class CosineDecay(LearningRateDecay): epochs(int): The number of epochs. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. @@ -528,7 +528,7 @@ class NoamDecay(LearningRateDecay): it's a tensor with shape [1] and the data type can be int32 or int64. The type can also be python int. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. @@ -592,7 +592,7 @@ class LinearLrWarmup(LearningRateDecay): end_lr (float): Final learning rate of warm up. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. - The defalult value is 1. + The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. 
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 64119a89ec5f62474452d9cb0c66be336867dfb5..c9ede3bdefe369785c16730dec9c868eb7c3d9a5 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -50,7 +50,7 @@ class Conv2D(layers.Layer): C will equal the number of input feature map divided by the groups. Please refer to UFLDL's `convolution `_ - for more detials. + for more details. If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. @@ -1003,7 +1003,7 @@ class BatchNorm(layers.Layer): Parameters: num_channels(int): Indicate the number of channels of the input ``Tensor``. - act(str, optional): Activation to be applied to the output of batch normalizaiton. Default: None. + act(str, optional): Activation to be applied to the output of batch normalization. Default: None. is_test (bool, optional): A flag indicating whether it is in test phrase or not. Default: False. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. @@ -1242,7 +1242,7 @@ class Embedding(layers.Layer): default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition, user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs to be transformed into numpy format, and the shape of local word - vector shoud be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` + vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` is used to load custom or pre-trained word vectors. See code example 2 for details. dtype(np.dtype|core.VarDesc.VarType|str): It refers to the data type of output Tensor. It must be "float32" or "float64". Default: "float32". @@ -1382,7 +1382,7 @@ class LayerNorm(layers.Layer): omitted. If :attr:`shift` is True and :attr:`param_attr` is None, a default :code:`ParamAttr` would be added as bias. The :attr:`bias_attr` is initialized as 0 if it is added. Default: None. - act(str, optional): Activation to be applied to the output of layer normalizaiton. + act(str, optional): Activation to be applied to the output of layer normalization. Default: None. dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". @@ -1435,7 +1435,7 @@ class LayerNorm(layers.Layer): default_initializer=Constant(1.0)) else: if self._param_attr: - logging.warn("param_attr are only avaliable with scale is True") + logging.warn("param_attr are only available with scale is True") if self._shift: assert self._bias_attr is not False @@ -1446,7 +1446,7 @@ class LayerNorm(layers.Layer): is_bias=True) else: if self._bias_attr: - logging.warn("bias_attr are only avaliable with shift is True") + logging.warn("bias_attr are only available with shift is True") def forward(self, input): input_shape = list(input.shape) @@ -1702,7 +1702,7 @@ class NCE(layers.Layer): will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. num_neg_samples (int, optional): The number of negative classes. The default value is 10. - sampler (str, optional): The sampler used to sample class from negtive classes. 
+ sampler (str, optional): The sampler used to sample class from negative classes. It can be 'uniform', 'log_uniform' or 'custom_dist'. default: 'uniform'. custom_dist (float[], optional): A float[] with size=num_total_classes. @@ -2544,7 +2544,7 @@ class GroupNorm(layers.Layer): bias_attr(ParamAttr, optional): The parameter attribute for the learnable bias :math:`b`. If it is set to False, no bias will be added to the output units. If it is set to None, the bias is initialized zero. Default: None. - act(str, optional): Activation to be applied to the output of group normalizaiton. Default: None. + act(str, optional): Activation to be applied to the output of group normalization. Default: None. data_layout(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW. Returns: @@ -2640,7 +2640,7 @@ class SpectralNorm(layers.Layer): and W is the product result of remaining dimensions. Step 2: - :attr:`power_iters` shoule be a positive interger, do following + :attr:`power_iters` should be a positive integer, do following calculations with U and V for :attr:`power_iters` rounds. .. math:: diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 1390919151a76278591640206ee3e0c2a69d695e..b7e87ab8b6bbb7eea3e675b1bfd731c3ef7d6131 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -27,7 +27,7 @@ def monkey_patch_varbase(): def set_value(self, value): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Set a new value for this Variable. @@ -76,7 +76,7 @@ def monkey_patch_varbase(): def backward(self, backward_strategy=None): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Run backward of current Graph which starts from current Variable @@ -116,13 +116,13 @@ def monkey_patch_varbase(): self._run_backward(backward_strategy, framework._dygraph_tracer()) else: raise ValueError( - "Variable.backward() is only avaliable in DyGraph mode") + "Variable.backward() is only available in DyGraph mode") @framework.dygraph_only def gradient(self): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Get the Gradient of Current Variable diff --git a/python/paddle/fluid/dygraph_grad_clip.py b/python/paddle/fluid/dygraph_grad_clip.py index db7a76615f85a4db2db9e6f0353a3bafd8cc0ea0..c90795e09f9162eefce5ae847337c366b9282be1 100644 --- a/python/paddle/fluid/dygraph_grad_clip.py +++ b/python/paddle/fluid/dygraph_grad_clip.py @@ -55,7 +55,7 @@ class GradClipByValue(GradClipBase): Args: max_value (float): The maximum value to clip by. min (float, optional): The minimum value to clip by. if not set by user, \ - will be set to -max_value(max_value MUST be postive) by framework. + will be set to -max_value(max_value MUST be positive) by framework. Examples: .. code-block:: python diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index b2e0fc28ed7ffdb9c7c720e108d5e9473395f70f..90979c6b839ed6650f8a553b0464b57f270ed583 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -164,8 +164,8 @@ def dimension_is_compatible_with(first, second): A dimension is compatible with the other if: 1. The length of the dimensions are same. - 2. Each non-negative number of the two dimentions are same. - 3. 
For negative number or 'None' in a dimention, it means unknown so it + 2. Each non-negative number of the two dimensions are same. + 3. For negative number or 'None' in a dimension, it means unknown so it is compatible with any number. Args: @@ -200,8 +200,8 @@ def check_feed_shape_type(var, feed, num_places=1): A dimension is compatible with the other if: 1. The length of the dimensions are same. - 2. Each non-negative number of the two dimentions are same. - 3. For negative number or 'None' in a dimention, it means unknown so it + 2. Each non-negative number of the two dimensions are same. + 3. For negative number or 'None' in a dimension, it means unknown so it is compatible with any number. Args: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 7b9919d2ab755929f98a7887e61442daf9b51f80..1751298975931136dbd1c23193864d6204a246d6 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -458,7 +458,7 @@ def name_scope(prefix=None): if in_dygraph_mode(): yield else: - assert prefix, "namescope prefix cannot be empty." + assert prefix, "namescope prefix cannot be empty." global _name_scope _name_scope = _name_scope.child(prefix) yield @@ -816,7 +816,7 @@ class Variable(object): There are many kinds of variables. Each kind of them has its own attributes and usages. Please refer to the `framework.proto `_ for details. - Most of a Variable's member variables can be setted to be None. It mean + Most of a Variable's member variables can be set to be None. It means it is not available or will be specified later. Examples: @@ -949,7 +949,7 @@ class Variable(object): def detach(self): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Returns a new Variable, detached from the current graph. @@ -979,7 +979,7 @@ class Variable(object): def numpy(self): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Returns a numpy array shows the value of current :ref:`api_guide_Variable_en` @@ -1011,7 +1011,7 @@ class Variable(object): def set_value(self, value): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Set a new value for this Variable. @@ -1042,7 +1042,7 @@ class Variable(object): def backward(self, backward_strategy=None): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Run backward of current Graph which starts from current Variable @@ -1080,7 +1080,7 @@ class Variable(object): def gradient(self): """ **Notes**: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Get the Gradient of Current Variable @@ -1128,7 +1128,7 @@ class Variable(object): def clear_gradient(self): """ **Notes**: - **1. This API is ONLY avaliable in Dygraph mode** + **1. This API is ONLY available in Dygraph mode** **2. Use it only Variable has gradient, normally we use this for Parameters since other temporal Variable will be deleted by Python's GC** @@ -1495,7 +1495,7 @@ class Variable(object): if length < 0: raise ValueError("length should not be negative") if step == 0: - raise ValueError("slice step cannot be zero") + raise ValueError("slice step cannot be zero") # Find lower and upper bounds for start and stop. lower = -1 if step < 0 else 0 @@ -2965,7 +2965,7 @@ class IrVarNode(IrNode): shape(list): shape to be set.
""" assert self.node.var() is not None, \ - "The node variable description cannot be None." + "The node variable description canot be None." self.node.var().set_shape(shape) def persistable(self): @@ -2976,7 +2976,7 @@ class IrVarNode(IrNode): bool: indicate whether the variable is persistable. """ assert self.node.var() is not None, \ - "The node variable description cannot be None." + "The node variable description canot be None." return self.node.var().persistable() def type(self): @@ -2987,7 +2987,7 @@ class IrVarNode(IrNode): core.VarDesc.VarType: the variable type. """ assert self.node.var() is not None, \ - "The node variable description cannot be None." + "The node variable description canot be None." return self.node.var().type() def dtype(self): @@ -2998,7 +2998,7 @@ class IrVarNode(IrNode): core.VarDesc.VarType: the variable data type. """ assert self.node.var() is not None, \ - "The node variable description cannot be None." + "The node variable description canot be None." return self.node.var().dtype() def shape(self): @@ -3009,7 +3009,7 @@ class IrVarNode(IrNode): list: the variable shape. """ assert self.node.var() is not None, \ - "The node variable description cannot be None." + "The node variable description canot be None." return self.node.var().shape() @property @@ -3059,7 +3059,7 @@ class IrOpNode(IrNode): new_input_name(str): the new input name. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." self.node.op()._rename_input(old_input_name, new_input_name) def rename_output(self, old_output_name, new_output_name): @@ -3071,7 +3071,7 @@ class IrOpNode(IrNode): new_output_name(str): the new output name. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." print("op: {}, old: {}, new: {}\n".format(self.node.op().type( ), old_output_name, new_output_name)) self.node.op()._rename_output(old_output_name, new_output_name) @@ -3087,7 +3087,7 @@ class IrOpNode(IrNode): list(str): the argument name list. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." return self.node.op().input(name) def output(self, name): @@ -3101,7 +3101,7 @@ class IrOpNode(IrNode): list(str): the argument name list. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." return self.node.op().output(name) def set_type(self, new_type): @@ -3112,7 +3112,7 @@ class IrOpNode(IrNode): new_type(str): new operator type to be set. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." return self.node.op().set_type(new_type) def set_attr(self, name, val): @@ -3130,7 +3130,7 @@ class IrOpNode(IrNode): Update the value of the op desc's attribute by attribute's name. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." desc = self.node.op() if isinstance(val, Block): desc.set_block_attr(name, val.desc) @@ -3151,7 +3151,7 @@ class IrOpNode(IrNode): list(str): input arguments' names of this op node. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description canot be None." 
return self.node.op().input_arg_names() def output_arg_names(self): @@ -3162,7 +3162,7 @@ list(str): output arguments' names of this op node. """ assert self.node.op() is not None, \ - "The node operator description cannot be None." + "The node operator description cannot be None." return self.node.op().output_arg_names() @property @@ -3318,7 +3318,7 @@ class IrGraph(object): op_type(str): the type of the operator node. attrs(dict): the attributes of the operator node. inputs(dict): the inputs of the operator node. - outputs(dict): the outpus of the operator node. + outputs(dict): the outputs of the operator node. Returns: IrOpNode: the created operator node. @@ -3459,7 +3459,7 @@ """ Perform the topology sort operation on the graph. - Notes: the `graph` cannot contain a circle. + Notes: the `graph` cannot contain a circle. Returns: list(IrNode): nodes in topology order. @@ -3805,9 +3805,9 @@ class Program(object): prog = fluid.default_main_program() prog_string = prog.to_string(throw_on_error=True, with_details=False) - print("program string without detial: {}".format(prog_string)) + print("program string without detail: {}".format(prog_string)) prog_string_with_detail = prog.to_string(throw_on_error=True, with_details=True) - print("program string with detial: {}".format(prog_string_with_detail)) + print("program string with detail: {}".format(prog_string_with_detail)) """ assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) @@ -4606,7 +4606,7 @@ class Parameter(Variable): Default: {'learning_rate': 1.0} regularizer(WeightDecayRegularizer): The Regularizer which will be applied on the parameter. Default: None - gradient_clip_attr(BaseGradientClipAttr): The gradint clip strategy + gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy which will be applied on the parameter. Default: None do_model_average(bool): True if the model average strategy will be applied on this parameter. @@ -4712,7 +4712,7 @@ class ParamBase(core.VarBase): Default: {'learning_rate': 1.0} regularizer(WeightDecayRegularizer): The Regularizer which will be applied on the ParamBase. Default: None - gradient_clip_attr(BaseGradientClipAttr): The gradint clip strategy + gradient_clip_attr(BaseGradientClipAttr): The gradient clip strategy which will be applied on the ParamBase. Default: None do_model_average(bool): True if the model average strategy will be applied on this ParamBase. @@ -5022,7 +5022,7 @@ def load_op_library(lib_filename): Load a dynamic library, including custom operators and kernels. When library is loaded, ops and kernels registered in the library will be available in PaddlePaddle main process. - Please note, the type of custom operators cann't have the same type + Please note, the type of custom operators can't have the same type with the existing operators in the framework. Args: diff --git a/python/paddle/fluid/incubate/data_generator/__init__.py b/python/paddle/fluid/incubate/data_generator/__init__.py index 77c3fc6bf2d4fb75709ba9667860b14b2334f5a1..8d31a68e8083d6b34fa6f5e51be9391602da630b 100644 --- a/python/paddle/fluid/incubate/data_generator/__init__.py +++ b/python/paddle/fluid/incubate/data_generator/__init__.py @@ -143,7 +143,7 @@ class DataGenerator(object): ''' Further processing the output of the process() function rewritten by user, outputting data that can be directly read by the datafeed,and - updating proto_info infomation. + updating proto_info information.
Args: line(str): the output of the process() function rewritten by user. @@ -243,7 +243,7 @@ class MultiSlotStringDataGenerator(DataGenerator): ''' Further processing the output of the process() function rewritten by user, outputting data that can be directly read by the MultiSlotDataFeed, - and updating proto_info infomation. + and updating proto_info information. The input line will be in this format: >>> [(name, [str(feasign), ...]), ...] @@ -284,7 +284,7 @@ class MultiSlotDataGenerator(DataGenerator): ''' Further processing the output of the process() function rewritten by user, outputting data that can be directly read by the MultiSlotDataFeed, - and updating proto_info infomation. + and updating proto_info information. The input line will be in this format: >>> [(name, [feasign, ...]), ...] diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index 6600ed9aa4e952dcf3d883a019fcf9696e8450b8..bada19abcc32d2bc91d99203a7b66389aca912d3 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -542,8 +542,8 @@ class PaddleCloudRoleMaker(RoleMakerBase): class GeneralRoleMaker(RoleMakerBase): """ This role maker is for general use, you can set os.environ to customize: - PADDLE_PSERVERS_IP_PORT_LIST : all pservers' ip:port, seperated by ',' - PADDLE_TRAINER_ENDPOINTS : all trainers' ip:port, seperated by ',' + PADDLE_PSERVERS_IP_PORT_LIST : all pservers' ip:port, separated by ',' + PADDLE_TRAINER_ENDPOINTS : all trainers' ip:port, separated by ',' TRAINING_ROLE : TRAINER or PSERVER PADDLE_TRAINER_ID : current trainer id (only for trainer), it is index in PADDLE_TRAINER_ENDPOINTS diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py index e33662cf0823b8814654175daa741fca5b3b1afe..5150d1084793aad0960d2d1abe5fa5bceeab4955 100644 --- a/python/paddle/fluid/incubate/fleet/collective/__init__.py +++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py @@ -220,7 +220,7 @@ class CollectiveOptimizer(DistributedOptimizer): def _check_collective_mode(self, main_program, optimizer, strategy): """ - Check the conflict condtions. + Check the conflict conditions. """ if strategy.use_local_sgd: strategy.mode = "collective" @@ -392,7 +392,7 @@ class CollectiveOptimizer(DistributedOptimizer): tuple: (optimize_ops, params_grads) which are, list of operators appended; and list of (param, grad) Variables pair for optimization. Note that in parameter server mode, a worker will not get anything about optimize_os - Because optmizer algorithms run on pserver side. We will make this usable in pserver + Because optimizer algorithms run on pserver side. We will make this usable in pserver process, but currently the optimization part is written into Fleet(). A user does not need to care about how to startup a pserver node. """ diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index ec5f6de81c9a7994dc0396c42380b0d6336002a2..d6ea97fc57bd5957e9c87e3c101f3d4614ce84c0 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -618,7 +618,7 @@ class DownpourOptimizer(DistributedOptimizer): """ minimize a program through loss, loss can be a list in DistributedOptimizer. 
Note that in parameter server mode, a worker will not get anything about optimize_os - Because optmizer algorithms run on pserver side. We will make this usable in pserver + Because optimizer algorithms run on pserver side. We will make this usable in pserver process, but currently the optimization part is written into Fleet(). A user does not need to care about how to startup a pserver node. diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index c5e105cc8d617e55985ce7305ca028ddf4535312..1d119039f12c7351a242462160afc855a5a8b598 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -156,7 +156,7 @@ class DistributedAdam(DistributedOptimizerImplBase): """ DownpounSGD is a distributed optimizer so that user can call minimize to generate backward - operators and optimization operators within minmize function + operators and optimization operators within minimize function Args: loss(Variable): loss variable defined by user startup_program(Program): startup program that defined by user diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index 4a1fd20afc0cf18c151b13e5309a361277d12563..2b46459280b614d9e175e1a21f82eac9599db344 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -901,7 +901,7 @@ class FleetUtil(object): hadoop_fs_name(str): hadoop fs name hadoop_fs_ugi(str): hadoop fs ugi hadoop_home(str): hadoop home, default is "$HADOOP_HOME" - save_combine(bool): whether to save in a file or seperate files, + save_combine(bool): whether to save in a file or separate files, default is True Examples: @@ -990,7 +990,7 @@ class FleetUtil(object): hadoop_fs_ugi(str): hadoop fs ugi hadoop_home(str): hadoop home, default is "$HADOOP_HOME" var_names(list): save persistable var names, default is None - save_combine(bool): whether to save in a file or seperate files, + save_combine(bool): whether to save in a file or separate files, default is True Examples: @@ -1300,7 +1300,7 @@ class FleetUtil(object): from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil fleet_util = FleetUtil() metric_list = fleet_util.get_global_metrics(myscope, - stat_pos.nane, + stat_pos.name, stat_neg.name, local_sqrerr.name, local_abserr.name, @@ -1487,7 +1487,7 @@ class FleetUtil(object): from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil fleet_util = FleetUtil() fleet_util.print_global_metrics(myscope, - stat_pos.nane, + stat_pos.name, stat_neg.name, local_sqrerr.name, local_abserr.name, diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py index 23a22531a45f11aa61e25304dfe973989aacbcf6..c16d7e3cc458f3f2052507497a3ba7ebaec3d042 100644 --- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py +++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py @@ -274,10 +274,10 @@ class HDFSClient(object): @staticmethod def make_local_dirs(local_path): """ - create a directiory local, is same to mkdir + create a directory local, is same to mkdir Args: - local_path(str): local path that wants to create a directiory. + local_path(str): local path that wants to create a directory. 
""" try: os.makedirs(local_path) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index b14234b59984b051b6cd0c98da8a0c1ff7d32655..dfea275d7b84c2f6976a873d4b31ba288e96cd0b 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -782,7 +782,7 @@ class BilinearInitializer(Initializer): super(BilinearInitializer, self).__init__() def __call__(self, var, block): - """Add biliear initialization ops for a variable + """Add bilinear initialization ops for a variable Args: var (Variable): Variable that needs to be initialized. diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py index 458f386919b1544d71eee2c1393f071ff2381899..bf771d801e63bfe6d77025c51d7d17aa9d7a9fd6 100644 --- a/python/paddle/fluid/input.py +++ b/python/paddle/fluid/input.py @@ -204,7 +204,7 @@ def embedding(input, default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition, user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs to be transformed into numpy format, and the shape of local word - vector shoud be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` + vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` is used to load custom or pre-trained word vectors. See code example 2 for details. dtype(str|core.VarDesc.VarType): It refers to the data type of output Tensor. It must be float32 or float64. Default: float32. @@ -219,7 +219,7 @@ def embedding(input, import numpy as np data = fluid.data(name='x', shape=[None, 10], dtype='int64') - # exampel 1 + # example 1 emb_1 = fluid.embedding(input=data, size=[128, 64]) # example 2: load custom or pre-trained word vectors diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 42366aad88eea5e5fbb92edeb8f1535cf31e429b..201cc61e4d479dc11b169e02481ac4ff4780c2b8 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -44,7 +44,7 @@ class SimpleLayer(Layer): def run_check(): - ''' intall check to verify if install is success + ''' install check to verify if install is success This func should not be called only if you need to verify installation ''' diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index dd312bce11763f3d6e4fc23ce624b29489cbba99..1830950866cf0d87a7c40470e8f5b5631e63d736 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -219,7 +219,7 @@ def save_vars(executor, variables that make `predicate(variable) == True`. The first way has a higher priority. The `dirname` is used to specify the folder where to save variables. - If you prefer to save variables in separate files in the `dirname` floder, + If you prefer to save variables in separate files in the `dirname` folder, do not set `filename`. If you prefer to save all variables in a single file, use `filename` to specify it. @@ -435,7 +435,7 @@ def _save_distributed_persistables(executor, dirname, main_program): def __save_remote_params(executor, dirname, remote_params_map): """ - recive params on pserver through rpc. + receive params on pserver through rpc. if the params are be sliced, will concat them to one, then save it. """ if not remote_params_map: @@ -571,7 +571,7 @@ def save_persistables(executor, dirname, main_program=None, filename=None): You can refer to :ref:`api_guide_executor_en` for more details. 
dirname(str): The saving directory path. - main_program(Program, optional): The program whose persistbale variables will + main_program(Program, optional): The program whose persistable variables will be saved. You can refer to :ref:`api_guide_Program_en` for more details. If it is None, the default main program will @@ -835,7 +835,7 @@ def load_persistables(executor, dirname, main_program=None, filename=None): """ This API filters out all variables with ``persistable==True`` from the given ``main_program`` and then tries to load these variables from the - directory ``dirnameme`` or the file ``filename``. + directory ``dirname`` or the file ``filename``. Use the ``dirname`` to specify the directory where persistable variables (refer to :ref:`api_guide_model_save_reader_en`) were saved. If variables @@ -846,7 +846,7 @@ def load_persistables(executor, dirname, main_program=None, filename=None): executor(Executor): The executor used for loading persistable variables. See :ref:`api_guide_executor_en` for more details about it. dirname(str): The directory path. - main_program(Program, optional): The program whose persistbale variables will + main_program(Program, optional): The program whose persistable variables will be loaded. If it is None, the ``default_main_program`` will be used automatically. See :ref:`api_guide_Program_en` for more about ``Program``. @@ -1050,14 +1050,14 @@ def save_inference_model(dirname, executor(Executor): The executor that saves the inference model. You can refer to :ref:`api_guide_executor_en` for more details. main_program(Program, optional): The original program, which will be pruned to - build the inference model. If is setted None, + build the inference model. If is set None, the global default :code:`_main_program_` will be used. Default: None. model_filename(str, optional): The name of file to save the inference program - itself. If is setted None, a default filename + itself. If is set None, a default filename :code:`__model__` will be used. params_filename(str, optional): The name of file to save all related parameters. - If it is setted None, parameters will be saved + If it is set None, parameters will be saved in separate files . export_for_deployment(bool): If True, programs are modified to only support direct inference deployment. Otherwise, @@ -1086,7 +1086,7 @@ def save_inference_model(dirname, path = "./infer_model" - # User defined network, here a softmax regresssion example + # User defined network, here a softmax regression example image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32') label = fluid.data(name='label', shape=[None, 1], dtype='int64') feeder = fluid.DataFeeder(feed_list=[image, label], place=fluid.CPUPlace()) @@ -1408,7 +1408,7 @@ def get_parameter_value_by_name(name, executor, program=None): Raises: TypeError: If given `name` is not an instance of basestring. TypeError: If the parameter with the given name doesn't exist. - AssertionError: If there is a varibale named `name` in the + AssertionError: If there is a variable named `name` in the given program but it is not a Parameter. Examples: diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 14dda7a0ea4c22f07288fb220c5ac58668bc428c..4385e64583b19e5e094f724c4d371983c4fd17d4 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -215,7 +215,7 @@ def Print(input, Args: input (Variable): A Tensor to print.
summarize (int): Number of elements in the tensor to be print. If it's - vaule is -1, then all elements in the tensor will be print. + value is -1, then all elements in the tensor will be print. message (str): A string message to print as a prefix. first_n (int): Only log `first_n` number of times. print_tensor_name (bool, optional): Print the tensor name. Default: True. @@ -703,7 +703,7 @@ class StaticRNN(object): Args: mem(Variable): the memory variable. var(Variable): the plain variable generated in RNN block, used to update memory. - var and mem should hava same dims and data type. + var and mem should have same dims and data type. Returns: None @@ -1019,7 +1019,7 @@ def lod_rank_table(x, level=0): of LoD, this layer creates a LodRankTable object. A LoDRankTable object contains a list of bi-element tuples. Each tuple consists of an index and a length, both of which are int type. Refering to specified level of LoD, - the index is the sequence index number and the length representes the + the index is the sequence index number and the length represents the sequence length. Please note that the list is ranked in descending order by the length. The following is an example: @@ -1179,7 +1179,7 @@ def increment(x, value=1.0, in_place=True): Notice that the number of elements in :attr:`x` must be equal to 1. Parameters: - x (Variable): A tensor that must alway contain only one element, its data type supports + x (Variable): A tensor that must always contain only one element, its data type supports float32, float64, int32 and int64. value (float, optional): The amount to increment the data of :attr:`x`. Default: 1.0. in_place (bool, optional): Whether the OP should be performed in-place. Default: True. @@ -1668,7 +1668,7 @@ def array_length(array): """ This OP is used to get the length of the input array :ref:`api_fluid_LoDTensorArray` . It can be used together with :ref:`api_fluid_layers_array_read` , :ref:`api_fluid_layers_array_write` , - :ref:`api_fluid_layers_While` OP to traverse, read and wirte LoDTensorArray. + :ref:`api_fluid_layers_While` OP to traverse, read and write LoDTensorArray. Args: array (LoDTensorArray): The input array that will be used to compute the length. @@ -1749,7 +1749,7 @@ class ConditionalBlock(object): Args: inputs (Variable): bool conditions. - is_scalar_condition (bool): whether the branch is controled by a scalar. + is_scalar_condition (bool): whether the branch is controlled by a scalar. name(str): name of this ConditionalBlock. Examples: @@ -2539,7 +2539,7 @@ class DynamicRNN(object): The total number of time steps is determined by the longest sequence. DynamicRNN will not pad all sequences to the same length, instead it will sort the sequences internally by the sequence length in descending order. - The input sequences will be shrinked because only sequences of which the + The input sequences will be shrank because only sequences of which the length is larger than the time step will participate the remaining calculation. If defined :code:`drnn = DynamicRNN()`, then users can call :code:`drnn()` @@ -2827,7 +2827,7 @@ class DynamicRNN(object): Optional data types are: bool, float16, float32, float64, int8, int16, int32, int64, uint8. Returns: - Variable: The input LoDTensor after sorted and shrinked. If there are :code:`num_sequences` \ + Variable: The input LoDTensor after sorted and shrank. 
If there are :code:`num_sequences` \ sequences in RNN's input LoDTensor whose length is larger than :code:`step_idx` , \ the static input Tensor will be sorted to the same order as RNN's input and \ will only retain data corresponding to those :code:`num_sequences` sequences. \ @@ -2926,7 +2926,7 @@ class DynamicRNN(object): def __call__(self, *args, **kwargs): """ - This function is used to get the output sequneces of DynamicRNN. + This function is used to get the output sequences of DynamicRNN. Args: None @@ -2968,10 +2968,10 @@ class DynamicRNN(object): If setting shape to :math:`\{D_1, D_2, ...\}` , the shape of memory Tensor will be :math:`\{batch\_size, D_1, D_2, ...\}` , where batch_size is determined by RNN's input sequences. The default value is None. - value (float, optional): When init is None, it is used as initalized value + value (float, optional): When init is None, it is used as initialized value of memory. The default value is 0.0. need_reorder (bool, optional): When init is not None, it determines whether - the memory needs to reorder like the RNN's input sequeneces. It should be + the memory needs to reorder like the RNN's input sequences. It should be set to True when the initialized memory depends on the order of input samples. The default value is False. dtype (str|numpy.dtype, optional): When init is None, it is used to set the @@ -2979,9 +2979,9 @@ class DynamicRNN(object): are: "float32", "float64", "int32", "int64". Returns: - Variable: The memory LoDTensor after shrinked. If there are :code:`num_sequences` \ + Variable: The memory LoDTensor after shrank. If there are :code:`num_sequences` \ sequences in RNN's input LoDTensor whose length is larger than :code:`step_idx` , \ - the memory Tensor also need to be shrinked and will only retain data \ + the memory Tensor also need to be shrank and will only retain data \ corresponding to those :code:`num_sequences` sequences. Raises: diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 1d0dbce39c221a4430e6b5df4ba0a1f1a62cf296..a4fa34e8fb3fc28e6f8485771065b1c72738d4ae 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -98,7 +98,7 @@ def retinanet_target_assign(bbox_pred, the training process. Retinanet predicts a :math:`C`-vector for classification and a 4-vector for box - regresion for each anchor, hence the target label for each positive(or negative) + regression for each anchor, hence the target label for each positive(or negative) sample is a :math:`C`-vector and the target locations for each positive sample is a 4-vector. As for a positive sample, if the category of its assigned ground-truth box is class :math:`i`, the corresponding entry in its length @@ -156,7 +156,7 @@ def retinanet_target_assign(bbox_pred, of :attr:`is_crowd` is int32. im_info(Variable): A 2-D Tensor with shape [N, 3] represents the size information of input images. :math:`N` is the batch size, the size - informarion of each image is a 3-vector which are the height and width + information of each image is a 3-vector which are the height and width of the network input along with the factor scaling the origin image to the network input. The data type of :attr:`im_info` is float32. num_classes(int32): The number of categories for classification, the default @@ -557,7 +557,7 @@ def detection_output(loc, categories will be considered. Default: 0. nms_threshold(float): The threshold to be used in NMS. Default: 0.3. 
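A plain-Python sketch, separate from the patch hunks, of the batching behaviour described for DynamicRNN above: sequences are sorted by length in descending order and the effective batch shrinks once the step index passes the shorter sequences (toy data only, not the fluid API).

.. code-block:: python

    seqs = [[1, 2], [3, 4, 5, 6], [7, 8, 9]]            # toy input, variable lengths
    seqs = sorted(seqs, key=len, reverse=True)           # [[3,4,5,6], [7,8,9], [1,2]]

    max_len = len(seqs[0])
    for step_idx in range(max_len):
        # only sequences longer than the current step keep participating
        active = [s for s in seqs if len(s) > step_idx]
        step_input = [s[step_idx] for s in active]
        print(step_idx, step_input)
    # step 0 -> [3, 7, 1]; step 1 -> [4, 8, 2]; step 2 -> [5, 9]; step 3 -> [6]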
nms_top_k(int): Maximum number of detections to be kept according - to the confidences aftern filtering detections based on + to the confidences after filtering detections based on score_threshold and before NMS. Default: 400. keep_top_k(int): Number of total bboxes to be kept per image after NMS step. -1 means keeping all bboxes after NMS step. Default: 200. @@ -660,7 +660,7 @@ def iou_similarity(x, y, box_normalized=True, name=None): Args: x (Variable): ${x_comment}.The data type is float32 or float64. y (Variable): ${y_comment}.The data type is float32 or float64. - box_normalized(bool): Whether treat the priorbox as a noramlized box. + box_normalized(bool): Whether treat the priorbox as a normalized box. Set true by default. Returns: Variable: ${out_comment}.The data type is same with x. @@ -775,7 +775,7 @@ def box_coder(prior_box, code_type(str): The code type used with the target box. It can be `encode_center_size` or `decode_center_size`. `encode_center_size` by default. - box_normalized(bool): Whether treat the priorbox as a noramlized box. + box_normalized(bool): Whether treat the priorbox as a normalized box. Set true by default. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and @@ -793,7 +793,7 @@ def box_coder(prior_box, output tensor of box_coder_op with shape [N, M, 4] representing the result of N target boxes encoded with M Prior boxes and variances. When code_type is 'decode_center_size', N represents the batch size - and M represents the number of deocded boxes. + and M represents the number of decoded boxes. Examples: @@ -908,13 +908,13 @@ def yolov3_loss(x, Args: x (Variable): ${x_comment}The data type is float32 or float64. gt_box (Variable): groud truth boxes, should be in shape of [N, B, 4], - in the third dimenstion, x, y, w, h should be stored. - x,y is the center cordinate of boxes, w, h are the + in the third dimension, x, y, w, h should be stored. + x,y is the center coordinate of boxes, w, h are the width and height, x, y, w, h should be divided by input image height to scale to [0, 1]. N is the batch number and B is the max box number in an image.The data type is float32 or float64. - gt_label (Variable): class id of ground truth boxes, shoud be in shape + gt_label (Variable): class id of ground truth boxes, should be in shape of [N, B].The data type is int32. anchors (list|tuple): ${anchors_comment} anchor_mask (list|tuple): ${anchor_mask_comment} @@ -924,7 +924,7 @@ def yolov3_loss(x, name (string): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` - gt_score (Variable): mixup score of ground truth boxes, shoud be in shape + gt_score (Variable): mixup score of ground truth boxes, should be in shape of [N, B]. Default None. use_label_smooth (bool): ${use_label_smooth_comment} @@ -1415,7 +1415,7 @@ def ssd_loss(location, 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. - 1.2 Compute matched boundding box by bipartite matching algorithm. + 1.2 Compute matched bounding box by bipartite matching algorithm. 2. Compute confidence for mining hard examples @@ -1525,10 +1525,10 @@ def ssd_loss(location, def __reshape_to_2d(var): return nn.flatten(x=var, axis=2) - # 1. Find matched boundding box by prior box. + # 1. Find matched bounding box by prior box. # 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. 
iou = iou_similarity(x=gt_box, y=prior_box) - # 1.2 Compute matched boundding box by bipartite matching algorithm. + # 1.2 Compute matched bounding box by bipartite matching algorithm. matched_indices, matched_dist = bipartite_match(iou, match_type, overlap_threshold) @@ -1653,7 +1653,7 @@ def prior_box(input, sequence according to the aspect_ratios. Parameters: - input(Variable): 4-D tenosr(NCHW), the data type should be float32 or float64. + input(Variable): 4-D tensor(NCHW), the data type should be float32 or float64. image(Variable): 4-D tensor(NCHW), the input image data of PriorBoxOp, the data type should be float32 or float64. min_sizes(list|tuple|float): the min sizes of generated prior boxes. @@ -2051,7 +2051,7 @@ def multi_box_head(inputs, min_max_aspect_ratios_order(bool): If set True, the output prior box is in order of [min, max, aspect_ratios], which is consistent with Caffe. Please note, this order affects the weights order of - convolution layer followed by and does not affect the fininal + convolution layer followed by and does not affect the final detection results. Default: False. Returns: @@ -2610,7 +2610,7 @@ def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, target_size / original_size, target_size is the size after resize, original_size is the original image size. gt_classes (Variable): A 2-D LoDTensor with shape [M, 1]. Data type - shoule be int. M is the total number of ground-truth, each + should be int. M is the total number of ground-truth, each element is a class label. is_crowd (Variable): A 2-D LoDTensor with same shape and same data type as gt_classes, each element is a flag indicating whether a @@ -2628,7 +2628,7 @@ def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, float32. R is the total number of RoIs, each element is a bounding box with (xmin, ymin, xmax, ymax) format in the range of original image. labels_int32 (Variable): A 2-D LoDTensor in shape of [R, 1] with type - of int32. R is the same as it in `rois`. Each element repersents + of int32. R is the same as it in `rois`. Each element represents a class label of a RoI. num_classes (int): Class number. resolution (int): Resolution of mask predictions. @@ -2637,15 +2637,15 @@ def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, mask_rois (Variable): A 2D LoDTensor with shape [P, 4] and same data type as `rois`. P is the total number of sampled RoIs. Each element is a bounding box with [xmin, ymin, xmax, ymax] format in range of - orignal image size. + original image size. mask_rois_has_mask_int32 (Variable): A 2D LoDTensor with shape [P, 1] - and int data type, each element repersents the output mask RoI + and int data type, each element represents the output mask RoI index with regard to input RoIs. mask_int32 (Variable): A 2D LoDTensor with shape [P, K * M * M] and int data type, K is the classes number and M is the resolution of mask - predictions. Each element repersents the binary mask targets. + predictions. Each element represents the binary mask targets. Examples: .. code-block:: python @@ -2745,7 +2745,7 @@ def generate_proposals(scores, N is batch size, A is number of anchors, H and W are height and width of the feature map. The data type must be float32. bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] - represents the differece between predicted box locatoin and + represents the difference between predicted box location and anchor location. The data type must be float32. 
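Step 1.1 of ssd_loss above relies on IoU between boxes; here is a self-contained sketch, separate from the patch hunks, for two boxes given as (xmin, ymin, xmax, ymax), with toy numbers only.

.. code-block:: python

    def iou(box_a, box_b):
        # Intersection rectangle, clamped to zero when the boxes do not overlap.
        xa = max(box_a[0], box_b[0])
        ya = max(box_a[1], box_b[1])
        xb = min(box_a[2], box_b[2])
        yb = min(box_a[3], box_b[3])
        inter = max(0.0, xb - xa) * max(0.0, yb - ya)
        area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
        area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
        return inter / (area_a + area_b - inter)

    print(iou((0, 0, 2, 2), (1, 1, 3, 3)))  # 1 / 7 = 0.142857...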
im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Height and width are the input sizes @@ -2841,7 +2841,7 @@ def box_clip(input, im_info, name=None): input(Variable): The input Tensor with shape :math:`[N_1, N_2, ..., N_k, 4]`, the last dimension is 4 and data type is float32 or float64. im_info(Variable): The 2-D Tensor with shape [N, 3] with layout - (height, width, scale) represeting the information of image. + (height, width, scale) representing the information of image. Height and width are the input sizes and scale is the ratio of network input size and original size. The data type is float32 or float64. name(str, optional): For detailed information, please refer @@ -2851,7 +2851,7 @@ def box_clip(input, im_info, name=None): Returns: Variable: - output(Variable): The cliped tensor with data type float32 or float64. + output(Variable): The clipped tensor with data type float32 or float64. The shape is same as input. @@ -2919,7 +2919,7 @@ def retinanet_detection_output(bboxes, The data type of each element is float32 or float64. im_info(Variable): A 2-D Tensor with shape :math:`[N, 3]` represents the size information of input images. :math:`N` is the batch size, the size - informarion of each image is a 3-vector which are the height and width + information of each image is a 3-vector which are the height and width of the network input along with the factor scaling the origin image to the network input. The data type of :attr:`im_info` is float32. score_threshold(float): Threshold to filter out bounding boxes @@ -2946,7 +2946,7 @@ def retinanet_detection_output(bboxes, that there is no detection if :attr:`score_threshold` are used at all levels. Hence, this OP do not filter out anchors from the highest FPN level before NMS. And the last element in :attr:`bboxes`:, :attr:`scores` and - :attr:`anchors` is required to be from the hightest FPN level. + :attr:`anchors` is required to be from the highest FPN level. Returns: Variable(The data type is float32 or float64): @@ -3090,7 +3090,7 @@ def multiclass_nms(bboxes, low confidence score. If not provided, consider all boxes. nms_top_k (int): Maximum number of detections to be kept according to - the confidences aftern the filtering detections based + the confidences after the filtering detections based on score_threshold. nms_threshold (float): The threshold to be used in NMS. Default: 0.3 nms_eta (float): The threshold to be used in NMS. Default: 1.0 @@ -3201,7 +3201,7 @@ def locality_aware_nms(bboxes, low confidence score. If not provided, consider all boxes. nms_top_k (int): Maximum number of detections to be kept according to - the confidences aftern the filtering detections based + the confidences after the filtering detections based on score_threshold. nms_threshold (float): The threshold to be used in NMS. Default: 0.3 nms_eta (float): The threshold to be used in NMS. 
Default: 1.0 diff --git a/python/paddle/fluid/layers/distributions.py b/python/paddle/fluid/layers/distributions.py index 69b99bf577f7515d6a5cf67dffdfa6b319412d85..396ab443a4b962e1759d596cf40e8d85ed786cd8 100644 --- a/python/paddle/fluid/layers/distributions.py +++ b/python/paddle/fluid/layers/distributions.py @@ -561,7 +561,7 @@ class MultivariateNormalDiag(Distribution): a.entropy() # [2.033158] with shape: [1] b.entropy() - # [1.7777451] with shaoe: [1] + # [1.7777451] with shape: [1] a.kl_divergence(b) # [0.06542051] with shape: [1] diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 7c6e5aa1859e762cbada4392b24545ebcacb6c4e..180aec4d9b2c23bd0edf0879f2ccd098b250f005 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -253,7 +253,7 @@ def Send(endpoints, send_vars, dummy_output=None, sync=True): side when server have finished running server side program. Args: - endpoints (str): comma seperated IP:PORT pairs in the order + endpoints (str): comma separated IP:PORT pairs in the order of send_vars to send send_vars (list): variables to send to server sync (bool): whether to wait the request finish @@ -296,7 +296,7 @@ def Recv(endpoints, get_vars, dummy_input=None, sync=True): Receive variables from server side Args: - endpoints (str): comma seperated IP:PORT pairs in the order + endpoints (str): comma separated IP:PORT pairs in the order of send_vars to send get_vars (list): vars to get from server after send completes. sync (bool): whether to wait the request finish @@ -603,7 +603,7 @@ def py_reader(capacity, import paddle.dataset.mnist as mnist def network(image, label): - # user defined network, here a softmax regresssion example + # user defined network, here a softmax regression example predict = fluid.layers.fc(input=image, size=10, act='softmax') return fluid.layers.cross_entropy(input=predict, label=label) diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 6ca0769d9970ceb204be0b086cc3a0832faaf0fa..eae8d43bfc4dd6145146b8343776366812195db1 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -109,7 +109,7 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): training progresses. By using this function, the learning rate will be decayed by 'decay_rate' every 'decay_steps' steps. - Decayed learning rate calcualtes as follows: + Decayed learning rate calculates as follows: >>> if staircase == True: >>> decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps) @@ -165,7 +165,7 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): training progresses. By using this function, the learning rate will be decayed by natural exponential power 'decay_rate' every 'decay_steps' steps. - Decayed learning rate calcualtes as follows: + Decayed learning rate calculates as follows: >>> if not staircase: >>> decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps)) @@ -178,7 +178,7 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): decay_steps(int): The learning rate decay steps. See the decay computation above. decay_rate(float): The learning rate decay rate. See the decay computation above.
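The decay formulas quoted in the two docstrings above can be checked with a few lines of plain Python, separate from the patch hunks; the hyper-parameters below are toy values, not taken from any real configuration.

.. code-block:: python

    import math

    learning_rate, decay_steps, decay_rate = 0.1, 100, 0.5

    def exp_decay(global_step, staircase=False):
        # learning_rate * decay_rate ^ (global_step / decay_steps), floored when staircase
        exponent = global_step / decay_steps
        if staircase:
            exponent = math.floor(exponent)
        return learning_rate * decay_rate ** exponent

    def nat_exp_decay(global_step, staircase=False):
        # learning_rate * exp(-decay_rate * (global_step / decay_steps))
        exponent = global_step / decay_steps
        if staircase:
            exponent = math.floor(exponent)
        return learning_rate * math.exp(-decay_rate * exponent)

    print(exp_decay(150))                   # 0.1 * 0.5 ** 1.5
    print(exp_decay(150, staircase=True))   # 0.1 * 0.5 ** 1
    print(nat_exp_decay(150))               # 0.1 * exp(-0.75)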
staircase(bool): If True, decay the learning rate at discrete intervals, which - means the learning rate will be decayed by natual exponential power + means the learning rate will be decayed by natural exponential power `decay_rate` every `decay_steps`. If False, learning rate will be decayed continuously and following the formula above. Default: False @@ -222,7 +222,7 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): training progresses. By using this function, an inverse decay function will be applied to the initial learning rate. - Decayed learning rate calcualtes as follows: + Decayed learning rate calculates as follows: >>> if staircase == True: >>> decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step)) diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 5b9a5f26b3a28a85eb2f87b72c12adc78d835a4d..23c062a419b4d979ebfb021fb68dac2c74783ccb 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -497,7 +497,7 @@ def warpctc(input, (https://github.com/baidu-research/warp-ctc) to compute Connectionist Temporal Classification (CTC) loss. It can be aliased as softmax with CTC, since a native softmax activation is - interated to the Warp-CTC library to normlize values for each row of the + interated to the Warp-CTC library to normalize values for each row of the input tensor. Args: @@ -523,7 +523,7 @@ def warpctc(input, norm_by_times(bool, default false): Whether to normalize the gradients by the number of time-step, which is also the sequence's length. There is no need to normalize the gradients if warpctc layer was - follewed by a mean_op. + followed by a mean_op. input_length(Variable): The length for each input sequence if it is of Tensor type, it should have shape `[batch_size]` and dtype int64. label_length(Variable): The length for each label sequence if it is @@ -663,12 +663,12 @@ def nce(input, num_neg_samples (int): ${num_neg_samples_comment}. name(str|None): For detailed information, please refer to :ref:`api_guide_Name` . Usually name is no need to set and None by default. - sampler (str, optional): The sampler used to sample class from negtive classes. + sampler (str, optional): The sampler used to sample class from negative classes. It can be 'uniform', 'log_uniform' or 'custom_dist'. default: 'uniform'. custom_dist (nd.array|None): A numpy ndarray with size=num_total_classes. It is used when sampler is set to 'custom_dist'. - custom_dist[i] is the probsbility of i-th class to be sampled. + custom_dist[i] is the probability of i-th class to be sampled. default: None. seed (int, optional): The seed used in sampler. Default 0, means no random seed. is_sparse(bool, optional): The flag indicating whether to use sparse update, @@ -1194,7 +1194,7 @@ def softmax_with_cross_entropy(logits, Label is a ``Tensor`` in the same shape with :attr:`logits`. If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` in the same shape with :attr:`logits` expect shape in dimension :attr:`axis` as 1. - soft_label (bool, optional): A flag to indicate whether to interpretate the given + soft_label (bool, optional): A flag to indicate whether to interpretant the given labels as soft labels. Default False. ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. 
Only valid @@ -1665,7 +1665,7 @@ def mse_loss(input, label): Parameters: input (Variable): Input tensor, the data type should be float32. - label (Variable): Label tensor, the data type shoulf be float32. + label (Variable): Label tensor, the data type should be float32. Returns: Variable: The tensor variable storing the mean square error difference of input and label. diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3136fca1095ad959d7d9e3a0da423781ea70b014..b804984efbdf98af409a91c17a57b2b1f560bce0 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -277,7 +277,7 @@ def fc(input, input (Variable|list of Variable): A Tensor(or LoDTensor) with shape :math:`[N_1, N_2,..., N_k]` or a list of Tensor(or LoDTensor). The dimensions of the input Tensor is at least 2 and the data type should be float32 or float64. - size(int): The number of output units in this layer, which also means the feature size of ouput + size(int): The number of output units in this layer, which also means the feature size of output Tensor(or LoDTensor). num_flatten_dims (int): The fc layer can accept an input Tensor with more than two dimensions. If this happens, the multidimensional tensor will first be flattened @@ -445,7 +445,7 @@ def embedding(input, default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition, user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs to be transformed into numpy format, and the shape of local word - vector shoud be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` + vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` is used to load custom or pre-trained word vectors. See code example 2 for details. dtype(str|core.VarDesc.VarType): It refers to the data type of output Tensor. It must be float32 or float64. Default: float32. @@ -460,7 +460,7 @@ def embedding(input, import numpy as np data = fluid.data(name='x', shape=[None, 1], dtype='int64') - # exampel 1 + # example 1 emb_1 = fluid.embedding(input=data, size=[128, 64]) # example 2: load custom or pre-trained word vectors @@ -819,7 +819,7 @@ def dropout(x, import paddle.fluid as fluid x = fluid.data(name="data", shape=[None, 32, 32], dtype="float32") - droped = fluid.layers.dropout(x, dropout_prob=0.5) + dropped = fluid.layers.dropout(x, dropout_prob=0.5) """ def get_attrs(prog, dropout_prob, is_test, seed): @@ -934,7 +934,7 @@ def chunk_eval(input, a LoDTensor, its shape would be `[N, 1]` where `N` stands for the total sequence lengths in this mini-batch. The data type should be int64. label (Variable): A Tensor or LoDTensor representing the ground-truth labels. - It shoud have the same shape, lod and data type as ``input`` . + It should have the same shape, lod and data type as ``input`` . chunk_scheme (str): Indicate the tagging schemes used here. The value must be IOB, IOE, IOBES or plain. num_chunk_types (int): The number of chunk types. @@ -1090,7 +1090,7 @@ def softmax(input, use_cudnn=False, name=None, axis=-1): Args: input (Variable): The input variable. A multi-dimension ``Tensor`` with type float32 or float64. use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn \ - library is installed. To improve numerical stablity, set use_cudnn to \ + library is installed. 
To improve numerical stability, set use_cudnn to \ False by default. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Default: None. will be named automatically. Default: None. @@ -1215,7 +1215,7 @@ def conv2d(input, If stride is a tuple, it must contain two integers, (stride_height, stride_width). Otherwise, stride_height = stride_width = stride. Default: stride = 1. padding (string|int|list|tuple): The padding size. It means the number of zero-paddings - on both sides for each dimention.If `padding` is a string, either 'VALID' or + on both sides for each dimension.If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_height, pad_width]` or `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when @@ -1483,7 +1483,7 @@ def conv3d(input, tuple, it must contain three integers, (stride_depth, stride_height, stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. padding (string|int|list|tuple): The padding size. It means the number of zero-paddings - on both sides for each dimention. If `padding` is a string, either 'VALID' or + on both sides for each dimension. If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, @@ -2171,7 +2171,7 @@ def adaptive_pool2d(input, # average adaptive pool2d # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], - # output shape is [N, C, m, n], adaptive pool divide H and W dimentions + # output shape is [N, C, m, n], adaptive pool divide H and W dimensions # of input data into m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: @@ -2193,7 +2193,7 @@ def adaptive_pool2d(input, # max adaptive pool2d # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], - # output shape is [N, C, m, n], adaptive pool divide H and W dimentions + # output shape is [N, C, m, n], adaptive pool divide H and W dimensions # of input data into m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: @@ -2312,7 +2312,7 @@ def adaptive_pool3d(input, # average adaptive pool3d # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n], - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimentions + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions # of input data into l * m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: @@ -2341,7 +2341,7 @@ def adaptive_pool3d(input, # max adaptive pool3d # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n], - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimentions + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions # of input data into l * m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: @@ -2985,7 +2985,7 @@ def layer_norm(input, omitted. 
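A numpy sketch, separate from the patch hunks, of the adaptive average pooling described above: H and W are divided into m x n grids and each grid is averaged. The floor/ceil grid boundaries used here are the usual convention and are an assumption, not copied from the op's kernel.

.. code-block:: python

    import math
    import numpy as np

    def adaptive_avg_pool2d(x, m, n):
        # x: [N, C, H, W] -> output: [N, C, m, n], averaging each grid cell.
        N, C, H, W = x.shape
        out = np.zeros((N, C, m, n), dtype=x.dtype)
        for i in range(m):
            hs, he = (i * H) // m, math.ceil((i + 1) * H / m)
            for j in range(n):
                ws, we = (j * W) // n, math.ceil((j + 1) * W / n)
                out[:, :, i, j] = x[:, :, hs:he, ws:we].mean(axis=(2, 3))
        return out

    x = np.arange(2 * 3 * 4 * 4, dtype=np.float64).reshape(2, 3, 4, 4)
    print(adaptive_avg_pool2d(x, 2, 2).shape)  # (2, 3, 2, 2)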
If :attr:`shift` is True and :attr:`param_attr` is None, a default :code:`ParamAttr` would be added as bias. The :attr:`bias_attr` is initialized as 0 if it is added. Default: None. - act(str, optional): Activation to be applied to the output of layer normalizaiton. + act(str, optional): Activation to be applied to the output of layer normalization. Default: None. name(str): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . @@ -3026,7 +3026,7 @@ def layer_norm(input, inputs['Scale'] = scale else: if param_attr: - warnings.warn("param_attr is only avaliable with scale is True.") + warnings.warn("param_attr is only available with scale is True.") if shift: assert bias_attr is not False, "bias_attr should not be False when using shift." bias = helper.create_parameter( @@ -3034,7 +3034,7 @@ def layer_norm(input, inputs['Bias'] = bias else: if bias_attr: - warnings.warn("bias_attr is only avaliable with shift is True.") + warnings.warn("bias_attr is only available with shift is True.") # create output mean_out = helper.create_variable_for_type_inference( @@ -3085,7 +3085,7 @@ def group_norm(input, attribute. If a bool type, only False is supported, which means there is no bias parameter. Default: None, the default bias parameter attribute is used. For more information, please refer to :ref:`api_guide_ParamAttr` . - act(str, optional): Activation to be applied to the output of group normalizaiton. + act(str, optional): Activation to be applied to the output of group normalization. data_layout(str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -3174,7 +3174,7 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None): and W is the product result of remaining dimensions. Step 2: - :attr:`power_iters` shoule be a positive interger, do following + :attr:`power_iters` should be a positive integer, do following calculations with U and V for :attr:`power_iters` rounds. Calculations as follows: @@ -3944,7 +3944,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. x = fluid.data(name='x', shape=[2, 4], dtype='float32') fluid.layers.reduce_mean(x) # [0.4375] fluid.layers.reduce_mean(x, dim=0) # [0.15, 0.25, 0.55, 0.8] @@ -3954,7 +3954,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): # y is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1.0, 2.0], [3.0, 4.0]], # [[5.0, 6.0], [7.0, 8.0]]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32') fluid.layers.reduce_mean(y, dim=[1, 2]) # [2.5, 6.5] fluid.layers.reduce_mean(y, dim=[0, 1]) # [4.0, 5.0] @@ -4015,7 +4015,7 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. 
x = fluid.data(name='x', shape=[2, 4], dtype='float32') fluid.layers.reduce_max(x) # [0.9] fluid.layers.reduce_max(x, dim=0) # [0.2, 0.3, 0.6, 0.9] @@ -4025,7 +4025,7 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): # y is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1.0, 2.0], [3.0, 4.0]], # [[5.0, 6.0], [7.0, 8.0]]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32') fluid.layers.reduce_max(y, dim=[1, 2]) # [4.0, 8.0] fluid.layers.reduce_max(y, dim=[0, 1]) # [7.0, 8.0] @@ -4076,7 +4076,7 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. x = fluid.data(name='x', shape=[2, 4], dtype='float32') fluid.layers.reduce_min(x) # [0.1] fluid.layers.reduce_min(x, dim=0) # [0.1, 0.2, 0.5, 0.7] @@ -4086,7 +4086,7 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): # y is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1.0, 2.0], [3.0, 4.0]], # [[5.0, 6.0], [7.0, 8.0]]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32') fluid.layers.reduce_min(y, dim=[1, 2]) # [1.0, 5.0] fluid.layers.reduce_min(y, dim=[0, 1]) # [1.0, 2.0] @@ -4115,7 +4115,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): input (Variable): The input variable which is a Tensor, the data type is float32, float64, int32, int64. dim (list|int, optional): The dimensions along which the product is performed. If - :attr:`None`, multipy all elements of :attr:`input` and return a + :attr:`None`, multiply all elements of :attr:`input` and return a Tensor variable with a single element, otherwise must be in the range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. @@ -4137,7 +4137,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. x = fluid.data(name='x', shape=[2, 4], dtype='float32') fluid.layers.reduce_prod(x) # [0.0002268] fluid.layers.reduce_prod(x, dim=0) # [0.02, 0.06, 0.3, 0.63] @@ -4148,7 +4148,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): # y is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1.0, 2.0], [3.0, 4.0]], # [[5.0, 6.0], [7.0, 8.0]]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32') fluid.layers.reduce_prod(y, dim=[1, 2]) # [24.0, 1680.0] fluid.layers.reduce_prod(y, dim=[0, 1]) # [105.0, 384.0] @@ -4845,7 +4845,7 @@ def ctc_greedy_decoder(input, in result were empty, the result LoDTensor will be [-1] with empty \ LoD [[]]. 
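The sample outputs quoted in the reduce_* docstrings above can be reproduced directly with numpy (up to float rounding); this check is separate from the patch hunks.

.. code-block:: python

    import numpy as np

    x = np.array([[0.2, 0.3, 0.5, 0.9],
                  [0.1, 0.2, 0.6, 0.7]])
    print(x.mean())          # 0.4375               -> reduce_mean(x)
    print(x.mean(axis=0))    # [0.15 0.25 0.55 0.8] -> reduce_mean(x, dim=0)
    print(x.max(axis=0))     # [0.2 0.3 0.6 0.9]    -> reduce_max(x, dim=0)
    print(x.prod())          # ~0.0002268           -> reduce_prod(x)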
- For padding mode, returns a tuple of (output, output_length), which was describled as below: + For padding mode, returns a tuple of (output, output_length), which was described as below: output, 2-D Tensor, shape is [batch_size, N], data type is int64. @@ -5039,7 +5039,7 @@ def im2sequence(input, is :math:`[batchsize, 2]` . It is just for batch inference when not None. Default is None. out_stride(int32 | List[int32]): The scaling of image through CNN. It is valid only when input_image_size is not None. - If out_stride is List, it must contain two intergers, + If out_stride is List, it must contain two integers, :math:`[out\_stride\_height, out\_stride\_W]` . Otherwise, the out_stride_height = out_stride_width = out_stride. Default is 1. @@ -5254,7 +5254,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`. It takes the first dimension of :attr:`x` and :attr:`y` as batch size. For each instance, it computes the smooth L1 loss element by element first - and then sums all the losses. So the shape of ouput Variable is + and then sums all the losses. So the shape of output Variable is [batch_size, 1]. Args: @@ -5479,7 +5479,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): When ``shape`` and ``actual_shape`` are set at the same time, ``actual_shape`` has a higher priority than ``shape`` but at this time ``shape`` can only be an integer list or tuple, and ``shape`` still should be set correctly to - gurantee shape inference in compile-time. + guarantee shape inference in compile-time. Some tricks exist when specifying the target shape. @@ -5632,7 +5632,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): else: assert dim_size > 0, ( "Each dimension value of 'shape' in reshape must not " - "be negtive except one unknown dimension. " + "be negative except one unknown dimension. " "But received shape[%d] = %s." % (dim_idx, str(dim_size))) return attrs_shape @@ -6163,7 +6163,7 @@ def pad_constant_like(x, y, pad_value=0., name=None): Out.shape = (2, 3, 2, 3) Args: - x (Variable): Tensor, its shape spicifies the shape of output. + x (Variable): Tensor, its shape specifies the shape of output. y (Variable): Tensor, its rank is the same with :attr:`x`, and for each dimension :math:`i` , :math:`y\_shape[i] <= x\_shape[i]` . The data type can be float32 or float64. pad_value (float): The constant value used to pad. @@ -6498,7 +6498,7 @@ def image_resize(input, 'NEAREST' : Nearest neighbor interpolation Nearest neighbor interpolation is to perform nearest neighbor interpolation - in both the 3rd dimention(in height direction) and the 4th dimention(in width + in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. Bilinear interpolation is an extension of linear interpolation for @@ -6512,7 +6512,7 @@ def image_resize(input, H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. - Align_corners and align_mode are optinal parameters,the calculation method + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. Example: @@ -6629,7 +6629,7 @@ def image_resize(input, will be deprecated. When using actual_shape to specify output shape, one of :attr:`out_shape` and :attr:`scale` should also be set, otherwise - errors would be occured in graph constructing stage. 
+ errors would be occurred in graph constructing stage. Default: None align_corners(bool) : An optional bool, If True, the centers of the 4 corner pixels of the input and output tensors are aligned, preserving the values at the @@ -6659,7 +6659,7 @@ def image_resize(input, ValueError: out_shape length should be 2 for input 4-D tensor. ValueError: out_shape length should be 3 for input 5-D tensor. ValueError: scale should be greater than zero. - TypeError: align_corners shoule be a bool value + TypeError: align_corners should be a bool value ValueError: align_mode can only be '0' or '1' ValueError: data_format can only be 'NCHW', 'NHWC', 'NCDHW' or 'NDHWC'. @@ -6897,7 +6897,7 @@ def resize_bilinear(input, For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation - Align_corners and align_mode are optinal parameters,the calculation + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. Example: @@ -6954,7 +6954,7 @@ def resize_bilinear(input, will be deprecated. When using actual_shape to specify output shape, one of :attr:`out_shape` and :attr:`scale` should also be set, otherwise - errors would be occured in graph constructing stage. + errors would be occurred in graph constructing stage. Default: None align_corners(bool): ${align_corners_comment} align_mode(bool): ${align_mode_comment} @@ -7059,7 +7059,7 @@ def resize_trilinear(input, For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation - Align_corners and align_mode are optinal parameters,the calculation + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. Example: @@ -7118,7 +7118,7 @@ def resize_trilinear(input, will be deprecated. When using actual_shape to specify output shape, one of :attr:`out_shape` and :attr:`scale` should also be set, otherwise - errors would be occured in graph constructing stage. + errors would be occurred in graph constructing stage. Default: None align_corners(bool): ${align_corners_comment} align_mode(bool): ${align_mode_comment} @@ -7272,7 +7272,7 @@ def resize_nearest(input, will be deprecated. When using actual_shape to specify output shape, one of :attr:`out_shape` and :attr:`scale` should also be set, otherwise - errors would be occured in graph constructing stage. + errors would be occurred in graph constructing stage. Default: None align_corners(bool): ${align_corners_comment} data_format (str, optional): Specify the data format of the input, and the data format of the output @@ -7581,7 +7581,7 @@ def scatter(input, index, updates, name=None, overwrite=True): Args: input (Variable): The input N-D Tensor with rank>=1. Data type can be float32. index (Variable): The index 1-D Tensor. Data type can be int32, int64. The length of index cannot exceed updates's length, and the value in index cannot exceed input's length. - updates (Variable): update input with updates parameter based on index. shape should be the same as input, and dim value with dim > 1 shoule be the same as input. + updates (Variable): update input with updates parameter based on index. shape should be the same as input, and dim value with dim > 1 should be the same as input. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . 
overwrite (bool): The mode that updating the output when there are same indices. If True, use the overwrite mode to update the output of the same index, @@ -8169,13 +8169,13 @@ def crop_tensor(x, shape=None, offsets=None, name=None): x (Variable): 1-D to 6-D Tensor, the data type is float32, float64, int32 or int64. shape (list|tuple|Variable): The output shape is specified by `shape`. Its data type is int32. If a list/tuple, it's length must be - the same as the dimension size of `x`. If a Variable, it shoule be a 1-D Tensor. + the same as the dimension size of `x`. If a Variable, it should be a 1-D Tensor. When it is a list, each element can be an integer or a Tensor of shape: [1]. If Variable contained, it is suitable for the case that the shape may be changed each iteration. offsets (list|tuple|Variable, optional): Specifies the cropping offsets at each dimension. Its data type is int32. If a list/tuple, it's length - must be the same as the dimension size of `x`. If a Variable, it shoule be a 1-D + must be the same as the dimension size of `x`. If a Variable, it should be a 1-D Tensor. When it is a list, each element can be an integer or a Tensor of shape: [1]. If Variable contained, it is suitable for the case that the offsets may be changed each iteration. Default: None, the offsets are 0 at each dimension. @@ -9357,7 +9357,7 @@ def expand(x, expand_times, name=None): else: attrs_expand_times.append(times) assert times > 0, ( - "Each element given in expand_times must not be negtive.") + "Each element given in expand_times must not be negative.") return attrs_expand_times def get_new_expand_times_tensor(list_expand_times): @@ -11198,7 +11198,7 @@ def logical_not(x, out=None, name=None): # Graph organizing x = fluid.layers.data(name='x', shape=[2], dtype='bool') res = fluid.layers.logical_not(x) - # The comment lists another availble method. + # The comment lists another available method. # res = fluid.layers.fill_constant(shape=[2], dtype='bool', value=0) # fluid.layers.logical_not(x, out=res) @@ -11495,7 +11495,7 @@ def space_to_depth(x, blocksize, name=None): dimension. The attr blocksize indicates the input block size. - space_to_depth will reorgnize the elements of input with shape[batch, channel, height, width] \ + space_to_depth will reorganize the elements of input with shape[batch, channel, height, width] \ according to blocksize to construct output with shape \ [batch, channel * blocksize * blocksize, height/blocksize, width/blocksize]: @@ -11848,11 +11848,11 @@ def hash(input, hash_size, num_hash=1, name=None): def grid_sampler(x, grid, name=None): """ This operation samples input X by using bilinear interpolation based on - flow field grid, which is usually gennerated by :code:`affine_grid` . The grid of + flow field grid, which is usually generated by :code:`affine_grid` . The grid of shape [N, H, W, 2] is the concatenation of (x, y) coordinates with shape [N, H, W] each, where x is indexing the 4th dimension - (in width dimension) of input data x and y is indexng the 3rd - dimention (in height dimension), finally results is the bilinear + (in width dimension) of input data x and y is indexing the 3rd - dimension (in height dimension), finally results is the bilinear interpolation value of 4 nearest corner points. The output tensor shape will be [N, C, H, W]. @@ -12223,7 +12223,7 @@ def shuffle_channel(x, group, name=None): Args: x(Variable): The input tensor variable.
It should be a 4-D tensor with shape [N, C, H, W] - group(int): Indicating the conuts of subgroups, It should divide the number of channels. + group(int): Indicating the counts of subgroups, It should divide the number of channels. Returns: out(Variable): the channels shuffling result is a tensor variable with the @@ -12687,7 +12687,7 @@ def prroi_pool(input, pooled_height (integer): The pooled output height. Default: 1. pooled_width (integer): The pooled output width. Default: 1. batch_roi_nums (Variable): The number of roi for each image in batch. It - shoule be 1-D Tensor, with shape [N] and dtype int64, + should be 1-D Tensor, with shape [N] and dtype int64, where N is the batch size. Default: None. Be note: The lod of input should be empty when batch_roi_nums has values; name (str, default None): The name of this operation. @@ -12857,7 +12857,7 @@ def continuous_value_model(input, cvm, use_cvm=True): :attr:`input` is an embedding vector including show and click value, whose shape is :math:`[N, D]` (N is batch size. D is `2 + embedding dim` ). Show and click at first two dims of embedding vector D. - If :attr:`use_cvm` is True, it will caculate :math:`log(show)` and :math:`log(click)` , and output shape is :math:`[N, D]` . + If :attr:`use_cvm` is True, it will calculate :math:`log(show)` and :math:`log(click)` , and output shape is :math:`[N, D]` . If :attr:`use_cvm` is False, it will remove show and click from :attr:`input` , and output shape is :math:`[N, D - 2]` . :attr:`cvm` is show_click info, whose shape is :math:`[N, 2]` . @@ -13019,7 +13019,7 @@ def unique(x, dtype='int32'): def unique_with_counts(x, dtype='int32'): """ - This OP return a unique tensor for `x` , and count tensor that the count of unqiue result in raw input, \ + This OP return a unique tensor for `x` , and count tensor that the count of unique result in raw input, \ and an index tensor pointing to this unique tensor. **NOTICE**: This op support the variable type of Tensor only. @@ -13032,7 +13032,7 @@ def unique_with_counts(x, dtype='int32'): tuple, the variable type in tuple is Tensor, the output :attr:`out` data type is the same as input :attr:`x`, \ and data type of output :attr:`index` and :attr:`count` will be int32 or int64.: The :attr:`out` is unique tensor for input :attr:`x`,\ the data shape is :math:`[K]`, the `K` may be different to the `N` in shape of :attr:`x`. :attr:`index` is an index tensor pointing\ - to :attr:`out`, the data shape is :math:`[N]` , the data shape is the same as input :attr:`x`. :attr:`count` is count of unqiue element in\ + to :attr:`out`, the data shape is :math:`[N]` , the data shape is the same as input :attr:`x`. :attr:`count` is count of unique element in\ the :attr:`x`, the data shape is :math:`[K]`, the data shape is the same as output :attr:`out`. Examples: @@ -13163,7 +13163,7 @@ def deformable_conv(input, deformable_groups (int): The number of deformable group partitions. Default: deformable_groups = 1. im2col_step (int): Maximum number of images per im2col computation; - The total batch size should be divisable by this value or smaller + The total batch size should be divisible by this value or smaller than this value; if you face out of memory problem, you can try to use a smaller value here. Default: im2col_step = 64. @@ -13298,7 +13298,7 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): This op returns a col buffer of sliding local blocks of input x, also known as im2col for batched 2D image tensors.
For each block under the convolution filter, - all element will be rearranged as a column. While the convolution filter silding over + all element will be rearranged as a column. While the convolution filter sliding over the input feature map, a series of such columns will be formed. For each input :math:`x` with shape [N, C, H, W], the output shape [N, Cout, Lout] @@ -13335,7 +13335,7 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): padding was given, [padding, padding, padding, padding] will be used. For default, paddings will be [0, 0, 0, 0] dilations(int|list): the dilations of convolution kernel, shold be - [dilation_h, dilation_w], or an integer dialtion treated as + [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation]. For default, it will be [1, 1]. name(str, optional): The default value is None. Normally there is no need for user to set this property. @@ -13344,7 +13344,7 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): Returns: The tensor variable corresponding to the sliding local blocks. - The output shape is [N, Cout, Lout] as decribled above. + The output shape is [N, Cout, Lout] as described above. Cout is the total number of values within each block, and Lout is the total number of such blocks. The data type of output is the same as the input :math:`x` @@ -13462,7 +13462,7 @@ def deformable_roi_pooling(input, Equals the reciprocal of total stride in convolutional layers, Default: 1.0. group_size (list|tuple): The number of groups which input channels are divided and the input is list or tuple, which value type is int32. (eg.number of input channels is k1 * k2 * (C + 1), which k1 and k2 are group width and height and C+1 is number of output - chanels.) eg.(4, 6), which 4 is height of group and 6 is width of group. Default: [1, 1]. + channels.) eg.(4, 6), which 4 is height of group and 6 is width of group. Default: [1, 1]. pooled_height (int): The pooled output height which value type is int32. Default: 1. pooled_width (int): The pooled output width which value type is int32. Default: 1. part_size (list|tuple): The height and width of offset which values in list or tuple is int32, eg.(4, 6), which height is 4 and width is 6, and values always equal to pooled_height \ @@ -13470,7 +13470,7 @@ def deformable_roi_pooling(input, sample_per_part (int): The number of samples in each bin which value type is int32. If value is bigger, it will consume more performance. Default: 1. trans_std (float): Coefficient of offset which value type is float32. It controls weight of offset. Default: 0.1. position_sensitive (bool): Whether to choose deformable psroi pooling mode or not, and value type is bool(True or False). If value is False, input dimension equals to output dimension. \ - If value is True, input dimension shoule be output dimension * pooled_height * pooled_width. Default: False. + If value is True, input dimension should be output dimension * pooled_height * pooled_width. Default: False. name (str|None): Name of layer. Default: None. Returns: Variable: Output of deformable roi pooling is that, if position sensitive is False, input dimension equals to output dimension. If position sensitive is True,\ @@ -13602,10 +13602,10 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): Args: - **input** (Variable): Input indices, last dimension must be 1. - - **index_num** (scalar): An interger defining the range of the index.
+ - **index_num** (scalar): An integer defining the range of the index. - **nshards** (scalar): The number of shards - **shard_id** (scalar): The index of the current shard - - **ignore_value** (scalar): An ingeter value out of sharded index range + - **ignore_value** (scalar): An integer value out of sharded index range Returns: Variable: The sharded index of input. @@ -13810,7 +13810,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0): Variable: A Tensor of the specified shape filled with uniform_random values. Raises: - TypeError: The shape type should be list or tupple or variable. + TypeError: The shape type should be list or tuple or variable. Examples: .. code-block:: python @@ -13864,7 +13864,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0): else: attrs_shape.append(dim_size) assert dim_size > 0, ( - "Each dimension size given in shape must not be negtive " + "Each dimension size given in shape must not be negative " "except one unknown dimension.") return attrs_shape diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 97d8a5bb3cd12fea2149031f201b50d727e04948..5951d8699813408df4601e0bdce7078a4beca978 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -142,7 +142,7 @@ The cumulative sum of the elements along a given axis. By default, the first ele Args: x (Variable): Input of cumsum operator, the Tensor/LoDTensor needed to be cumsumed. - axis (int, optional): The dimenstion to accumulate along. -1 means the last dimenstion. Default is -1. + axis (int, optional): The dimension to accumulate along. -1 means the last dimension. Default is -1. exclusive (bool, optional): Whether to perform exclusive cumsum. Default is False. reverse (bool, optional): If true, the cumsum is performed in the reversed direction. Default is False. diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 43409c80f3ace1ab907b6230dab9f8a1a0832a23..dd274233a6b2eaaf5411e5b8504ef64dc4ea93c3 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -99,11 +99,11 @@ class RNNCell(object): batch_ref: A (possibly nested structure of) tensor variable[s]. The first dimension of the tensor will be used as batch size to initialize states. - shape: A (possiblely nested structure of) shape[s], where a shape is + shape: A (possibly nested structure of) shape[s], where a shape is represented as a list/tuple of integer). -1(for batch size) will beautomatically inserted if shape is not started with it. If None, property `state_shape` will be used. The default value is None. - dtype: A (possiblely nested structure of) data type[s]. The structure + dtype: A (possibly nested structure of) data type[s]. The structure must be same as that of `shape`, except when all tensors' in states has the same data type, a single data type can be used. If None and property `cell.state_shape` is not available, float32 will be used @@ -171,7 +171,7 @@ class RNNCell(object): """ Abstract method (property). Used to initialize states. - A (possiblely nested structure of) shape[s], where a shape is represented + A (possibly nested structure of) shape[s], where a shape is represented as a list/tuple of integers (-1 for batch size would be automatically inserted into a shape if shape is not started with it). Not necessary to be implemented if states are not initialized by @@ -186,9 +186,9 @@ class RNNCell(object): """ Abstract method (property). 
Used to initialize states. - A (possiblely nested structure of) data types[s]. The structure must be + A (possibly nested structure of) data types[s]. The structure must be same as that of `shape`, except when all tensors' in states has the same - data type, a signle data type can be used. + data type, a single data type can be used. Not necessary to be implemented if states are not initialized by `get_initial_states` or the `dtype` argument is provided when using `get_initial_states`. @@ -356,7 +356,7 @@ class LSTMCell(RNNCell): inputs(Variable): A tensor with shape `[batch_size, input_size]`, corresponding to :math:`x_t` in the formula. The data type should be float32. - states(Variable): A list of containing two tensers, each shaped + states(Variable): A list of containing two tensors, each shaped `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. The data type should be float32. @@ -391,7 +391,7 @@ def rnn(cell, **kwargs): """ rnn creates a recurrent neural network specified by RNNCell `cell`, - which performs :code:`cell.call()` repeatedly until reachs to the maximum + which performs :code:`cell.call()` repeatedly until reaches to the maximum length of `inputs`. Parameters: @@ -408,7 +408,7 @@ def rnn(cell, sequence_length(Variable, optional): A tensor with shape `[batch_size]`. It stores real length of each instance, thus enables users to extract the last valid state when past a batch element's sequence length for - correctness. If not provided, the padddings would be treated same as + correctness. If not provided, the paddings would be treated same as non-padding inputs. Default None. time_major(bool, optional): Indicate the data layout of Tensor included in `input` and `output` tensors. If `False`, the data layout would @@ -590,7 +590,7 @@ class Decoder(object): :math:`[time\_step, batch\_size, ...]` , which is done by the caller. final_states(Variable): A (possibly nested structure of) tensor variable[s]. It is the `next_states` returned by `decoder.step` at last decoding step, - thus has the same structrue, shape and data type with states at any time + thus has the same structure, shape and data type with states at any time step. Returns: @@ -664,7 +664,7 @@ class BeamSearchDecoder(Decoder): **Note that fluid.embedding should be used here rather than fluid.layers.embedding, since shape of ids is [batch_size, beam_size]. when using fluid.layers.embedding, must unsqueeze in embedding_fn.** - If not provided, the id to embedding transfomation must be built into + If not provided, the id to embedding transformation must be built into `cell.call`. Default None. output_fn(optional): A callable to apply to the cell's output prior to calculate scores and select candidate token ids. Default None. @@ -687,7 +687,7 @@ class BeamSearchDecoder(Decoder): `beam_size` times. Parameters: - x(Variable): A tenosr with shape `[batch_size, ...]`. The data type + x(Variable): A tensor with shape `[batch_size, ...]`. The data type should be float32, float64, int32, int64 or bool. beam_size(int): The beam width used in beam search. @@ -716,7 +716,7 @@ class BeamSearchDecoder(Decoder): tensor with shape `[batch_size, beam_size, ...]`. Parameters: - x(Variable): A tenosr with shape `[batch_size * beam_size, ...]`. The + x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The data type should be float32, float64, int32, int64 or bool. Returns: @@ -732,7 +732,7 @@ class BeamSearchDecoder(Decoder): tensor with shape `[batch_size * beam_size, ...]`. 
Parameters: - x(Variable): A tenosr with shape `[batch_size, beam_size, ...]`. The + x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The data type should be float32, float64, int32, int64 or bool. Returns: @@ -1030,7 +1030,7 @@ class BeamSearchDecoder(Decoder): `[time_step, batch_size, ...]`, which is done by the caller. final_states(Variable): A structure(namedtuple) of tensor variables. It is the `next_states` returned by `decoder.step` at last - decoding step, thus has the same structrue, shape and data type + decoding step, thus has the same structure, shape and data type with states at any time step. sequence_lengths(Variable): An `int64` tensor shaped `[batch_size, beam_size]`. It contains sequence lengths for each beam determined during @@ -1059,7 +1059,7 @@ def dynamic_decode(decoder, """ Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned Tensor indicating finished status contains all True values or the number of - decoding step reachs to :attr:`max_step_num`. + decoding step reaches to :attr:`max_step_num`. :code:`decoder.initialize()` would be called once before the decoding loop. If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()` @@ -1074,7 +1074,7 @@ def dynamic_decode(decoder, Tensor by :code:`decoder.step()` indicating finished status contains all True. Default `None`. output_time_major(bool, optional): Indicate the data layout of Tensor included - in the final outpus(the first returned value of this method). If + in the final outputs(the first returned value of this method). If attr:`False`, the data layout would be batch major with shape `[batch_size, seq_len, ...]`. If attr:`True`, the data layout would be time major with shape `[seq_len, batch_size, ...]`. Default: `False`. @@ -2080,7 +2080,7 @@ def lstm(input, name (str, optional): A name for this layer. If set None, the layer will be named automatically. Default: None. default_initializer(Initializer, optional): Where use initializer to initialize the Weight - If set None, defaule initializer will be used. Default: None. + If set None, default initializer will be used. Default: None. seed(int, optional): Seed for dropout in LSTM, If it's -1, dropout will use random seed. Default: 1. @@ -2365,9 +2365,9 @@ def dynamic_lstmp(input, inputs['C0'] = c_0 if cell_clip: - assert cell_clip >= 0, "cell_clip should not be negtive." + assert cell_clip >= 0, "cell_clip should not be negative." if proj_clip: - assert proj_clip >= 0, "proj_clip should not be negtive." + assert proj_clip >= 0, "proj_clip should not be negative." helper.append_op( type='lstmp', @@ -2628,7 +2628,7 @@ def gru_unit(input, Returns: tuple: The tuple contains three Tensor variables with the same data type \ as ``input`` . They represent the hidden state for next time step ( :math:`h_t` ), \ - reseted previous hidden state ( :math:`r_t \odot h_{t-1}` ), and the \ + reset previous hidden state ( :math:`r_t \odot h_{t-1}` ), and the \ concatenation of :math:`h_t, r_t, \\tilde{h_t}` . And they have shape \ :math:`[N, D]` , :math:`[N, D]` , :math:`[N, D \times 3]` separately. \ Usually only the hidden state for next time step ( :math:`h_t` ) is used \ @@ -2716,7 +2716,7 @@ def beam_search(pre_ids, scores calculation to perform beam search for one time step. Specifically, after ``ids`` and ``scores`` have been produced, it selects the top-K ( `k` is ``beam_size`` ) candidate word ids of current step from ``ids`` - according to the correspongding ``scores``. 
Additionally, ``pre_id`` and + according to the corresponding ``scores``. Additionally, ``pre_id`` and ``pre_scores`` are the output of `beam_search` at previous step, they are needed for special use to handle ended candidate translations. @@ -2750,7 +2750,7 @@ def beam_search(pre_ids, ids. scores(Variable): A LodTensor variable containing the accumulated scores corresponding to ``ids`` . Both its shape and lod are same as - thoes of ``ids`` . The data type should be float32. + those of ``ids`` . The data type should be float32. beam_size(int): The beam width used in beam search. end_id(int): The id of end token. level(int): **It can be ignored and mustn't change currently.** @@ -2883,7 +2883,7 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): Returns: tuple: The tuple contains two LodTensor variables. The two LodTensor, \ - containing the full sequences of ids and the correspongding accumulated \ + containing the full sequences of ids and the corresponding accumulated \ scores, have the same shape flattened to 1D and have the same 2 level \ lod. The lod can be used to get how many predicted sequences each sample \ has and how many ids each predicted sequence has. diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py index 2a4fe0b69f66c3f75c211f10aa3be48cccbcf478..74823900cf5138bca4ba39fb404a36b5db6899dc 100644 --- a/python/paddle/fluid/layers/sequence_lod.py +++ b/python/paddle/fluid/layers/sequence_lod.py @@ -109,7 +109,7 @@ def sequence_conv(input, the same as input whether :attr:`padding` is set true or false. Because the length of input sequence may be shorter than :attr:`filter\_size`, which will cause the convolution result to not be computed correctly. These padding data will not be trainable or updated - while trainnig. Default: True. + while training. Default: True. padding_start (int): It is used to indicate the start index for padding the input sequence, which can be negative. The negative number means to pad :attr:`|padding_start|` time-steps of all-zero data at the beginning of each instance. @@ -626,7 +626,7 @@ def sequence_expand(x, y, ref_level=-1, name=None): ref_level: 0 then output is a 1-level LoDTensor out: - out.lod = [[2, 2, 2, 2]] #lod based on offfset + out.lod = [[2, 2, 2, 2]] #lod based on offset out.data = [[a], [b], [a], [b], [c], [d], [c], [d]] out.dims = [8, 1] @@ -844,7 +844,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): to ``maxlen``). The padding value is defined by ``pad_value``, and will be \ appended to the tail of sequences. The result is a Python tuple ``(Out, Length)``: \ the LodTensor ``Out`` is the padded sequences, and LodTensor ``Length`` is \ - the length information of input sequences. For removing paddding data (unpadding \ + the length information of input sequences. For removing padding data (unpadding \ operation), See :ref:`api_fluid_layers_sequence_unpad` . Please note that the input ``x`` should be LodTensor. 
@@ -869,7 +869,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): x.data = [[a1,a2],[b1,b2],[c1,c2],[d1,d2],[e1,e2]] pad_value: pad_value.data = [0] - defualt maxlen = None, (the virtual value is 3, according to the shape of x) + default maxlen = None, (the virtual value is 3, according to the shape of x) the output tuple (Out, Length): Out.data = [[[a1,a2],[b1,b2],[0,0]],[[c1,c2],[d1,d2],[e1,e2]]] @@ -881,7 +881,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): x.data = [[a1,a2],[b1,b2],[c1,c2],[d1,d2],[e1,e2]] pad_value: pad_value.data = [p1,p2] - defualt maxlen = None, (the virtual value is 3) + default maxlen = None, (the virtual value is 3) get tuple (Out, Length): Out.data = [[[a1,a2],[b1,b2],[p1,p2]],[[c1,c2],[d1,d2],[e1,e2]]] @@ -891,7 +891,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): Args: x (Variable): Input 1-level LodTensor with dims ``[M, K]``. The batch \ - size is described by lod infor (the number of sequnces ). \ + size is described by lod infor (the number of sequences ). \ The data type should be float32, float64, int8, int32 or int64. pad_value (Variable): Padding value. It can be a scalar or a 1D tensor \ with length ``K``. If it's a scalar, it will be automatically broadcasted \ @@ -962,7 +962,7 @@ def sequence_unpad(x, length, name=None): [ 6.0, 7.0, 8.0, 9.0, 10.0], [11.0, 12.0, 13.0, 14.0, 15.0]], - in which there are 3 sequences padded to length 5, and the acutal length + in which there are 3 sequences padded to length 5, and the actual length specified by input Variable **length**: length.data = [2, 3, 4], @@ -1077,7 +1077,7 @@ def sequence_scatter(input, index, updates, name=None): **The index and updates parameters of the OP must be LoDTensor.** - Plus the updates data to the correspoding input according to the index. + Plus the updates data to the corresponding input according to the index. The updated algorithm is as follows: output[instance_index][index [pos]] = input[instance_index][index [pos]] + updates[pos], where instance_idx is the K sample corresponding to pos in batch. diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index c8b8e634137a0513bec53107f55ca2571879f014..5c467f2d36df07483d5889a15dbcce05efdc9594 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -512,9 +512,9 @@ def assign(input, output=None): def fill_constant(shape, dtype, value, force_cpu=False, out=None): """ This OP creates a Tensor with specified `shape` and `dtype`, and - initializes it with a constant specifed by `value`. + initializes it with a constant specified by `value`. - The attribute `stop_gradient` of the created Tensor is setted to True. + The attribute `stop_gradient` of the created Tensor is set to True. Args: shape(list|tuple|Variable): Shape of the Tensor to be created. @@ -524,7 +524,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output tensor which can be float16, float32, float64, int32, int64. value(float): The constant value used to initialize the Tensor to be created. - force_cpu(True): data should be on CPU if it's true, defalut value is False. + force_cpu(True): data should be on CPU if it's true, default value is False. out(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of operation. if out is None, a new Varibale will be create to store the result. 
@@ -686,7 +686,7 @@ def fill_constant_batch_size_like(input, The default value is 0. output_dim_idx(int): Used to specify which dimension of Tensor is created to be set the value of batch_size of input Tensor. The default value is 0. - force_cpu(bool): data should be on CPU if it's true, defalut value is False. + force_cpu(bool): data should be on CPU if it's true, default value is False. Returns: Variable: Tensor which will be created according to dtype. @@ -1079,7 +1079,7 @@ def save_combine(x, file_path, overwrite=True): def load_combine(out, file_path): """ - Loads a list of vairables from a single file. + Loads a list of variables from a single file. Args: out(list): The list of variables to be read from the disk file. @@ -1288,7 +1288,7 @@ def zeros_like(x, out=None): x(Variable): The input tensor which specifies shape and dtype, the input data dtype could be bool, float32, float64, int32, int64. out(Variable, optional): If is :attr:`None` , the op will create the variable as output, the data type and shape of \ this variable will be same as input :attr:`x`. If is a tensor, the data type and shape need to be same as input :attr:`x`. - The defalut value is :attr:`None` . + The default value is :attr:`None` . Returns: Variable: The N-D tensor, the element in tensor is related to input data type, if the input data type is bool, \ diff --git a/python/paddle/fluid/log_helper.py b/python/paddle/fluid/log_helper.py index 0933d7b904808a1d5deae1bb5add831cceb0f50e..ab20ed4c48c7a1ed569e03be5d589f3bf1c59b14 100644 --- a/python/paddle/fluid/log_helper.py +++ b/python/paddle/fluid/log_helper.py @@ -31,7 +31,7 @@ def get_logger(name, level, fmt=None): fmt (str): Format of logger output Returns: - logging.Logger: logging logger with given setttings + logging.Logger: logging logger with given settings Examples: .. code-block:: python diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 43c83e6c68b7f42bb8410d7c1a08c7d4fb1d2134..cc9d26037622824d338131a14577e4a23d09cf6e 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -71,7 +71,7 @@ class MetricBase(object): 2. aggregate the existing evaluation results as the overall performance. The class Metric is the base class for all classes in paddle.fluid.metrics, it defines - the fundmental APIs for all metrics classes, including: + the fundamental APIs for all metrics classes, including: 1. update(preds, labels): given the prediction results (preds) and the labels (labels) of some mini-batch, compute the evaluation result of that mini-batch, and memorize the @@ -142,7 +142,7 @@ class MetricBase(object): None Returns: - a python dict, which costains the inner states of the metric instance + a python dict, which contains the inner states of the metric instance Return types: a python dict @@ -275,7 +275,7 @@ class Precision(MetricBase): relevant instances among the retrieved instances. Refer to https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers - Noted that this class mangages the precision score only for binary classification task. + Noted that this class manages the precision score only for binary classification task. Args: name (str, optional): Metric name. For details, please refer to :ref:`api_guide_Name`. Default is None. @@ -358,7 +358,7 @@ class Recall(MetricBase): Refer to: https://en.wikipedia.org/wiki/Precision_and_recall - Noted that this class mangages the recall score only for binary classification task.
+ Noted that this class manages the recall score only for binary classification task. Args: name (str, optional): Metric name. For details, please refer to :ref:`api_guide_Name`. Default is None. @@ -391,7 +391,7 @@ class Recall(MetricBase): def __init__(self, name=None): super(Recall, self).__init__(name) self.tp = 0 # true positive - self.fn = 0 # false negtive + self.fn = 0 # false negative def update(self, preds, labels): """ @@ -529,10 +529,10 @@ class ChunkEvaluator(MetricBase): .. code-block:: python import paddle.fluid as fluid - # init the chunck-level evaluation manager + # init the chunk-level evaluation manager metric = fluid.metrics.ChunkEvaluator() - # suppose the model predict 10 chuncks, while 8 ones are correct and the ground truth has 9 chuncks. + # suppose the model predicts 10 chunks, while 8 ones are correct and the ground truth has 9 chunks. num_infer_chunks = 10 num_label_chunks = 9 num_correct_chunks = 8 @@ -542,7 +542,7 @@ class ChunkEvaluator(MetricBase): print("precision: %.2f, recall: %.2f, f1: %.2f" % (numpy_precision, numpy_recall, numpy_f1)) - # the next batch, predicting 3 prefectly correct chuncks. + # the next batch, predicting 3 perfectly correct chunks. num_infer_chunks = 3 num_label_chunks = 3 num_correct_chunks = 3 diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index f8ad54751fe131c86bf00ce65a97319e6d00fdc0..118b9d60e3b5a09551865d96470706329e567e0b 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -397,7 +397,7 @@ def scaled_dot_product_attention(queries, where :math:`N` stands for batch size, :math:`L_k` for the sequence length of key, :math:`d_v \\times h` for the feature size of value, :math:`h` for head number. The data type should be the same as ``queries`` . - num_heads (int, optional): Indicate the number of head. If the numher + num_heads (int, optional): Indicate the number of head. If the number is 1, linear projection would not be performed on inputs. Default: 1. dropout_rate (float, optional): The rate to drop the attention weight. Default: 0.0, which means no dropout. @@ -410,7 +410,7 @@ def scaled_dot_product_attention(queries, Multi-Head Attention. Raises: - ValueError: Inputs quries, keys and values should all be 3-D tensors. + ValueError: Inputs queries, keys and values should all be 3-D tensors. ValueError: The hidden size of queries and keys should be the same. ValueError: The max sequence length in query batch and in key batch should be the same. ValueError: he hidden size of keys must be divisible by the number of attention heads. @@ -429,7 +429,7 @@ def scaled_dot_product_attention(queries, """ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): raise ValueError( - "Inputs quries, keys and values should all be 3-D tensors.") + "Inputs queries, keys and values should all be 3-D tensors.") if queries.shape[-1] != keys.shape[-1]: raise ValueError( @@ -474,7 +474,7 @@ def scaled_dot_product_attention(queries, def __split_heads(x, num_heads): """ - Reshape the last dimension of inpunt tensor x so that it becomes two + Reshape the last dimension of input tensor x so that it becomes two dimensions.
Args: @@ -496,13 +496,13 @@ def scaled_dot_product_attention(queries, x=x, shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads]) - # permuate the dimensions into: + # permute the dimensions into: # [batch_size, num_heads, max_sequence_len, hidden_size_per_head] return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ - Reshape the last two dimensions of inpunt tensor x so that it becomes + Reshape the last two dimensions of input tensor x so that it becomes one dimension. Args: diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index d9c2cb702f60e235f93155f1735418c71f89e5ba..dd7995c6f7f785c8fb8b0caba271c8bccb54c8ad 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -112,12 +112,12 @@ class Optimizer(object): @framework.dygraph_only def state_dict(self): ''' - Get state dict information from optimizer. It contain all the variable used by optimizer. For Adam opimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict. - If the optimzier never be called(minimize function), the state_dict is empty. + Get state dict information from optimizer. It contain all the variable used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict. + If the optimizer never be called(minimize function), the state_dict is empty. Args: None Return: - state_dict(dict) : dict contains all the variablel used by optimizer + state_dict(dict) : dict contains all the variable used by optimizer Examples: .. code-block:: python @@ -153,7 +153,7 @@ class Optimizer(object): @framework.dygraph_only def set_dict(self, state_dict): ''' - Load optimizer state dict. For Adam opimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. + Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. Args: state_dict(dict) : Dict contains all the Variable needed by optimizer @@ -289,7 +289,7 @@ class Optimizer(object): def current_step_lr(self): """ .. note:: - **This API is ONLY avaliable in Dygraph mode** + **This API is ONLY available in Dygraph mode** Get current step learning rate. The return value is all the same When LearningRateDecay is not used, otherwise return the step learning rate. @@ -1613,7 +1613,7 @@ class AdagradOptimizer(Optimizer): class AdamOptimizer(Optimizer): """ - The Adam optimzier uses an optimization described at the end + The Adam optimizer uses an optimization described at the end of section 2 of `Adam paper `_ , it can dynamically adjusts the learning rate of each parameter using the 1st moment estimates and the 2nd moment estimates of the gradient. diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index a82a75e10bb1ae3c35435401554fcf9a8208843e..dae6d99ee77d7c150aae50fe1081ef927a803a84 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -196,7 +196,7 @@ class WeightNormParamAttr(ParamAttr): Args: dim(int): Dimension over which to compute the norm. Dim is a non-negative number which is less than the rank of weight Tensor. For Example, dim can - be choosed from 0, 1, 2, 3 for convolution whose weight shape is [cout, cin, kh, kw] + be chosen from 0, 1, 2, 3 for convolution whose weight shape is [cout, cin, kh, kw] and rank is 4. 
Default None, meaning that all elements will be normalized. name(str, optional): The parameter's name. Default None, meaning that the name would be created automatically. Please refer to :ref:`api_guide_Name` for more details. diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index c0b0c86ecd2b30be82d46cf3af7bcd13045cff6e..730e9c10a735de3abbc8423de39873c11f31253c 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -312,7 +312,7 @@ def profiler(state, #### Examples Results #### #### 1) sorted_key = 'total', 'calls', 'max', 'min', 'ave' #### - # The only difference in 5 sorted_key results is the following sentense: + # The only difference in 5 sorted_key results is the following sentence: # "Sorted by number of xxx in descending order in the same thread." # The reason is that in this example, above 5 columns are already sorted. -------------------------> Profiling Report <------------------------- diff --git a/python/paddle/fluid/tests/demo/pipeline_train.py b/python/paddle/fluid/tests/demo/pipeline_train.py index 54fa719e29d3ed0cd6ec6fefd388f9ce1f3604c2..bebc0761bf0d64093c15ebd0cadec54e90d179e9 100644 --- a/python/paddle/fluid/tests/demo/pipeline_train.py +++ b/python/paddle/fluid/tests/demo/pipeline_train.py @@ -71,7 +71,7 @@ def parse_args(): parser.add_argument( '--emb_lr_rate', type=float, default=0.5, help='learning rate') parser.add_argument( - '--step', type=int, default=1, help='gnn propogation steps') + '--step', type=int, default=1, help='gnn propagation steps') parser.add_argument( '--lr_dc', type=float, default=0.1, help='learning rate decay rate') parser.add_argument( diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index b8d83323600a7d9ca437ceeafd95fef74bf4f056..c3ff3c0feb788639d86c0df545b942abc1a36caa 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -223,7 +223,7 @@ input_descs = { # The actual data shape of label_word is: # [batch_size * max_trg_len_in_batch, 1] "lbl_word": [(batch_size * seq_len, long_type(1)), "int64"], - # This input is used to mask out the loss of paddding tokens. + # This input is used to mask out the loss of padding tokens. # The actual data shape of label_weight is: # [batch_size * max_trg_len_in_batch, 1] "lbl_weight": [(batch_size * seq_len, long_type(1)), "float32"], @@ -972,7 +972,7 @@ def multi_head_attention(queries, """ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): raise ValueError( - "Inputs: quries, keys and values should all be 3-D tensors.") + "Inputs: queries, keys and values should all be 3-D tensors.") def __compute_qkv(queries, keys, values, n_head, d_key, d_value): """ @@ -997,7 +997,7 @@ def multi_head_attention(queries, def __split_heads(x, n_head): """ - Reshape the last dimension of inpunt tensor x so that it becomes two + Reshape the last dimension of input tensor x so that it becomes two dimensions and then transpose. Specifically, input a tensor with shape [bs, max_sequence_length, n_head * hidden_dim] then output a tensor with shape [bs, n_head, max_sequence_length, hidden_dim]. 
@@ -1011,13 +1011,13 @@ def multi_head_attention(queries, reshaped = layers.reshape( x=x, shape=[0, 0, n_head, hidden_size // n_head]) - # permuate the dimensions into: + # permute the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ - Transpose and then reshape the last two dimensions of inpunt tensor x + Transpose and then reshape the last two dimensions of input tensor x so that it becomes one dimension, which is reverse to __split_heads. """ if len(x.shape) == 3: return x diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 48509067cdef0104ddfe29907fd5b806043306bf..db9e8d2c6bda011bef7c23e7fb51e246137a3906 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -126,7 +126,7 @@ class TestSqrtDoubleGradCheck(unittest.TestCase): class TestSquareDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not inlcude -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py index 52d44d69fae8ad9751a01d68a4ef21db3e7aab46..e86f18a62167b7feab1549072fc296f847c00491 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py @@ -28,7 +28,7 @@ from decorator_helper import prog_scope class TestElementwiseMulDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not inlcude -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 @@ -55,7 +55,7 @@ class TestElementwiseMulDoubleGradCheck(unittest.TestCase): class TestElementwiseMulBroadcastDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not inlcude -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 @@ -82,7 +82,7 @@ class TestElementwiseMulBroadcastDoubleGradCheck(unittest.TestCase): class TestElementwiseAddDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not inlcude -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 @@ -109,7 +109,7 @@ class TestElementwiseAddDoubleGradCheck(unittest.TestCase): class TestElementwiseAddBroadcastDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not inlcude -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 @@ -136,7 +136,7 @@ class TestElementwiseAddBroadcastDoubleGradCheck(unittest.TestCase): class TestElementwiseSubDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. 
+ # the shape of input variable should be clearly specified, not inlcude -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 @@ -163,7 +163,7 @@ class TestElementwiseSubDoubleGradCheck(unittest.TestCase): class TestElementwiseSubBroadcastDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not inlcude -1. shape = [2, 3, 7, 9] eps = 0.005 dtype = np.float64 @@ -190,7 +190,7 @@ class TestElementwiseSubBroadcastDoubleGradCheck(unittest.TestCase): class TestElementwiseDivDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not inlcude -1. shape = [2, 3, 7, 9] eps = 0.0001 dtype = np.float64 @@ -218,7 +218,7 @@ class TestElementwiseDivDoubleGradCheck(unittest.TestCase): class TestElementwiseDivBroadcastDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not inlcude -1. shape = [2, 3, 7, 9] eps = 0.0001 dtype = np.float64 diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 5ce405dccae4cfd66cde471c097698b0869f29fe..fceaa0c14c4dcfb75bcb597cade1dc838a71c065 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -74,7 +74,7 @@ def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores, else: # Avoid sorting possibly large arrays; # First partition to get top K unsorted - # and then sort just thoes + # and then sort just those inds = np.argpartition(-scores.squeeze(), pre_nms_topN)[:pre_nms_topN] order = np.argsort(-scores[inds].squeeze()) order = inds[order] diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index c57d10a24aa0a6e21466dd2255c98a573ec1de35..7e9dad69def1df44f990b1cfb0840d9e8aa862a1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -266,7 +266,7 @@ input_descs = { # The actual data shape of label_word is: # [batch_size * max_trg_len_in_batch, 1] "lbl_word": [(batch_size * seq_len, 1), "int64"], - # This input is used to mask out the loss of paddding tokens. + # This input is used to mask out the loss of padding tokens. # The actual data shape of label_weight is: # [batch_size * max_trg_len_in_batch, 1] "lbl_weight": [(batch_size * seq_len, 1), "float32"], diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py index 27ee3b08a4e6e7fca28f6d50d05fabbcea44b171..8a9204c73fc030047cadb2793e6c25cfb9d66c70 100755 --- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py @@ -45,7 +45,7 @@ class LinearChainCrfForward(object): self.w_exps = transition_exps[2:, :] # The output of linear chain crf operator. 
- # alpha is a memo table in dynamic programming to caculate + # alpha is a memo table in dynamic programming to calculate # nomalization factor. self.alpha = np.zeros( (seq_start_positions[-1], self.tag_num), dtype="float64") diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py index ae11f23299c61789b42eba0f33c528ad92a238e0..b978e721746f72687fe235eef498f0990de8c210 100644 --- a/python/paddle/fluid/tests/unittests/test_nce.py +++ b/python/paddle/fluid/tests/unittests/test_nce.py @@ -131,12 +131,12 @@ class TestNCECase1SelectedRows(unittest.TestCase): @staticmethod def get_train_data(batch_size): - batchs = [] + batches = [] for i in range(batch_size): input = np.random.randn(batch_size, 10).astype(np.float32) labels = np.random.randint(0, 20, (batch_size, 1)) - batchs.append([input, labels]) - return batchs + batches.append([input, labels]) + return batches def get_optimizer(self): # SGD optimizer diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index e38028feea221477e89db162e83c64f32a56292c..c6cfe01dce40458684c7464ca5ebddd389c62cbe 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -104,7 +104,7 @@ class TestReduceMeanWithDimDoubleGradCheck(unittest.TestCase): class TestMulDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): - # the shape of input variable shoule be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not inlcude -1. x_shape = [7, 11] y_shape = [11, 9] eps = 0.005 diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 826737aeefad9d01c4456d41fc97f080c6da789c..3dfd9023f5af30ff289c4dc55a0c275402bc3067 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -329,7 +329,7 @@ class TestReshapeOpError(unittest.TestCase): self.assertRaises(AssertionError, test_shape_2) - # The argument shape have more than one negtive value. + # The argument shape have more than one negative value. 
def test_shape_3(): fluid.layers.reshape(x3, [-1, -2, 5]) diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index d4b92f9849acfb0d50a4118649b4b74afb35feac..ad141c96bbf192c53f3473f13b565734b7669223 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -285,7 +285,7 @@ class TestSaveLoadBase(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -299,7 +299,7 @@ class TestSaveLoadBase(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.load(main_program, "./test_1.pdparams", exe) @@ -394,7 +394,7 @@ class TestSaveLoadPartial(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -408,7 +408,7 @@ class TestSaveLoadPartial(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.load(test_program, "./test_1.pdopt", None) @@ -496,7 +496,7 @@ class TestSaveLoadSetStateDict(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -510,7 +510,7 @@ class TestSaveLoadSetStateDict(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.load(main_program, "./test_1", exe) @@ -605,7 +605,7 @@ class TestProgramStatePartial(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -619,7 +619,7 @@ class TestProgramStatePartial(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) #fluid.load(test_program, "./test_1", None ) @@ -652,7 +652,7 @@ class 
TestProgramStatePartial(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.set_program_state(test_program, program_state_1) @@ -672,7 +672,7 @@ class TestProgramStatePartial(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.set_program_state(test_program, program_state_2) @@ -692,7 +692,7 @@ class TestProgramStatePartial(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.set_program_state(test_program, program_state_3) @@ -777,7 +777,7 @@ class TestVariableInit(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update base_map[var.name] = t for var in program.list_vars(): @@ -868,7 +868,7 @@ class TestLoadFromOldInterface(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -883,7 +883,7 @@ class TestLoadFromOldInterface(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) fluid.load(main_program, "test_path", exe) @@ -984,7 +984,7 @@ class TestLoadFromOldInterfaceSingleFile(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -1000,7 +1000,7 @@ class TestLoadFromOldInterfaceSingleFile(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) file_model_path = os.path.join("test_path", "model_single") @@ -1136,7 +1136,7 @@ class TestProgramStateOldSave(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -1150,7 +1150,7 @@ class 
TestProgramStateOldSave(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) #fluid.load(test_program, "./test_1", None ) @@ -1247,7 +1247,7 @@ class TestProgramStateOldSaveSingleModel(unittest.TestCase): if isinstance(var, framework.Parameter) or var.persistable: t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been update + # make sure all the parameter or optimizer var have been updated self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -1262,7 +1262,7 @@ class TestProgramStateOldSaveSingleModel(unittest.TestCase): new_t = np.array(fluid.global_scope().find_var(var.name) .get_tensor()) - # make sure all the paramerter or optimzier var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) #fluid.load(test_program, "./test_1", None ) diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index 1782d432490c796362590805ab20cad1f6a61359..970eb2daea5687315dcf97cd292f372eb40c0b86 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -57,7 +57,7 @@ def multi_head_attention(queries, """ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): raise ValueError( - "Inputs: quries, keys and values should all be 3-D tensors.") + "Inputs: queries, keys and values should all be 3-D tensors.") def __compute_qkv(queries, keys, values, n_head, d_key, d_value): """ @@ -91,7 +91,7 @@ def multi_head_attention(queries, def __split_heads(x, n_head): """ - Reshape the last dimension of inpunt tensor x so that it becomes two + Reshape the last dimension of input tensor x so that it becomes two dimensions and then transpose. Specifically, input a tensor with shape [bs, max_sequence_length, n_head * hidden_dim] then output a tensor with shape [bs, n_head, max_sequence_length, hidden_dim]. @@ -104,13 +104,13 @@ def multi_head_attention(queries, reshaped = layers.reshape( x=x, shape=[batch_size, -1, n_head, hidden_size // n_head]) - # permuate the dimensions into: + # permute the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ - Transpose and then reshape the last two dimensions of inpunt tensor x + Transpose and then reshape the last two dimensions of input tensor x so that it becomes one dimension, which is reverse to __split_heads. """ if len(x.shape) == 3: return x diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py index dc78ffe70b3dfda75a799583e85b76d8d921e078..08e4056487ad7526225e2ead1a92f92781eae982 100644 --- a/python/paddle/fluid/transpiler/details/program_utils.py +++ b/python/paddle/fluid/transpiler/details/program_utils.py @@ -98,7 +98,7 @@ def op_to_code(op, skip_op_callstack=True): op: A fluid operator. Returns: - string: The foramtted string. + string: The formatted string.
""" outputs_str = "{" diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index f6d754c3a7719d7a4746901760c91bad501a70d3..31177cc2c613be323d3e47983a1478b0d1fb9071 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -160,7 +160,7 @@ class DistributeTranspilerConfig(object): Minimum number of splitted elements in block, default is 8192. According to : https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 - We can use bandwidth effiently when data size is larger than 2MB.If you + We can use bandwidth efficiently when data size is larger than 2MB.If you want to change it, please be sure you have read the slice_variable function. You can find the definition of slice_variable in https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -202,7 +202,7 @@ class DistributeTranspilerConfig(object): #The picture here illustrates the principle: #https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396 use_hierarchical_allreduce = False - #Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu cards' number in most cases. + #Nccl ranks in a node when use hierarchical allreduce, it's set to gpu cards' number in most cases. hierarchical_allreduce_inter_nranks = 0 # if mode is collective @@ -1460,7 +1460,7 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler endpoint (str): current pserver endpoint. pserver_program (Program): deprecated, call get_pserver_program first. startup_program (Program): deprecated, should pass startup_program - when initalizing + when initializing Returns: Program: parameter server side startup program. diff --git a/python/paddle/fluid/transpiler/geo_sgd_transpiler.py b/python/paddle/fluid/transpiler/geo_sgd_transpiler.py index 4c2172f8676a2752a7544f3d5edec5dd0d53f0b8..484f6aa5eb5b9d7d7f043e20fda23c94838768e7 100644 --- a/python/paddle/fluid/transpiler/geo_sgd_transpiler.py +++ b/python/paddle/fluid/transpiler/geo_sgd_transpiler.py @@ -308,7 +308,7 @@ class GeoSgdTranspiler(DistributeTranspiler): }) for ep in self.pserver_endpoints ] - # step 5. Create delta var of Geo-Sgd & record vars infomation + # step 5. Create delta var of Geo-Sgd & record vars information for origin_name, splited_vars in self.param_var_mapping.items(): origin_var = self.origin_program.global_block().var(origin_name) self.vars_info[origin_name] = collections.OrderedDict() diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py index 3d2273be0c097ce9a2c1c0633d174316b90189ae..a91007c0d385cc883282dad165ce7b516a40c46a 100644 --- a/python/paddle/fluid/transpiler/ps_dispatcher.py +++ b/python/paddle/fluid/transpiler/ps_dispatcher.py @@ -19,7 +19,7 @@ class PSDispatcher(object): """ PSDispatcher is the base class for dispatching vars into different pserver instance. - You need to implement the `dispatch` inferface. + You need to implement the `dispatch` interface. """ def __init__(self, pserver_endpoints): @@ -88,7 +88,7 @@ class HashName(PSDispatcher): class RoundRobin(PSDispatcher): """ - Distribute variables to serveral endpoints using + Distribute variables to several endpoints using RondRobin method. 
Args: diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index becf61934ed00a7e1cf22d05d9489dc51d4746c2..a81746e4a2c4004999891f3a8cf1bf9233d8572f 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -458,12 +458,12 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000): """ This API use python ``multiprocessing`` to read data from ``readers`` parallelly, and then ``multiprocess.Queue`` or ``multiprocess.Pipe`` is used to merge - these data. A seperate process will be created for each reader in the + these data. A separate process will be created for each reader in the ``readers`` list, please guarantee every reader can work independently to avoid conflicts in parallel environment. - ``Multiprocess.Queue`` require the rw access right to /dev/shm, and it's not suppported + ``Multiprocess.Queue`` requires the rw access right to /dev/shm, and it's not supported in some platforms. Parameters: diff --git a/python/paddle/utils/image_util.py b/python/paddle/utils/image_util.py index 94029cff9fb5095b568a5facc4c6ff6fec3af4f3..b113f574e9fac069ed065336b35102ff6a3a6255 100644 --- a/python/paddle/utils/image_util.py +++ b/python/paddle/utils/image_util.py @@ -98,7 +98,7 @@ def preprocess_img(im, img_mean, crop_size, is_train, color=True): Does data augmentation for images. If is_train is false, cropping the center region from the image. If is_train is true, randomly crop a region from the image, - and randomy does flipping. + and randomly does flipping. im: (K x H x W) ndarrays """ im = im.astype('float32') diff --git a/python/paddle/utils/plotcurve.py b/python/paddle/utils/plotcurve.py index a95e5497e23571e61e5d7652830a99efd7793083..9c298acf01db66459ca163bf1297f8c7d2be6cb0 100644 --- a/python/paddle/utils/plotcurve.py +++ b/python/paddle/utils/plotcurve.py @@ -37,7 +37,7 @@ optional arguments: The keys must be in the order of paddle output(!!!). -For example, paddle.INFO contrains the following log +For example, paddle.INFO contains the following log I0406 21:26:21.325584 3832 Trainer.cpp:601] Pass=0 Batch=7771 AvgCost=0.624935 Eval: error=0.260972 To use this script to generate plot for AvgCost, error: diff --git a/python/paddle/utils/preprocess_img.py b/python/paddle/utils/preprocess_img.py index fc67949dfe0ef21487de29678781aa2bfd93f354..e54393fa4a029a510699e3e2bafef9f4d78c51e0 100644 --- a/python/paddle/utils/preprocess_img.py +++ b/python/paddle/utils/preprocess_img.py @@ -135,7 +135,7 @@ class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater): def create_dataset_from_dir(self, path): """ - Create a Dataset object for image classfication. + Create a Dataset object for image classification. Each folder in the path directory corresponds to a set of images of this label, and the name of the folder is the name of the path: the path of the image dataset. diff --git a/python/paddle/utils/preprocess_util.py b/python/paddle/utils/preprocess_util.py index 05b2067d01a2c544d7f5bd68320e79c805282286..76fc83acdc0ee2631576a737a0f4fea42acec47d 100644 --- a/python/paddle/utils/preprocess_util.py +++ b/python/paddle/utils/preprocess_util.py @@ -39,7 +39,7 @@ def save_list(l, outfile): def exclude_pattern(f): """ - Return whether f is in the exlucde pattern. + Return whether f is in the exclude pattern. Exclude the files that starts with . or ends with ~.
""" return f.startswith(".") or f.endswith("~") @@ -81,7 +81,7 @@ def list_files(path): def get_label_set_from_dir(path): """ Return a dictionary of the labels and label ids from a path. - Assume each direcotry in the path corresponds to a unique label. + Assume each directory in the path corresponds to a unique label. The keys of the dictionary is the label name. The values of the dictionary is the label id. """ @@ -198,7 +198,7 @@ class DataBatcher: def __init__(self, train_data, test_data, label_set): """ - train_data, test_data: Each one is a dataset object repesenting + train_data, test_data: Each one is a dataset object representing training and testing data, respectively. label_set: a dictionary storing the mapping from label name to label id. """ @@ -256,7 +256,7 @@ class DataBatcher: class DatasetCreater(object): """ A virtual class for creating datasets. - The derived clasas needs to implemnt the following methods: + The derived class needs to implement the following methods: - create_dataset() - create_meta_file() """