From 8dde7aeab03c7436103b9f1aa23ea74540a7aba7 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Thu, 15 Sep 2022 14:15:13 +0800 Subject: [PATCH] [CodeStyle] trim trailing whitespace in .h, .cc, .cu, etc. (#46006) --- paddle/fluid/framework/custom_operator.cc | 6 +- .../new_executor/executor_statistics.cc | 6 +- paddle/fluid/operators/activation_op.cc | 2 +- .../operators/add_position_encoding_op.cc | 2 +- paddle/fluid/operators/addmm_op.cc | 2 +- paddle/fluid/operators/affine_grid_op.cc | 8 +- paddle/fluid/operators/allclose_op.cc | 4 +- .../amp/check_finite_and_unscale_op.cc | 4 +- .../operators/amp/update_loss_scaling_op.cc | 4 +- paddle/fluid/operators/argsort_op.cc | 6 +- .../fluid/operators/array_to_lod_tensor_op.cc | 6 +- paddle/fluid/operators/assign_pos_op.cc | 2 +- paddle/fluid/operators/attention_lstm_op.cc | 2 +- paddle/fluid/operators/bmm_op.cc | 4 +- .../fluid/operators/broadcast_tensors_op.cc | 2 +- paddle/fluid/operators/center_loss_op.cc | 4 +- paddle/fluid/operators/channel_shuffle_op.cc | 4 +- paddle/fluid/operators/chunk_eval_op.cc | 6 +- .../operators/cinn/cinn_instruction_run_op.cc | 2 +- .../fluid/operators/class_center_sample_op.cc | 4 +- .../operators/collective/global_scatter_op.cc | 4 +- paddle/fluid/operators/complex_view_op.cc | 6 +- .../operators/controlflow/compare_all_op.cc | 2 +- .../fluid/operators/controlflow/depend_op.cc | 2 +- paddle/fluid/operators/copy_cross_scope_op.cc | 6 +- paddle/fluid/operators/crf_decoding_op.cc | 14 +-- paddle/fluid/operators/crop_op.cc | 12 +- paddle/fluid/operators/crop_tensor_op.cc | 20 ++-- paddle/fluid/operators/cross_entropy_op.cc | 16 +-- paddle/fluid/operators/ctc_align_op.cc | 6 +- paddle/fluid/operators/cudnn_lstm_op.cc | 4 +- paddle/fluid/operators/cumprod_op.cc | 2 +- paddle/fluid/operators/decode_jpeg_op.cc | 6 +- .../fluid/operators/deformable_conv_v1_op.cc | 6 +- .../operators/deformable_psroi_pooling_op.cc | 2 +- .../fluid/operators/detection/box_clip_op.cc | 4 +- .../fluid/operators/detection/box_coder_op.cc | 10 +- .../detection/box_decoder_and_assign_op.cc | 6 +- .../detection/collect_fpn_proposals_op.cc | 4 +- .../detection/generate_proposals_v2_op.cc | 8 +- .../operators/detection/iou_similarity_op.cc | 2 +- .../operators/detection/matrix_nms_op.cc | 2 +- .../detection/mine_hard_examples_op.cc | 10 +- .../retinanet_detection_output_op.cc | 2 +- .../detection/rpn_target_assign_op.cc | 10 +- .../operators/detection/target_assign_op.cc | 2 +- .../fluid/operators/detection/yolo_box_op.cc | 16 +-- .../operators/detection/yolov3_loss_op.cc | 28 ++--- paddle/fluid/operators/dgc_op.cc | 4 +- paddle/fluid/operators/diag_embed_op.cc | 8 +- paddle/fluid/operators/diag_op.cc | 2 +- paddle/fluid/operators/edit_distance_op.cc | 2 +- paddle/fluid/operators/fill_any_op.cc | 2 +- paddle/fluid/operators/filter_by_instag_op.cc | 6 +- paddle/fluid/operators/fold_op.cc | 6 +- .../operators/fused/fused_attention_op.cc | 6 +- ...sed_bias_dropout_residual_layer_norm_op.cc | 4 +- .../fused/fused_gate_attention_op.cc | 6 +- .../operators/fused/fused_gemm_epilogue_op.cc | 44 +++---- paddle/fluid/operators/fused/fusion_gru_op.cc | 2 +- .../fused/fusion_squared_mat_sub_op.cc | 2 +- paddle/fluid/operators/fused/multi_gru_op.cc | 2 +- .../fluid/operators/fused/resnet_unit_op.cc | 4 +- .../fluid/operators/fused_token_prune_op.cc | 2 +- paddle/fluid/operators/gather_nd_op.cc | 14 +-- paddle/fluid/operators/gaussian_random_op.cc | 2 +- paddle/fluid/operators/gelu_op.cc | 2 +- paddle/fluid/operators/graph_send_recv_op.cc | 8 +- 
.../fluid/operators/graph_send_ue_recv_op.cc | 2 +- paddle/fluid/operators/grid_sampler_op.cc | 12 +- paddle/fluid/operators/hash_op.cc | 2 +- paddle/fluid/operators/hinge_loss_op.cc | 2 +- paddle/fluid/operators/increment_op.cc | 2 +- paddle/fluid/operators/index_sample_op.cc | 6 +- paddle/fluid/operators/interpolate_op.cc | 60 +++++----- paddle/fluid/operators/interpolate_v2_op.cc | 60 +++++----- paddle/fluid/operators/isclose_op.cc | 2 +- paddle/fluid/operators/kldiv_loss_op.cc | 12 +- paddle/fluid/operators/kron_op.cc | 10 +- paddle/fluid/operators/label_smooth_op.cc | 20 ++-- paddle/fluid/operators/logspace_op.cc | 10 +- .../operators/lookup_table_dequant_op.cc | 2 +- paddle/fluid/operators/lstmp_op.cc | 18 +-- paddle/fluid/operators/lu_op.cc | 2 +- paddle/fluid/operators/lu_unpack_op.cc | 2 +- paddle/fluid/operators/margin_rank_loss_op.cc | 10 +- .../fluid/operators/match_matrix_tensor_op.cc | 4 +- paddle/fluid/operators/matmul_v2_op.cc | 4 +- paddle/fluid/operators/mean_iou_op.cc | 6 +- paddle/fluid/operators/memcpy_op.cc | 2 +- paddle/fluid/operators/meshgrid_op.cc | 2 +- paddle/fluid/operators/metrics/accuracy_op.cc | 4 +- paddle/fluid/operators/mode_op.cc | 2 +- .../fluid/operators/modified_huber_loss_op.cc | 2 +- paddle/fluid/operators/nll_loss_op.cc | 8 +- paddle/fluid/operators/norm_op.cc | 2 +- paddle/fluid/operators/optimizers/dpsgd_op.cc | 2 +- paddle/fluid/operators/optimizers/lamb_op.cc | 6 +- .../pow2_decay_with_linear_warmup_op.cc | 8 +- .../optimizers/proximal_adagrad_op.cc | 4 +- .../operators/optimizers/proximal_gd_op.cc | 2 +- .../fluid/operators/optimizers/rmsprop_op.cc | 2 +- paddle/fluid/operators/pad2d_op.cc | 2 +- paddle/fluid/operators/pad3d_op.cc | 2 +- paddle/fluid/operators/pad_op.cc | 2 +- paddle/fluid/operators/partial_sum_op.cc | 4 +- paddle/fluid/operators/pixel_shuffle_op.cc | 4 +- paddle/fluid/operators/pixel_unshuffle_op.cc | 4 +- paddle/fluid/operators/pool_with_index_op.cc | 10 +- paddle/fluid/operators/psroi_pool_op.cc | 4 +- paddle/fluid/operators/random_crop_op.cc | 2 +- paddle/fluid/operators/randperm_op.cc | 2 +- paddle/fluid/operators/rank_attention_op.cc | 2 +- paddle/fluid/operators/rank_loss_op.cc | 6 +- paddle/fluid/operators/real_op.cc | 6 +- .../reorder_lod_tensor_by_rank_op.cc | 2 +- paddle/fluid/operators/reverse_op.cc | 2 +- paddle/fluid/operators/roi_align_op.cc | 6 +- paddle/fluid/operators/roi_pool_op.cc | 2 +- paddle/fluid/operators/roll_op.cc | 4 +- paddle/fluid/operators/row_conv_op.cc | 18 +-- paddle/fluid/operators/run_program_op.cc | 10 +- paddle/fluid/operators/sample_logits_op.cc | 2 +- paddle/fluid/operators/searchsorted_op.cc | 2 +- paddle/fluid/operators/select_output_op.cc | 2 +- .../sequence_ops/sequence_enumerate_op.cc | 4 +- .../sequence_ops/sequence_erase_op.cc | 12 +- .../sequence_ops/sequence_mask_op.cc | 2 +- .../operators/sequence_ops/sequence_pad_op.cc | 20 ++-- .../sequence_ops/sequence_unpad_op.cc | 12 +- paddle/fluid/operators/shard_index_op.cc | 10 +- paddle/fluid/operators/similarity_focus_op.cc | 20 ++-- paddle/fluid/operators/sparse_attention_op.cc | 4 +- paddle/fluid/operators/spectral_norm_op.cc | 2 +- .../fluid/operators/squared_l2_distance_op.cc | 14 +-- paddle/fluid/operators/tdm_child_op.cc | 2 +- .../teacher_student_sigmoid_loss_op.cc | 2 +- paddle/fluid/operators/temporal_shift_op.cc | 18 +-- paddle/fluid/operators/top_k_op.cc | 4 +- paddle/fluid/operators/top_k_v2_op.cc | 4 +- paddle/fluid/operators/tril_indices_op.cc | 6 +- paddle/fluid/operators/tril_triu_op.cc | 4 +- 
paddle/fluid/operators/unfold_op.cc | 2 +- paddle/fluid/operators/unique_op.cc | 4 +- .../fluid/operators/unique_with_counts_op.cc | 2 +- paddle/fluid/operators/var_conv_2d_op.cc | 4 +- .../platform/profiler/chrometracing_logger.cc | 72 ++++++------ paddle/fluid/pybind/cuda_streams_py.cc | 22 ++-- paddle/fluid/pybind/imperative.cc | 108 +++++++++--------- paddle/fluid/pybind/parallel_executor.cc | 16 +-- paddle/fluid/pybind/pybind.cc | 10 +- paddle/fluid/pybind/tensor.cc | 8 +- paddle/phi/core/enforce.h | 2 +- 153 files changed, 602 insertions(+), 602 deletions(-) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 8c8d702e28..c58d1a57ec 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -511,9 +511,9 @@ class CustomOpMaker : public OpProtoAndCheckerMaker { AddComment(R"DOC( Custom Operator. -According to the Tensor operation function implemented by the user -independently of the framework, it is encapsulated into a framework -operator to adapt to various execution scenarios such as dynamic graph, +According to the Tensor operation function implemented by the user +independently of the framework, it is encapsulated into a framework +operator to adapt to various execution scenarios such as dynamic graph, mode static graph mode, and inference mode. )DOC"); diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc index a381943587..c1ba3b193f 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.cc +++ b/paddle/fluid/framework/new_executor/executor_statistics.cc @@ -600,9 +600,9 @@ void StatisticsEngine::Log(const std::string& filepath) { for (size_t idx = 0; idx < statistics_.size(); ++idx) { const auto& evt_stat = statistics_[idx]; ofs << platform::string_format(std::string(R"JSON( - { - "statistical item" : "%s", - "total time(ns)" : %llu, + { + "statistical item" : "%s", + "total time(ns)" : %llu, "total number of times" : %llu, "normalization time(ns)" : %llu },)JSON"), diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 1337533f3b..41160bf46c 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -607,7 +607,7 @@ class LogitOpMaker : public framework::OpProtoAndCheckerMaker { "(float, default 1e-6f) the epsilon for input clamp bound") .SetDefault(1e-6f); AddComment(R"DOC( -Logit Operator. +Logit Operator. this function is defined as follow: $ logit=ln\left ( {\frac {x} {1-x}} \right ) $ diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc index f4e7481bdd..cd4a9fbdb3 100644 --- a/paddle/fluid/operators/add_position_encoding_op.cc +++ b/paddle/fluid/operators/add_position_encoding_op.cc @@ -87,7 +87,7 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker { }); AddComment(R"DOC( Add Position Encoding Operator. - + The add position encoding calculates the output based on the input, alpha, beta. The size of each dimension of the parameters checked in the infer-shape. )DOC"); diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc index 8642d572e2..c7d6201ed2 100644 --- a/paddle/fluid/operators/addmm_op.cc +++ b/paddle/fluid/operators/addmm_op.cc @@ -77,7 +77,7 @@ class AddMMOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( AddMM Operator. 
This operator is used to perform matrix multiplication for input $x$ and $y$ with coefficient $alpha$. -$input$ with coefficient $beta$ is added to the final result. +$input$ with coefficient $beta$ is added to the final result. The equation is: $$Out = alpha * x * y + beta * input$$ diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index a459196b76..1c0b8800f7 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -177,7 +177,7 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { [x_14, x_15, x_16]] [[x_21, x_22, x_23] [x_24, x_25, x_26]]] - + OutputShape = [2, 3, 5, 5] Step 1: @@ -185,12 +185,12 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { Generate relative coordinates according to OutputShape. The values of relative coordinates are in the interval between -1 and 1. The shape of the relative coordinates is [2, H, W] as below: - + C = [[[-1. -1. -1. -1. -1. ] [-0.5 -0.5 -0.5 -0.5 -0.5] [ 0. 0. 0. 0. 0. ] [ 0.5 0.5 0.5 0.5 0.5] - [ 1. 1. 1. 1. 1. ]] + [ 1. 1. 1. 1. 1. ]] [[-1. -0.5 0. 0.5 1. ] [-1. -0.5 0. 0.5 1. ] [-1. -0.5 0. 0.5 1. ] @@ -198,7 +198,7 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { [-1. -0.5 0. 0.5 1. ]]] C[0] is the coordinates in height axis and C[1] is the coordinates in width axis. - + Step2: Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get: diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc index aa3cd5d414..fa6bc1d6f7 100644 --- a/paddle/fluid/operators/allclose_op.cc +++ b/paddle/fluid/operators/allclose_op.cc @@ -47,7 +47,7 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { "compared as equal. Default: :math:`False` .") .SetDefault(false); - AddComment(R"DOC( + AddComment(R"DOC( This operator checks if all :math:`x` and :math:`y` satisfy the condition: .. math:: @@ -110,7 +110,7 @@ REGISTER_OP_VERSION(allclose) "The added input 'Atol' is not" "dispensable.")) .AddCheckpoint( - R"ROC(Delete two float attributes [rtol] and [atol], + R"ROC(Delete two float attributes [rtol] and [atol], then add 2 string attributes [atol, rtol]. Don't be surprised. This is because float cannot represent hight-precision floating-point values, and our framework doesn't support diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc index 3404209063..a8d1f36f11 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -69,8 +69,8 @@ Check if input X contains all finite data, if yes, scale it by input Scale. $$Out = X / scale$$ If any tensor in X contains Inf or Nan, the Out will generate a indicator. -FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of -Out should not be used, and its data may not be deterministic. +FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of +Out should not be used, and its data may not be deterministic. Otherwise, FoundInfinite will be 0 (False). 
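To make the contract concrete, a minimal C++ sketch of this check over a flat float buffer (the names and layout here are illustrative, not the actual kernel API) could be:

    #include <cmath>
    #include <vector>

    // Unscale the gradients and flag non-finite values, mirroring the
    // check_finite_and_unscale behaviour described above.
    bool CheckFiniteAndUnscale(const std::vector<float>& x, float scale,
                               std::vector<float>* out) {
      bool found_infinite = false;
      out->resize(x.size());
      for (size_t i = 0; i < x.size(); ++i) {
        if (!std::isfinite(x[i])) {
          found_infinite = true;  // Out is then undefined and must not be used.
          break;
        }
        (*out)[i] = x[i] / scale;  // Out = X / scale
      }
      return found_infinite;
    }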
)DOC"); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index c8dc8217ef..03a5f734c2 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -111,8 +111,8 @@ class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker { "Stop updating loss scaling, and just zero inputs.") .SetDefault(false); AddComment(R"DOC( -Update loss scaling according to overall gradients. If all gradients is -finite after incr_every_n_steps, loss scaling will increase by incr_ratio. +Update loss scaling according to overall gradients. If all gradients is +finite after incr_every_n_steps, loss scaling will increase by incr_ratio. Otherwise, loss scaling will decrease by decr_ratio after decr_every_n_nan_or_inf steps and each step some gradients are infinite. diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc index 7938c1182d..f17723bf83 100644 --- a/paddle/fluid/operators/argsort_op.cc +++ b/paddle/fluid/operators/argsort_op.cc @@ -58,9 +58,9 @@ class ArgsortOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Argsort operator -Performs sorting on the input tensor along the given axis and outputs two -tensors, Output(Out) and Output(Indices). They reserve the same shape -with Input(X), and Output(Out) represents the sorted tensor while +Performs sorting on the input tensor along the given axis and outputs two +tensors, Output(Out) and Output(Indices). They reserve the same shape +with Input(X), and Output(Out) represents the sorted tensor while Output(Indices) gives the sorted order along the given axis Attr(axis). )DOC"); diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 5fee66d968..89c817889f 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -223,10 +223,10 @@ class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { "'paddle/framework/lod_rank_table.h' for more details."); AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array."); AddComment( - R"DOC(This Op build a big LoDTensor from a std::vector + R"DOC(This Op build a big LoDTensor from a std::vector and a LoDRankTable. It is supposed to be used in getting dynamic RNN's - outputs back to a normal LoDTensor. The std::vector - would be the output of RNN Op and the LoDRankTable would be build + outputs back to a normal LoDTensor. The std::vector + would be the output of RNN Op and the LoDRankTable would be build with RNN's input.)DOC"); } }; diff --git a/paddle/fluid/operators/assign_pos_op.cc b/paddle/fluid/operators/assign_pos_op.cc index ba1beaf834..80412c7d67 100644 --- a/paddle/fluid/operators/assign_pos_op.cc +++ b/paddle/fluid/operators/assign_pos_op.cc @@ -62,7 +62,7 @@ class AssignPosOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( assign_pos_op Operator. -Assign pos decides which tokens should be fetched belong to +Assign pos decides which tokens should be fetched belong to specially counter orderingly. 
)DOC"); diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 60e5912c44..203ccd8e60 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -297,7 +297,7 @@ tmp(seqlen*(M+D)) * fc((M+D)*1) => fcout(seqlen*1) with bias, relu fcout(seqlen*1) * scalar => fcout(seqlen*1) with bias, relu -dotmul and sum pool ( fcout(seqlen*1), x(seqlen * M) ) => lstm_x_t(1, M) +dotmul and sum pool ( fcout(seqlen*1), x(seqlen * M) ) => lstm_x_t(1, M) LSTM part: use lstm_x_t as input and compute as standard LSTM. diff --git a/paddle/fluid/operators/bmm_op.cc b/paddle/fluid/operators/bmm_op.cc index 305236134d..b27594eed3 100644 --- a/paddle/fluid/operators/bmm_op.cc +++ b/paddle/fluid/operators/bmm_op.cc @@ -44,8 +44,8 @@ class BmmOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor), The output tensor of Bmm op."); AddComment(R"DOC( The Bmm operator is used to perform batched matrix multiplication -over the last two dimensions of the input tensors `X` and `Y` -which are both 3-dimentionsal. +over the last two dimensions of the input tensors `X` and `Y` +which are both 3-dimentionsal. Examples: - X: [B, M, K], Y: [B, K, N] => Out: [B, M, N] diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index 6bce8ec566..4f681bc650 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -54,7 +54,7 @@ class BroadcastTensorsOpMaker : public framework::OpProtoAndCheckerMaker { "consistent with :code:`x`.") .AsDuplicable(); AddComment( - R"DOC(This OP is used to broadcast a vector of inputs + R"DOC(This OP is used to broadcast a vector of inputs with Tensor or LoDTensor type, following broadcast semantics.)DOC"); } }; diff --git a/paddle/fluid/operators/center_loss_op.cc b/paddle/fluid/operators/center_loss_op.cc index 15cc715650..f168eb10ae 100644 --- a/paddle/fluid/operators/center_loss_op.cc +++ b/paddle/fluid/operators/center_loss_op.cc @@ -80,10 +80,10 @@ class CenterLossOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("need_update", "whether need to update center info."); AddComment(R"DOC( **CenterLoss operator** -implemention of the center loss function in the papper<>, equations in this implement is:loss = 1/2 * (x-y)^2 ,where x(X) means the deep feature(output of last hidden layer ) -and y(Label) the target label +and y(Label) the target label )DOC"); } }; diff --git a/paddle/fluid/operators/channel_shuffle_op.cc b/paddle/fluid/operators/channel_shuffle_op.cc index 4d0e47157e..69f75691a0 100644 --- a/paddle/fluid/operators/channel_shuffle_op.cc +++ b/paddle/fluid/operators/channel_shuffle_op.cc @@ -52,9 +52,9 @@ class ChannelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { while keeping the original tensor shape. Please refer to the paper: - `ShuffleNet: An Extremely Efficient Convolutional Neural Network for + `ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices `_ - by Zhang et. al (2017) for more details. + by Zhang et. al (2017) for more details. 
)DOC"); } diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc index b5aa051cc2..6ad9f6d491 100644 --- a/paddle/fluid/operators/chunk_eval_op.cc +++ b/paddle/fluid/operators/chunk_eval_op.cc @@ -145,7 +145,7 @@ For some basics of chunking, please refer to ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. Here is a NER example of labeling for these tagging schemes: - + Li Ming works at Agricultural Bank of China in Beijing. IO I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC IOB B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC @@ -158,13 +158,13 @@ and LOC(LOCATION), and we can see that the labels have the form -("num_micro_batches", "Number of micro batches for pipeline."); AddComment(R"DOC( - This op is used by pipeline to copy tensors across micro batch scopes. - Copy the variable value of the giving Id's micro scope to the micro scope of Id + 1 position. - If need to copy back to the main scope, using to_main_scope option to copy the variable value of + This op is used by pipeline to copy tensors across micro batch scopes. + Copy the variable value of the giving Id's micro scope to the micro scope of Id + 1 position. + If need to copy back to the main scope, using to_main_scope option to copy the variable value of the current micro scope to the main scope. )DOC"); } diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index ee3ff671ed..ae1086b623 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -58,9 +58,9 @@ class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddComment(R"DOC( The crf_decoding operator reads the emission feature weights and the transition -feature weights learned by the linear_chain_crf operator and performs decoding. -It implements the Viterbi algorithm which is a dynamic programming algorithm -for finding the most likely sequence of hidden states, called the Viterbi path, +feature weights learned by the linear_chain_crf operator and performs decoding. +It implements the Viterbi algorithm which is a dynamic programming algorithm +for finding the most likely sequence of hidden states, called the Viterbi path, that results in a sequence of observed tags. The output of this operator changes according to whether Input(Label) is given: @@ -68,15 +68,15 @@ The output of this operator changes according to whether Input(Label) is given: 1. Input(Label) is given: This happens in training. This operator is used to co-work with the chunk_eval operator. - When Input(Label) is given, the crf_decoding operator returns tensor with the - sampe shape as Input(Label) whose values are fixed to be 0, indicating an - incorrect prediction, or 1 indicating a tag is correctly predicted. Such an + When Input(Label) is given, the crf_decoding operator returns tensor with the + sampe shape as Input(Label) whose values are fixed to be 0, indicating an + incorrect prediction, or 1 indicating a tag is correctly predicted. Such an output is the input to chunk_eval operator. 2. Input(Label) is not given: This is the standard decoding process. 
-The crf_decoding operator returns a row vector with shape [N x 1]/[B x S], here +The crf_decoding operator returns a row vector with shape [N x 1]/[B x S], here the shape depends on the inputs are LoDTensors or common tensors, whose values range from 0 to maximum tag number - 1, Each element indicates an index of a predicted tag. diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 41e9d673d3..f7c72c11dd 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -102,14 +102,14 @@ Crop Operator. Crop input into output, as specified by offsets and shape. There are two ways to set the offsets: -1. In runtime: Using the input 'Offsets', which is a Variable and can be - output of other operators. This way is suitable for +1. In runtime: Using the input 'Offsets', which is a Variable and can be + output of other operators. This way is suitable for dynamic offsets. -2. In network configuration: Using the attribute 'offsets', which will be - set in Python configure script. This way is +2. In network configuration: Using the attribute 'offsets', which will be + set in Python configure script. This way is suitable for fixed offsets. -You CANNOT use these two ways at the same time. An exception will be raised -if input 'Offset' is configured and meanwhile the attribute 'offsets' is +You CANNOT use these two ways at the same time. An exception will be raised +if input 'Offset' is configured and meanwhile the attribute 'offsets' is not empty. There are two ways to set shape: diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc index 52106c7431..c75a5eaf86 100644 --- a/paddle/fluid/operators/crop_tensor_op.cc +++ b/paddle/fluid/operators/crop_tensor_op.cc @@ -180,26 +180,26 @@ CropTensor Operator. Crop input into output, as specified by offsets and shape. There are three ways to set the offsets: -1. Input 'OffsetsTensor: It is a tensor list. It should be set as a list that - contains tensor variable in python configure script. +1. Input 'OffsetsTensor: It is a tensor list. It should be set as a list that + contains tensor variable in python configure script. This way is suitable for dynamic offsets. -2. Input 'Offsets': It is a variable and can be output of other operators. +2. Input 'Offsets': It is a variable and can be output of other operators. This way is suitable for dynamic offsets. -3. Attribute 'offsets': It will be set in python configure script. This way +3. Attribute 'offsets': It will be set in python configure script. This way is suitable for fixed offsets. -You CANNOT use these three ways at the same time. An exception will be raised -if input 'OffsetsTensor' or 'Offset' is configured and meanwhile the attribute 'offsets' is +You CANNOT use these three ways at the same time. An exception will be raised +if input 'OffsetsTensor' or 'Offset' is configured and meanwhile the attribute 'offsets' is not empty. There are three ways to set shape: 1. Input 'ShapeTensor': It is a tensor list. It should be set as a list that contains - tensor variable in python configure script. This way is suitable + tensor variable in python configure script. This way is suitable for dynamic shape. -2. Input 'Shape': It is a Variable and can be output of other operators. This way is suitable +2. Input 'Shape': It is a Variable and can be output of other operators. This way is suitable for dynamic shape. -2. Attribute 'shape': crop input X into the shape described by a list. 
The size of shape - list should be the same as the dimension size of input X. This way is +2. Attribute 'shape': crop input X into the shape described by a list. The size of shape + list should be the same as the dimension size of input X. This way is suitable for fixed shape. The input should be a k-D tensor(k > 0 and k < 7). As an example: diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 0d98f5b75e..41a0d6ad20 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -250,10 +250,10 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( CrossEntropy Operator. -The input 'X' and 'Label' will first be logically flattened to 2-D matrices. -The matrix's second dimension(row length) is the same as the original last -dimension, and the first dimension(column length) is the product of all other -original dimensions. Then the softmax computation will take place on each row +The input 'X' and 'Label' will first be logically flattened to 2-D matrices. +The matrix's second dimension(row length) is the same as the original last +dimension, and the first dimension(column length) is the product of all other +original dimensions. Then the softmax computation will take place on each row of the flattened matrices. It supports both standard cross-entropy and soft-label cross-entropy loss @@ -385,10 +385,10 @@ class CrossEntropyOpMaker2 : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Hard-label CrossEntropy Operator. -The input 'X' and 'Label' will first be logically flattened to 2-D matrices. -The matrix's second dimension(row length) is the same as the original last -dimension, and the first dimension(column length) is the product of all other -original dimensions. Then the softmax computation will take place on each row +The input 'X' and 'Label' will first be logically flattened to 2-D matrices. +The matrix's second dimension(row length) is the same as the original last +dimension, and the first dimension(column length) is the product of all other +original dimensions. Then the softmax computation will take place on each row of the flattened matrices. Only hard labels are supported. diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc index dbab71e161..7731b72071 100644 --- a/paddle/fluid/operators/ctc_align_op.cc +++ b/paddle/fluid/operators/ctc_align_op.cc @@ -93,12 +93,12 @@ Then: Output.dims = {8, 1} Output.LoD = [[0, 6, 8]] or Given: - Input.data = [[0, 1, 2, 2, 0, 4], - [0, 4, 5, 0, 6, 0], + Input.data = [[0, 1, 2, 2, 0, 4], + [0, 4, 5, 0, 6, 0], [0, 7, 7, 7, 0, 0]] InputLength.data = [[6], [5], - [4]], + [4]], Input.dims = {3, 6}, Input.Lod = [] And: diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index 9faf615a16..f5fd56edef 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -190,7 +190,7 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { CUDNN LSTM implementation A four-gate Long Short-Term Memory network with no peephole connections.
-In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, +In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: $$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ @@ -217,7 +217,7 @@ $$ h_t = o_t \\odot tanh(c_t) $$ - $\tilde{c_t}$ is also called candidate hidden state, which is computed based on the current input and the previous hidden state. -Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, +Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, X represensts a matrix multiplication diff --git a/paddle/fluid/operators/cumprod_op.cc b/paddle/fluid/operators/cumprod_op.cc index a37b9e951f..cc7e568bac 100644 --- a/paddle/fluid/operators/cumprod_op.cc +++ b/paddle/fluid/operators/cumprod_op.cc @@ -35,7 +35,7 @@ class CumprodOpMaker : public framework::OpProtoAndCheckerMaker { "(int), The dim along which the input tensors will be cumproded"); AddComment( R"DOC(Cumprod operator. Return the cumprod results of the input elements along the dim. - For example, if input X is a tensor with rank 1 and N elements, the output will also be a tensor + For example, if input X is a tensor with rank 1 and N elements, the output will also be a tensor with rank 1 and N elements, and elements y[i] = x[0] * x[1] * x[2] *...* x[i] (0<=i( diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cc b/paddle/fluid/operators/deformable_conv_v1_op.cc index 0b817a8f42..ed70e54678 100644 --- a/paddle/fluid/operators/deformable_conv_v1_op.cc +++ b/paddle/fluid/operators/deformable_conv_v1_op.cc @@ -73,13 +73,13 @@ class DeformableConvV1OpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( **Deformable Convolution v1 Operator** -Deformable Convolution is a new method based Convolution which feature has offset +Deformable Convolution is a new method based Convolution which feature has offset in spatial location. -1. Get offset of each pixel in feature map with convolution layers which number +1. Get offset of each pixel in feature map with convolution layers which number of channels should be double of weight size. -2. Add offset to pixel to get new location and the new value which are computed +2. Add offset to pixel to get new location and the new value which are computed directly through bilinear interpolation with four nearest pixel. 3. Get the product of pixel and weight as result diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc index f83a4c04a8..bac1bb04bc 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc @@ -104,7 +104,7 @@ class DeformablePSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "W is thewidth of output. "); AddComment(R"DOC( **DeformablePSROIPooling Operator** -DeformablePSROIPooling is a new method based Region of interest pooling +DeformablePSROIPooling is a new method based Region of interest pooling (also known as RoI pooling). 
The operator has four steps: diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc index cd17a8c988..89650d6235 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cc +++ b/paddle/fluid/operators/detection/box_clip_op.cc @@ -82,14 +82,14 @@ This operator clips input boxes to original input images. For each input box, The formula is given as follows: $$xmin = \max(\min(xmin, im_w - 1), 0)$$ - $$ymin = \max(\min(ymin, im_h - 1), 0)$$ + $$ymin = \max(\min(ymin, im_h - 1), 0)$$ $$xmax = \max(\min(xmax, im_w - 1), 0)$$ $$ymax = \max(\min(ymax, im_h - 1), 0)$$ where im_w and im_h are computed from ImInfo, the formula is given as follows: $$im_w = \round(width / im_scale)$$ - $$im_h = \round(height / im_scale)$$ + $$im_h = \round(height / im_scale)$$ )DOC"); } }; diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 53a9d04fb5..5120f687de 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -98,9 +98,9 @@ The Encoding schema described below: oy = (ty - py) / ph / pyv - ow = log(abs(tw / pw)) / pwv + ow = log(abs(tw / pw)) / pwv - oh = log(abs(th / ph)) / phv + oh = log(abs(th / ph)) / phv The Decoding schema described below: @@ -116,11 +116,11 @@ where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the -encoded/decoded coordinates, width and height. +encoded/decoded coordinates, width and height. -During Box Decoding, two modes for broadcast are supported. Say target box has +During Box Decoding, two modes for broadcast are supported. Say target box has shape [N, M, 4], and the shape of prior box can be [N, 4] or [M, 4]. Then prior -box will broadcast to target box along the assigned axis. +box will broadcast to target box along the assigned axis. )DOC"); } }; diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc index d641a6fd41..c1b7e1678d 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc @@ -189,7 +189,7 @@ Decode the target bounding box with the prior_box information. The Decoding schema is described below: $$ - ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2} + ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2} $$ $$ oy = (ph \\times pyv \\times ty + py) - \\frac{th}{2} @@ -205,11 +205,11 @@ where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the prior_box's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, `phv` denote the variance of the prior_box and `ox`, `oy`, `ow`, `oh` denote the -decoded coordinates, width and height in decode_box. +decoded coordinates, width and height in decode_box. decode_box is obtained after box decode, then assigning schema is described below: -For each prior_box, use the best non-background class's decoded values to +For each prior_box, use the best non-background class's decoded values to update the prior_box locations and get output_assign_box. So, the shape of output_assign_box is the same as PriorBox. 
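Read together with the box_coder encoding schema quoted above, the per-box encoding step reduces to the following sketch (center-size boxes; the struct names are illustrative):

    #include <cmath>

    struct Box { float cx, cy, w, h; };
    struct Variance { float vx, vy, vw, vh; };

    // Encode one target box against a prior (anchor) box, following the
    // encoding schema quoted above for box_coder.
    Box EncodeBox(const Box& target, const Box& prior, const Variance& v) {
      Box out;
      out.cx = (target.cx - prior.cx) / prior.w / v.vx;
      out.cy = (target.cy - prior.cy) / prior.h / v.vy;
      out.w = std::log(std::fabs(target.w / prior.w)) / v.vw;
      out.h = std::log(std::fabs(target.h / prior.h)) / v.vh;
      return out;
    }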
)DOC"); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc index ddb8685ee3..48902f5179 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc @@ -125,7 +125,7 @@ class CollectFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { This operator concats all proposals from different images and different FPN levels. Then sort all of those proposals by objectness confidence. Select the post_nms_topN RoIs in - total. Finally, re-sort the RoIs in the order of batch index. + total. Finally, re-sort the RoIs in the order of batch index. )DOC"); } }; @@ -145,7 +145,7 @@ REGISTER_OP_CPU_KERNEL(collect_fpn_proposals, REGISTER_OP_VERSION(collect_fpn_proposals) .AddCheckpoint( R"ROC( - Upgrade collect_fpn_proposals add a new input + Upgrade collect_fpn_proposals add a new input [MultiLevelRoIsNum] and add a new output [RoisNum].)ROC", paddle::framework::compatible::OpVersionDesc() .NewInput("MultiLevelRoIsNum", diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index eeda4c819e..15918030c0 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -86,7 +86,7 @@ class GenerateProposalsV2OpMaker : public framework::OpProtoAndCheckerMaker { "If true, im_shape pixel offset is 1.") .SetDefault(true); AddComment(R"DOC( -This operator is the second version of generate_proposals op to generate +This operator is the second version of generate_proposals op to generate bounding box proposals for Faster RCNN. The proposals are generated for a list of images based on image score 'Scores', bounding box regression result 'BboxDeltas' as @@ -96,9 +96,9 @@ boxes. The difference between this version and the first version is that the image scale is no long needed now, so the input requires im_shape instead of im_info. -The change aims to unify the input for all kinds of objective detection -such as YOLO-v3 and Faster R-CNN. As a result, the min_size represents the -size on input image instead of original image which is slightly different +The change aims to unify the input for all kinds of objective detection +such as YOLO-v3 and Faster R-CNN. As a result, the min_size represents the +size on input image instead of original image which is slightly different to before and will not effect the result. )DOC"); diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc index c31c630cd6..5f46e9ab51 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op.cc @@ -95,7 +95,7 @@ boxes in 'Y' are shared by all instance of the batched inputs of X. Given two boxes A and B, the calculation of IOU is as follows: $$ -IOU(A, B) = +IOU(A, B) = \\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)} $$ diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc index 1c0d19d9d5..1c755c62eb 100644 --- a/paddle/fluid/operators/detection/matrix_nms_op.cc +++ b/paddle/fluid/operators/detection/matrix_nms_op.cc @@ -116,7 +116,7 @@ independently for each class. 
The output is a 2-D LoDTensor; for each image, the offsets in the first dimension of the LoDTensor are called LoD, and the number of offsets is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, it means there is no detected bbox for this image. Now this operator has one more -output, which is RoisNum. The size of RoisNum is N, RoisNum[i] means the number of +output, which is RoisNum. The size of RoisNum is N, RoisNum[i] means the number of detected bbox for this image. For more information on Matrix NMS, please refer to: diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc index 163da3cdd9..f3df3b228d 100644 --- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc @@ -383,11 +383,11 @@ class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Mine hard examples Operator. This operator implements hard example mining to select a subset of negative box indices. -For each image, selects the boxes with the highest losses, subject to the condition that the -box cannot have a match > neg_dist_threshold when mining_type is max_negative. -The selected number is min(sample_size, max_negative_box_number) when mining_type is -hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) -when mining_type is max_negative, where the max_negative_box_number is the count of +For each image, selects the boxes with the highest losses, subject to the condition that the +box cannot have a match > neg_dist_threshold when mining_type is max_negative. +The selected number is min(sample_size, max_negative_box_number) when mining_type is +hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) +when mining_type is max_negative, where the max_negative_box_number is the count of MatchIndices elements with value -1. )DOC"); } diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc index 915b174f17..2f3b59db5c 100644 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc @@ -640,7 +640,7 @@ where `tx`, `ty`, `tw`, `th` denote the predicted box's center coordinates, widt and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the anchor's center coordinates, width and height. `pxv`, `pyv`, `pwv`, `phv` denote the variance of the anchor box and `ox`, `oy`, `ow`, `oh` denote the -decoded coordinates, width and height. +decoded coordinates, width and height. Then the top decoded predictions from all levels are merged followed by NMS. In the NMS step, this operator prunes away boxes that have high IOU diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 8fbfe2ad85..c6e4c00f79 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -661,7 +661,7 @@ The rest anchors would not contribute to the RPN training loss ScoreIndex is composed of foreground anchor indexes(positive labels) and background anchor indexes(negative labels). LocationIndex is exactly the same -as the foreground anchor indexes since we cannot assign regression targets to +as the foreground anchor indexes since we cannot assign regression targets to the background anchors.
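For the mine_hard_examples selection rule quoted above, the number of negatives kept per image reduces to this schematic helper (names are illustrative):

    #include <algorithm>

    enum class MiningType { kHardExample, kMaxNegative };

    // max_negative_box_number counts MatchIndices entries equal to -1.
    int NumNegativesToSelect(MiningType type, int sample_size,
                             float neg_pos_ratio, int positive_box_number,
                             int max_negative_box_number) {
      if (type == MiningType::kHardExample)
        return std::min(sample_size, max_negative_box_number);
      return std::min(static_cast<int>(neg_pos_ratio * positive_box_number),
                      max_negative_box_number);
    }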
The classification targets(TargetLabel) is a binary class label (of being @@ -730,16 +730,16 @@ class RetinanetTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { This layer can be, for given the Intersection-over-Union (IoU) overlap between anchors and ground truth boxes, to assign classification and regression targets to each anchor, these target labels are used for - train retinanet. - + train retinanet. + Every anchor is assigned with a length C one-hot vector of classification targets, and a 4-vector of box regression targets, where C is the class number. The assignment rules are as followed: - + 1. Anchors are assigned to ground-truth boxes when: (i) it has the highest IoU overlap with a ground-truth box, or (ii) it has an IoU overlap higher than positive_overlap(0.5) with any ground-truth box. - + 2. Anchors are assigned to background when its IoU ratio is lower than negative_overlap (0.4) for all ground-truth boxes. diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc index 99deee3f72..5b8e6739bf 100644 --- a/paddle/fluid/operators/detection/target_assign_op.cc +++ b/paddle/fluid/operators/detection/target_assign_op.cc @@ -131,7 +131,7 @@ If id = MatchIndices[i][j] > 0, Out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K] OutWeight[i][j] = 1. -Otherwise, +Otherwise, Out[j][j][0 : K] = {mismatch_value, mismatch_value, ...} OutWeight[i][j] = 0. diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index cbe0548f27..3261f8fca3 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -192,19 +192,19 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0.5); AddComment(R"DOC( This operator generates YOLO detection boxes from output of YOLOv3 network. - + The output of previous network is in shape [N, C, H, W], while H and W - should be the same, H and W specify the grid size, each grid point predict + should be the same, H and W specify the grid size, each grid point predict given number boxes, this given number, which following will be represented as S, is specified by the number of anchors. In the second dimension(the channel dimension), C should be equal to S * (5 + class_num) if :attr:`iou_aware` is false, otherwise C should be equal to S * (6 + class_num). class_num is the object - category number of source dataset(such as 80 in coco dataset), so the - second(channel) dimension, apart from 4 box location coordinates x, y, w, h, - also includes confidence score of the box and class one-hot key of each anchor + category number of source dataset(such as 80 in coco dataset), so the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + also includes confidence score of the box and class one-hot key of each anchor box. 
- Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box + Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions should be as follows: $$ @@ -225,9 +225,9 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { The logistic regression value of the 5th channel of each anchor prediction boxes represents the confidence score of each prediction box, and the logistic - regression value of the last :attr:`class_num` channels of each anchor prediction + regression value of the last :attr:`class_num` channels of each anchor prediction boxes represents the classifcation scores. Boxes with confidence scores less than - :attr:`conf_thresh` should be ignored, and box final scores is the product of + :attr:`conf_thresh` should be ignored, and box final scores is the product of confidence scores and classification scores. $$ diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index afdcfcd42b..0448d7e518 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -105,14 +105,14 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator generates yolov3 loss based on given predict result and ground truth boxes. - + The output of previous network is in shape [N, C, H, W], while H and W - should be the same, H and W specify the grid size, each grid point predict + should be the same, H and W specify the grid size, each grid point predict given number bounding boxes, this given number, which following will be represented as S, is specified by the number of anchor clusters in each scale. In the second dimension(the channel - dimension), C should be equal to S * (class_num + 5), class_num is the object - category number of source dataset(such as 80 in coco dataset), so in the - second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + dimension), C should be equal to S * (class_num + 5), class_num is the object + category number of source dataset(such as 80 in coco dataset), so in the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, also includes confidence score of the box and class one-hot key of each anchor box. Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions @@ -135,21 +135,21 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { and :math:`p_w, p_h` is specified by anchors. As for confidence score, it is the logistic regression value of IoU between - anchor boxes and ground truth boxes, the score of the anchor box which has - the max IoU should be 1, and if the anchor box has IoU bigger than ignore + anchor boxes and ground truth boxes, the score of the anchor box which has + the max IoU should be 1, and if the anchor box has IoU bigger than ignore thresh, the confidence score loss of this anchor box will be ignored. Therefore, the yolov3 loss consists of three major parts: box location loss, - objectness loss and classification loss. The L1 loss is used for - box coordinates (w, h), sigmoid cross entropy loss is used for box + objectness loss and classification loss. The L1 loss is used for + box coordinates (w, h), sigmoid cross entropy loss is used for box coordinates (x, y), objectness loss and classification loss. - Each groud truth box finds a best matching anchor box in all anchors. + Each groud truth box finds a best matching anchor box in all anchors. 
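A schematic decode of one prediction into a normalized box, following the grid-offset equations these two ops share (the normalization by grid size here is an assumption for illustration, not the op's exact convention):

    #include <cmath>

    struct YoloBox { float x, y, w, h; };

    // Decode (t_x, t_y, t_w, t_h) for one anchor at grid cell (cx, cy).
    YoloBox DecodeYoloBox(float tx, float ty, float tw, float th,
                          float cx, float cy,   // grid cell indices
                          float pw, float ph,   // anchor size, grid units
                          int grid_w, int grid_h) {
      auto sigmoid = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };
      YoloBox b;
      b.x = (sigmoid(tx) + cx) / grid_w;  // normalized center x
      b.y = (sigmoid(ty) + cy) / grid_h;  // normalized center y
      b.w = pw * std::exp(tw) / grid_w;   // normalized width
      b.h = ph * std::exp(th) / grid_h;   // normalized height
      return b;
    }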
Prediction of this anchor box will incur all three parts of losses, and prediction of anchor boxes with no GT box matched will only incur objectness loss. - In order to trade off box coordinate losses between big boxes and small + In order to trade off box coordinate losses between big boxes and small boxes, box coordinate losses will be mutiplied by scale weight, which is calculated as follows. @@ -165,12 +165,12 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { $$ While :attr:`use_label_smooth` is set to be :attr:`True`, the classification - target will be smoothed when calculating classification loss, target of + target will be smoothed when calculating classification loss, target of positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of negetive samples will be smoothed to :math:`1.0 / class\_num`. - While :attr:`GTScore` is given, which means the mixup score of ground truth - boxes, all losses incured by a ground truth box will be multiplied by its + While :attr:`GTScore` is given, which means the mixup score of ground truth + boxes, all losses incured by a ground truth box will be multiplied by its mixup score. )DOC"); } diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc index d06f5739df..1f7b5dbdce 100644 --- a/paddle/fluid/operators/dgc_op.cc +++ b/paddle/fluid/operators/dgc_op.cc @@ -126,10 +126,10 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker { DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. This optimizer will do two things: - + 1. Compress the gradient by get TopK import value from tensor \ and use it for allreduce to reduce network bandwidth. - + 2. Call momentum to optimize on the cost. )DOC"); diff --git a/paddle/fluid/operators/diag_embed_op.cc b/paddle/fluid/operators/diag_embed_op.cc index 0dc5d024ec..45e5e51c41 100644 --- a/paddle/fluid/operators/diag_embed_op.cc +++ b/paddle/fluid/operators/diag_embed_op.cc @@ -47,11 +47,11 @@ class DiagEmbedOpMaker : public framework::OpProtoAndCheckerMaker { )DOC") .SetDefault(-1); - AddComment(R"DOC(Creates a tensor whose diagonals of certain 2D planes - (specified by dim1 and dim2) are filled by input. - To facilitate creating batched diagonal matrices, + AddComment(R"DOC(Creates a tensor whose diagonals of certain 2D planes + (specified by dim1 and dim2) are filled by input. + To facilitate creating batched diagonal matrices, the 2D planes formed by the last two dimensions of the returned tensor - are chosen by default. + are chosen by default. )DOC"); } }; diff --git a/paddle/fluid/operators/diag_op.cc b/paddle/fluid/operators/diag_op.cc index 8ccc5ff389..f7b2c49156 100644 --- a/paddle/fluid/operators/diag_op.cc +++ b/paddle/fluid/operators/diag_op.cc @@ -45,7 +45,7 @@ class DiagOpMaker : public framework::OpProtoAndCheckerMaker { "Diagonal values of square matrix. It is a tensor with rank 1."); AddOutput("Out", "A square matrix."); AddComment(R"DOC( - Return a square matrix with specified diagonal values. + Return a square matrix with specified diagonal values. )DOC"); } }; diff --git a/paddle/fluid/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc index 8197b115cd..70de5a3bb7 100644 --- a/paddle/fluid/operators/edit_distance_op.cc +++ b/paddle/fluid/operators/edit_distance_op.cc @@ -65,7 +65,7 @@ strings and their references. 
Edit distance, also called Levenshtein distance, measures how dissimilar two strings are by counting the minimum number of operations to transform one string into another. -The operations include insertion, deletion, and substitution. +The operations include insertion, deletion, and substitution. For example, given hypothesis string A = "kitten" and reference B = "sitting", A will be transformed into B at least after two substitutions and one diff --git a/paddle/fluid/operators/fill_any_op.cc b/paddle/fluid/operators/fill_any_op.cc index 23d00e47b4..4e6929b445 100644 --- a/paddle/fluid/operators/fill_any_op.cc +++ b/paddle/fluid/operators/fill_any_op.cc @@ -30,7 +30,7 @@ class FillAnyOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0); AddAttr("value_int", "The int var to fill in Tensor").SetDefault(0); AddComment(R"DOC(Fill operator with backward; - Fill an tensor with `value`. + Fill an tensor with `value`. )DOC"); }; }; diff --git a/paddle/fluid/operators/filter_by_instag_op.cc b/paddle/fluid/operators/filter_by_instag_op.cc index 1bec91a546..a0ac46c4a6 100644 --- a/paddle/fluid/operators/filter_by_instag_op.cc +++ b/paddle/fluid/operators/filter_by_instag_op.cc @@ -80,15 +80,15 @@ class FilterByInstagOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("LossWeight", "(Tensor) loss weight."); AddOutput("IndexMap", "(LoDTensor) mapping from Out rows to X1 rows"); AddComment(R"DOC( -Filter By Instag Op +Filter By Instag Op This operator is used to filter embeded ins. -There are 3 inputs. First is embeded ins, Second is tags for ins, +There are 3 inputs. First is embeded ins, Second is tags for ins, Third is tags to filter. There are 3 outputs. First is filtered embeded ins, Second is Loss Weight, -Third is the IndexMap from Out line number to X1 line number. +Third is the IndexMap from Out line number to X1 line number. )DOC"); } }; diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 149d2bdac3..1c4127b6fb 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -70,9 +70,9 @@ class FoldOpMaker : public framework::OpProtoAndCheckerMaker { **Fold Operator** This Operator is used to combines an array of sliding local blocks into a large containing -tensor. also known as col2im when operated on batched 2D image tensor. Fold calculates each -combined value in the resulting large tensor by summing all values from all containing blocks. -Unfold extracts the values in the local blocks by copying from the large tensor. So, if the +tensor. also known as col2im when operated on batched 2D image tensor. Fold calculates each +combined value in the resulting large tensor by summing all values from all containing blocks. +Unfold extracts the values in the local blocks by copying from the large tensor. So, if the blocks overlap, they are not inverses of each other. 
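A minimal single-channel col2im sketch of that summation (no padding or dilation; the [kh*kw, L] column layout and exact stride cover are assumptions):

    #include <vector>

    // Sum kh x kw blocks (one per column of `cols`) back into an
    // out_h x out_w image; overlapping positions accumulate.
    std::vector<float> Fold1C(const std::vector<float>& cols, int out_h,
                              int out_w, int kh, int kw, int stride) {
      std::vector<float> img(out_h * out_w, 0.0f);
      const int bh = (out_h - kh) / stride + 1;  // blocks along height
      const int bw = (out_w - kw) / stride + 1;  // blocks along width
      const int L = bh * bw;                     // cols holds kh*kw*L values
      for (int b = 0; b < L; ++b) {
        const int y0 = (b / bw) * stride, x0 = (b % bw) * stride;
        for (int i = 0; i < kh; ++i)
          for (int j = 0; j < kw; ++j)
            img[(y0 + i) * out_w + (x0 + j)] += cols[(i * kw + j) * L + b];
      }
      return img;
    }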
)DOC"); } diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index 30badd3125..90f6d34535 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -432,8 +432,8 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( The fused_attention operator is the same as following pseudo codes: - // @input: [batch_size, seq_len, embed_dim] - // @final_out: [batch_size, seq_len, num_heads, head_dim] + // @input: [batch_size, seq_len, embed_dim] + // @final_out: [batch_size, seq_len, num_heads, head_dim] residual = input if (pre_layernorm) query = layer_norm(input); @@ -447,7 +447,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { out = dropout(out); out = out * v; out = transpose(out, perm=[0, 2, 1, 3]); - + } // out linear out = linear(out); diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc index 081a1ab0a0..3e888a2e67 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -140,8 +140,8 @@ class FusedBiasDropoutResidualLnOpMaker AddComment(R"DOC( Add fused bias_dropout_residual_layer_norm op whose logic is as follows: - // @input: [batch_size, seq_len, embed_dim] - // @final_out: [batch_size, seq_len, embed_dim] + // @input: [batch_size, seq_len, embed_dim] + // @final_out: [batch_size, seq_len, embed_dim] y = layer_norm(residual + dropout(bias + x)); )DOC"); } diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index 2e6f991e41..0823f391fd 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -174,7 +174,7 @@ class FusedGateAttentionOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Add fused attention op whose logic is as follows: { - q = paddle.einsum('nbqa,ahc->nbqhc', q_data, self.query_w) + q = paddle.einsum('nbqa,ahc->nbqhc', q_data, self.query_w) k = paddle.einsum('nbka,ahc->nbkhc', m_data, self.key_w) v = paddle.einsum('nbka,ahc->nbkhc', m_data, self.value_w) @@ -189,10 +189,10 @@ class FusedGateAttentionOpMaker : public framework::OpProtoAndCheckerMaker { self.gating_w) + self.gating_b gate_values_1 = nn.functional.sigmoid(gate_values) weighted_avg *= gate_values_1 - + output = paddle.einsum('nbqhc,hco->nbqo', weighted_avg, self.output_w) + self.output_b - + } )DOC"); } diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc index f9366bace3..d14e30a5f7 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -164,32 +164,32 @@ class FusedGemmEpilogueOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output tensor Out of Out = Act((X * Y) + Bias)."); AddOutput("ReserveSpace", - R"DOC(Reserve GPU space to place - auxiliary data pointer. It is used to pass auxiliary data pointer - for fused_gemm_epilogue op. If not given (empty string), the + R"DOC(Reserve GPU space to place + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue op. 
If not given (empty string), the auxiliary mode would not be enable.)DOC") .AsDispensable() .AsExtra(); AddAttr( "trans_x", - R"DOC((bool, default false), Whether to transpose input tensor X - or not. The input tensor X coulbe be more than two dimension. When - set trans_x=true, it would fully reverse X. For instant: X with shpae + R"DOC((bool, default false), Whether to transpose input tensor X + or not. The input tensor X could be more than two dimensions. When + set trans_x=true, it would fully reverse X. For instance: X with shape [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC") .SetDefault(false); AddAttr( "trans_y", - R"DOC((bool, default false), Whether to transpose input tensor Y - or not. The input tensor Y should be two dimension. When - set trans_y=true, it would transpose Y. For instant: Y with shpae + R"DOC((bool, default false), Whether to transpose input tensor Y + or not. The input tensor Y should be two dimensions. When + set trans_y=true, it would transpose Y. For instance: Y with shape [d0, d1] -> [d1, d0].)DOC") .SetDefault(false); AddAttr( "activation", - R"DOC((string, default none), The activation function. It could be - one of {none, relu, gelu}. When none is given, Act would be null + R"DOC((string, default none), The activation function. It could be + one of {none, relu, gelu}. When none is given, Act would be null operations)DOC") .SetDefault("none"); @@ -337,9 +337,9 @@ class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor X of Out = (Act(X) * Y) + bias"); AddInput("Y", "The input tensor Y of Out = (Act(X) * Y) + bias"); AddInput("ReserveSpace", - R"DOC(A GPU space to fetch - auxiliary data pointer. It is used to pass auxiliary data pointer - for fused_gemm_epilogue_grad op. If not given (empty string), the + R"DOC(A GPU space to fetch + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue_grad op. If not given (empty string), the auxiliary mode would not be enable.)DOC") .AsDispensable(); @@ -352,23 +352,23 @@ class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddAttr( "trans_x", - R"DOC((bool, default false), Whether to transpose input tensor X - or not. The input tensor X coulbe be more than two dimension. When - set trans_x=true, it would fully reverse X. For instant: X with shpae + R"DOC((bool, default false), Whether to transpose input tensor X + or not. The input tensor X could be more than two dimensions. When + set trans_x=true, it would fully reverse X. For instance: X with shape [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC") .SetDefault(false); AddAttr( "trans_y", - R"DOC((bool, default false), Whether to transpose input tensor Y - or not. The input tensor Y should be two dimension. When - set trans_y=true, it would transpose Y. For instant: Y with shpae + R"DOC((bool, default false), Whether to transpose input tensor Y + or not. The input tensor Y should be two dimensions. When + set trans_y=true, it would transpose Y. For instance: Y with shape [d0, d1] -> [d1, d0].)DOC") .SetDefault(false); AddAttr( "activation_grad", - R"DOC((string, default none), The backward activation function. It could be - one of {none, relu_grad, gelu_grad}. When none is given, The backward Act would + R"DOC((string, default none), The backward activation function. It could be + one of {none, relu_grad, gelu_grad}.
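The forward contract Out = Act((X * Y) + Bias) with the trans_x/trans_y semantics documented above can be sketched in NumPy as follows (2-D X for brevity, gelu omitted; a reference for the semantics, not the fused kernel):

    import numpy as np

    def gemm_epilogue(x, y, bias, trans_x=False, trans_y=False, activation="none"):
        if trans_x:
            # "fully reverse X": [d0, d1, ..., dn] -> [dn, ..., d1, d0]
            x = x.transpose(tuple(reversed(range(x.ndim))))
        if trans_y:
            y = y.T  # [d0, d1] -> [d1, d0]
        out = x @ y + bias
        if activation == "relu":
            out = np.maximum(out, 0.0)
        return out

    x = np.random.randn(3, 4)   # (M, K)
    y = np.random.randn(4, 5)   # (K, N)
    bias = np.random.randn(5)
    print(gemm_epilogue(x, y, bias, activation="relu").shape)  # (3, 5)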
When none is given, The backward Act would be null operations)DOC") .SetDefault("none"); diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 9556ed1288..e2d2cf071c 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -251,7 +251,7 @@ void FusionGRUOpMaker::Make() { .SetDefault(false); AddComment(R"DOC( The Fusion complete GRU Operator. -This operator fuse the fully-connected operator into GRU, +This operator fuse the fully-connected operator into GRU, more details can refer to GRU op. )DOC"); } diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index cb948ea59d..6be6763492 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -79,7 +79,7 @@ void FusionSquaredMatSubOpMaker::Make() { AddAttr("scalar", "The scalar on output matrix.").SetDefault(1.f); AddComment(R"DOC( Fusion Squared Matrix and substrct operator. - + ( (X * Y).^2 - (X.^2 * Y.^2) ) .* scalar )DOC"); } diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index eab555985c..2a8917f1c0 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -219,7 +219,7 @@ void MultiGRUOpMaker::Make() { .SetDefault(false); AddComment(R"DOC( The Fusion complete GRU Operator. -This operator fuse the fully-connected operator into GRU, +This operator fuse the fully-connected operator into GRU, more details can refer to GRU op. )DOC"); } diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index 5852a5c04b..779e28c85b 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -274,10 +274,10 @@ class ResNetUnitOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("act_type", "The activation type to be fused.") .SetDefault("relu"); AddComment(R"DOC( -Fusion op of the basic unit of resnet block. +Fusion op of the basic unit of resnet block. The implementation is based on the latest fusion op interface in cuDNN v8.0. -For more details: +For more details: https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnFusedOps_t )DOC"); diff --git a/paddle/fluid/operators/fused_token_prune_op.cc b/paddle/fluid/operators/fused_token_prune_op.cc index 50ca45967b..da43ab7588 100644 --- a/paddle/fluid/operators/fused_token_prune_op.cc +++ b/paddle/fluid/operators/fused_token_prune_op.cc @@ -81,7 +81,7 @@ class FusedTokenPruneOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( fused_token_prune op is used to fuse multiple ops to perform token pruning. In this op: - 1. Elements of Attn will be set to zero if their corresponding mask is smaller than 0. + 1. Elements of Attn will be set to zero if their corresponding mask is smaller than 0. 2. The second dimension of X will be sorted by Attn. 3. The last (max_seq_len - slimmed_seq_len) lines of X will be pruned. 4. The remainning part of sorted X will output. 
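The four pruning steps listed above can be sketched for a single instance in NumPy; the shapes and names are illustrative, not the op's exact signature:

    import numpy as np

    max_seq_len, slimmed_seq_len, hidden = 6, 4, 3
    x = np.random.randn(max_seq_len, hidden)
    attn = np.random.randn(max_seq_len)
    mask = np.random.randn(max_seq_len)

    attn = np.where(mask < 0, 0.0, attn)    # 1. zero scores under a negative mask
    order = np.argsort(-attn)               # 2. sort tokens by Attn, descending
    slimmed_x = x[order][:slimmed_seq_len]  # 3./4. prune the tail, keep the rest
    print(slimmed_x.shape)                  # (4, 3)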
diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index 6ff1841786..59648bc7d1 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -59,13 +59,13 @@ class GatherNdOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Gather_Nd Operator. - This function is actually a high-dimensional extension of gather - and supports for simultaneous indexing by multiple axes. Out is - obtained by gathering slices from X into a tensor with shape + This function is actually a high-dimensional extension of gather + and supports for simultaneous indexing by multiple axes. Out is + obtained by gathering slices from X into a tensor with shape Index.shape[:-1] + X.shape[Index.shape[-1]:]. Example: - + Given: X = [[[ 0, 1, 2, 3], [ 4, 5, 6, 7], @@ -73,7 +73,7 @@ class GatherNdOpMaker : public framework::OpProtoAndCheckerMaker { [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]] - + X.shape = (2, 3, 4) *Case 1: @@ -81,7 +81,7 @@ class GatherNdOpMaker : public framework::OpProtoAndCheckerMaker { Index = [[1]] we get: - Out = + Out = [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]] @@ -91,7 +91,7 @@ class GatherNdOpMaker : public framework::OpProtoAndCheckerMaker { Index = [[0,2]] we get: - + Out = [8, 9, 10, 11] *Case 3: diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 80964323e6..8d92305eb6 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -161,7 +161,7 @@ REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, REGISTER_OP_VERSION(gaussian_random) .AddCheckpoint( R"ROC( - Upgrade gaussian_random add new inputs [ShapeTensor] and [ShapeTensorList] + Upgrade gaussian_random add new inputs [ShapeTensor] and [ShapeTensorList] and modify the attribute of [shape])ROC", paddle::framework::compatible::OpVersionDesc() .NewInput("ShapeTensor", diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index add87fdd3c..a16544b8ba 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -100,7 +100,7 @@ class GeluOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) use approximation of gelu") .SetDefault(false); AddComment(R"DOC( -Gelu Activation Operator. +Gelu Activation Operator. For more details, please refer to [Gaussian Error Linear Units](https://arxiv.org/pdf/1606.08415.pdf). diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index b954ecab70..c907ae2b70 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -83,10 +83,10 @@ Graph Learning Send_Recv combine operator. $Out = Recv(Send(X, Src_index), Dst_index, reduce_op)$ -This operator is mainly used in Graph Learning domain, and the main purpose is to reduce -intermediate memory consumption in the process of message passing. -Take `x` as the input tensor, we first use `src_index` to gather corresponding data, -and then use `dst_index` to update the corresponding position of output tensor in different +This operator is mainly used in Graph Learning domain, and the main purpose is to reduce +intermediate memory consumption in the process of message passing. 
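The Gather_Nd cases above are easy to check with plain NumPy indexing; the output shape is Index.shape[:-1] + X.shape[Index.shape[-1]:]:

    import numpy as np

    x = np.arange(24).reshape(2, 3, 4)

    index1 = np.array([[1]])     # Case 1: index the first axis only
    print(x[tuple(index1[0])])   # [[12 13 14 15] [16 17 18 19] [20 21 22 23]]

    index2 = np.array([[0, 2]])  # Case 2: index the first two axes
    print(x[tuple(index2[0])])   # [ 8  9 10 11]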
+Take `x` as the input tensor, we first use `src_index` to gather corresponding data, +and then use `dst_index` to update the corresponding position of output tensor in different pooling types, like sum, mean, max, or min. )DOC"); diff --git a/paddle/fluid/operators/graph_send_ue_recv_op.cc b/paddle/fluid/operators/graph_send_ue_recv_op.cc index af16609df3..6c38ee65e8 100644 --- a/paddle/fluid/operators/graph_send_ue_recv_op.cc +++ b/paddle/fluid/operators/graph_send_ue_recv_op.cc @@ -97,7 +97,7 @@ intermediate memory consumption in the process of message passing. Take `X` as the input tensor, we first use `src_index` to gather corresponding data. Then the gather data should compute with `Y` in different message_ops, like add, sub, mul, and div, -and get the computation result. Then, use `dst_index` to update the corresponding position of output +and get the computation result. Then, use `dst_index` to update the corresponding position of output tensor in different pooling types, like sum, mean, max, or min. )DOC"); diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 97ec937911..12b18bc55e 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -89,12 +89,12 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault("zeros"); AddComment(R"DOC( - This operation samples input X by using bilinear or nearest interpolation based on + This operation samples input X by using bilinear or nearest interpolation based on flow field grid, which is usually generated by affine_grid. The grid of - shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates - with shape [N, H, W] each, where grid_x is indexing the 4th dimension - (in width dimension) of input data x and grid_y is indexing the 3rd - dimension (in height dimension), finally results is the bilinear + shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates + with shape [N, H, W] each, where grid_x is indexing the 4th dimension + (in width dimension) of input data x and grid_y is indexing the 3rd + dimension (in height dimension), finally results is the bilinear interpolation value or nearest value of 4 nearest corner points. For bilinear interpolation mode: @@ -105,7 +105,7 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) Step 2: - Indices input data X with grid (x, y) in each [H, W] area, and bilinear + Indices input data X with grid (x, y) in each [H, W] area, and bilinear interpolate point value by 4 nearest points. wn ------- y_n ------- en diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index 143862350b..f111a379e1 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -63,7 +63,7 @@ class HashOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor) Input tensor of hash operator."); AddOutput("Out", "(Tensor) Output tensor of hash operator."); AddComment(R"DOC( - Execute `num_hash` times xxHash algorithm on all elements on second dimension of input. + Execute `num_hash` times xxHash algorithm on all elements on second dimension of input. 
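The gather/scatter pipeline Out = Recv(Send(X, Src_index), Dst_index, reduce_op) described above, sketched for reduce_op = sum (a semantic reference, not the memory-efficient fused kernel):

    import numpy as np

    x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    src_index = np.array([0, 1, 2, 0])
    dst_index = np.array([1, 2, 1, 0])

    sent = x[src_index]              # Send: gather rows by src_index
    out = np.zeros_like(x)
    np.add.at(out, dst_index, sent)  # Recv: scatter-sum rows by dst_index
    print(out)  # row 1 == x[0] + x[2], row 2 == x[1], row 0 == x[0]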
)DOC"); AddAttr("num_hash", "").SetDefault(1); AddAttr("mod_by", "").SetDefault(100000); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index 0d1006658a..6741af7638 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -82,7 +82,7 @@ take any values from (-inf, inf), but the labels should be either -1 or 1. Then, the hinge loss is computed as follows: $$ -L_(x, y) = max(1 - y.x, 0) +L_(x, y) = max(1 - y.x, 0) $$ Note that the labels passed as input will have values as either 0 or 1. diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index d044ae056e..4dddf28792 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -61,7 +61,7 @@ class IncrementOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Increment Operator. -The equation is: +The equation is: $$Out = X + step$$ )DOC"); diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index 7057b40545..0c5306e1d4 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -30,10 +30,10 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "Return the element of input at index"); AddComment(R"DOC( - IndexSample OP returns the element of the specified location of X, - and the location is specified by Index. + IndexSample OP returns the element of the specified location of X, + and the location is specified by Index. - X tensor and Index tensor's shape must be 2-D, + X tensor and Index tensor's shape must be 2-D, dimension at 0 which usually is batch size must be equal. The returned tensor has the same shape and dimensions as the Index tensor. diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 05b27f8d11..4c77e8b5b5 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -452,25 +452,25 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator samples input X to given output shape by using specified interpolation method, the interpolation methods can be \"nearest\" - for nearest neighbor interpolation and \"bilinear\" for bilinear + for nearest neighbor interpolation and \"bilinear\" for bilinear interpolation and \"linear\" for linear interpolation.. Nearest neighbor interpolation is to perform nearest neighbor interpolation - in both the 3rd dimension(in height direction) and the 4th dimension(in width + in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. - - Linear interpolation is the method of using a line connecting two known quantities - to determine the value of an unknown quantity between the two known quantities. - - Bilinear interpolation is an extension of linear interpolation for - interpolating functions of two variables (e.g. H-direction and - W-direction in this op) on a rectilinear 2D grid. The key idea is - to perform linear interpolation first in one direction, and then + + Linear interpolation is the method of using a line connecting two known quantities + to determine the value of an unknown quantity between the two known quantities. + + Bilinear interpolation is an extension of linear interpolation for + interpolating functions of two variables (e.g. 
H-direction and + W-direction in this op) on a rectilinear 2D grid. The key idea is + to perform linear interpolation first in one direction, and then again in the other direction. - Trilinear interpolation is an extension of linear interpolation for - interpolating functions of three variables (e.g. D-direction, - H-direction and W-direction in this op) on a rectilinear 3D grid. + Trilinear interpolation is an extension of linear interpolation for + interpolating functions of three variables (e.g. D-direction, + H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. Bicubic interpolation is an extension of cubic interpolation for interpolating @@ -478,24 +478,24 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { smoother than corresponding surfaces obtained by bilinear interpolation or nearest-neighbor interpolation. - Align_corners and align_mode are optional parameters,the calculation method + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. - + Example: For scale: - + if align_corners = True and out_{size}>1 : scale_{factor} = (in_{size}-1.0)/(out_{size}-1.0) - + else: - + scale_{factor} = float(in_{size}/out_{size}) - - + + Nearest neighbor interpolation: - + if: align_corners = False @@ -518,16 +518,16 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { if: align_corners = False , align_mode = 0 - + input : (N,C,H_in,W_in) output: (N,C,H_out,W_out) where: - + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: - + input : (N,C,H_in,W_in) output: (N,C,H_out,W_out) where: @@ -538,17 +538,17 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { if: align_corners = False , align_mode = 0 - + input : (N,C,D_in,H_in,W_in) output: (N,C,D_out,H_out,W_out) where: - + D_out = (D_{in}+0.5) * scale_{factor} - 0.5 H_out = (H_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: - + input : (N,C,D_in,H_in,W_in) output: (N,C,D_out,H_out,W_out) where: @@ -570,13 +570,13 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - For details of nearest neighbor interpolation, please refer to Wikipedia: + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation - For details of bilinear interpolation, please refer to Wikipedia: + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation - For details of trilinear interpolation, please refer to Wikipedia: + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation For details of bicubic interpolation, please refer to Wikipedia: diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index 07ecae637a..62d9c547fa 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -553,25 +553,25 @@ class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator samples input X to given output shape by using specified interpolation method, the interpolation methods can be \"nearest\" - for nearest neighbor interpolation and \"bilinear\" for bilinear + for nearest neighbor interpolation 
and \"bilinear\" for bilinear interpolation and \"linear\" for linear interpolation.. Nearest neighbor interpolation is to perform nearest neighbor interpolation - in both the 3rd dimension(in height direction) and the 4th dimension(in width + in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. - - Linear interpolation is the method of using a line connecting two known quantities - to determine the value of an unknown quantity between the two known quantities. - - Bilinear interpolation is an extension of linear interpolation for - interpolating functions of two variables (e.g. H-direction and - W-direction in this op) on a rectilinear 2D grid. The key idea is - to perform linear interpolation first in one direction, and then + + Linear interpolation is the method of using a line connecting two known quantities + to determine the value of an unknown quantity between the two known quantities. + + Bilinear interpolation is an extension of linear interpolation for + interpolating functions of two variables (e.g. H-direction and + W-direction in this op) on a rectilinear 2D grid. The key idea is + to perform linear interpolation first in one direction, and then again in the other direction. - Trilinear interpolation is an extension of linear interpolation for - interpolating functions of three variables (e.g. D-direction, - H-direction and W-direction in this op) on a rectilinear 3D grid. + Trilinear interpolation is an extension of linear interpolation for + interpolating functions of three variables (e.g. D-direction, + H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. Bicubic interpolation is an extension of cubic interpolation for interpolating @@ -579,24 +579,24 @@ class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { smoother than corresponding surfaces obtained by bilinear interpolation or nearest-neighbor interpolation. - Align_corners and align_mode are optional parameters,the calculation method + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. 
- + Example: For scale: - + if align_corners = True and out_{size}>1 : scale_{factor} = (in_{size}-1.0)/(out_{size}-1.0) - + else: - + scale_{factor} = float(in_{size}/out_{size}) - - + + Nearest neighbor interpolation: - + if: align_corners = False @@ -619,16 +619,16 @@ class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { if: align_corners = False , align_mode = 0 - + input : (N,C,H_in,W_in) output: (N,C,H_out,W_out) where: - + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: - + input : (N,C,H_in,W_in) output: (N,C,H_out,W_out) where: @@ -639,17 +639,17 @@ class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { if: align_corners = False , align_mode = 0 - + input : (N,C,D_in,H_in,W_in) output: (N,C,D_out,H_out,W_out) where: - + D_out = (D_{in}+0.5) * scale_{factor} - 0.5 H_out = (H_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: - + input : (N,C,D_in,H_in,W_in) output: (N,C,D_out,H_out,W_out) where: @@ -671,13 +671,13 @@ class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - For details of nearest neighbor interpolation, please refer to Wikipedia: + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation - For details of bilinear interpolation, please refer to Wikipedia: + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interp_v2olation - For details of trilinear interpolation, please refer to Wikipedia: + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interp_v2olation For details of bicubic interpolation, please refer to Wikipedia: diff --git a/paddle/fluid/operators/isclose_op.cc b/paddle/fluid/operators/isclose_op.cc index 68d241e4ac..8d0cd10097 100644 --- a/paddle/fluid/operators/isclose_op.cc +++ b/paddle/fluid/operators/isclose_op.cc @@ -46,7 +46,7 @@ class IscloseOpMaker : public framework::OpProtoAndCheckerMaker { "compared as equal. Default: :math:`False` .") .SetDefault(false); - AddComment(R"DOC( + AddComment(R"DOC( This operator checks if all :math:`x` and :math:`y` satisfy the condition: .. math:: diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index dbd2eb763d..decee5567b 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -72,19 +72,19 @@ class KLDivLossOpMaker : public framework::OpProtoAndCheckerMaker { While :math:`x` is Input(X) and :math:`y` is Input(Target). While :attr:`reduction` is :attr:`none`, output loss is in - the same shape as Input(X), loss in each point is calculated + the same shape as Input(X), loss in each point is calculated seperately and no reduction is applied. - + While :attr:`reduction` is :attr:`mean`, output loss is in shape of [1] and loss value is the mean value of all losses. - + While :attr:`reduction` is :attr:`sum`, output loss is in shape of [1] and loss value is the sum value of all losses. - - While :attr:`reduction` is :attr:`batchmean`, output loss is + + While :attr:`reduction` is :attr:`batchmean`, output loss is in shape of [1] and loss value is the sum value of all losses divided by batch size. 
- + )DOC"); } }; diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index cede00d5b0..d4fed2db47 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -63,14 +63,14 @@ class KronOpMaker : public framework::OpProtoAndCheckerMaker { Kron Operator. This operator computes the Kronecker product of two tensors, a - composite tensor made of blocks of the second tensor scaled by the + composite tensor made of blocks of the second tensor scaled by the first. This operator assumes that the rank of the two tensors, $X$ and $Y$ - are the same, if necessary prepending the smallest with ones. If the - shape of $X$ is [$r_0$, $r_1$, ..., $r_N$] and the shape of $Y$ is - [$s_0$, $s_1$, ..., $s_N$], then the shape of the output tensor is - [$r_{0}s_{0}$, $r_{1}s_{1}$, ..., $r_{N}s_{N}$]. The elements are + are the same, if necessary prepending the smallest with ones. If the + shape of $X$ is [$r_0$, $r_1$, ..., $r_N$] and the shape of $Y$ is + [$s_0$, $s_1$, ..., $s_N$], then the shape of the output tensor is + [$r_{0}s_{0}$, $r_{1}s_{1}$, ..., $r_{N}s_{N}$]. The elements are products of elements from $X$ and $Y$. The equation is: diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index 873ab62a3d..72813e76c7 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -92,23 +92,23 @@ class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( LabelSmooth Operator. -Label smoothing is a mechanism to regularize the classifier layer. In machine -learning, optimizing the log-likelihood of the correct label directly may -cause two problems. First, it may result in overfitting: if the model learns +Label smoothing is a mechanism to regularize the classifier layer. In machine +learning, optimizing the log-likelihood of the correct label directly may +cause two problems. First, it may result in overfitting: if the model learns to assign full probability to the ground-truth label for each training example, -it is not guaranteed to generalize. Second, it encourages the differences -between the largest logit and all others to become large, reducing the ability -of the model to adapt. Label smoothing is proposed to encourage the model to -be less confident, which replaces the ground-truth label $y$ with the weighted +it is not guaranteed to generalize. Second, it encourages the differences +between the largest logit and all others to become large, reducing the ability +of the model to adapt. Label smoothing is proposed to encourage the model to +be less confident, which replaces the ground-truth label $y$ with the weighted sum of itself and some fixed distribution $\mu$, i.e. $$ \tilde{y} = (1 - \epsilon) * y + \epsilon * \mu, $$ -where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and -$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for -$\mu$. This change in the ground-truth label is called label-smoothing +where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and +$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for +$\mu$. This change in the ground-truth label is called label-smoothing regularization or LSR. See more details about label smoothing in https://arxiv.org/abs/1512.00567. 
diff --git a/paddle/fluid/operators/logspace_op.cc b/paddle/fluid/operators/logspace_op.cc index ac326004a1..5e5e25a56d 100644 --- a/paddle/fluid/operators/logspace_op.cc +++ b/paddle/fluid/operators/logspace_op.cc @@ -54,11 +54,11 @@ class LogspaceOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("dtype", "The output data type."); AddOutput("Out", "A sequence of numbers."); AddComment(R"DOC( - Return fixed number of logarithmical-evenly spaced values within a given - interval. First entry is exponential of Start with base Base, and last - entry is exponential of Stop with base Base. In the case when Num is 1, - only exponential of Start with base Base is returned. If dtype is int32 - or int64, the decimal part of values will be truncated. + Return a fixed number of logarithmically evenly spaced values within a given + interval. The first entry is the exponential of Start with base Base, and the last + entry is the exponential of Stop with base Base. In the case when Num is 1, + only the exponential of Start with base Base is returned. If dtype is int32 + or int64, the decimal part of the values will be truncated. Like logspace function of numpy. )DOC"); } diff --git a/paddle/fluid/operators/lookup_table_dequant_op.cc b/paddle/fluid/operators/lookup_table_dequant_op.cc index f5b15af4a4..e0ca707ffa 100644 --- a/paddle/fluid/operators/lookup_table_dequant_op.cc +++ b/paddle/fluid/operators/lookup_table_dequant_op.cc @@ -114,7 +114,7 @@ Lookup Table Dequant Operator. The `W` input is a quantized parameter for the sake of saving memories. This operator first index embeddings with `Ids`, -then dequantizes them and contact them as output (`Out`). +then dequantizes them and concatenates them as output (`Out`). The input Ids can carry the LoD (Level of Details) information, or not. And the output only shares the LoD information with input Ids. diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index 84e4e5cd2c..156fc55fb6 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -259,11 +259,11 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator. -LSTMP has a separate projection layer after the LSTM layer, projecting the -original hidden state to a lower-dimensional one, which is proposed to reduce -the number of total parameters and furthermore computational complexity for -the LSTM, espeacially for the case that the size of output units is relative -large (https://research.google.com/pubs/archive/43905.pdf). +LSTMP has a separate projection layer after the LSTM layer, projecting the +original hidden state to a lower-dimensional one, which is proposed to reduce +the number of total parameters and, furthermore, the computational complexity of +the LSTM, especially for the case that the size of the output units is relatively +large (https://research.google.com/pubs/archive/43905.pdf). The formula is as follows: @@ -291,14 +291,14 @@ denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ is the activation, such as logistic sigmoid function, and $i, f, o$ and $c$ are the input gate, forget gate, output gate, and cell activation vectors, respectively, all of which have the same size as -the cell output activation vector $h$. Here $h$ is usually called the hidden -state and $r$ denotes its recurrent projection.
And $\tilde{c_t}$ is also -called the candidate hidden state, whose computation is based on the current +the cell output activation vector $h$. Here $h$ is usually called the hidden +state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also +called the candidate hidden state, whose computation is based on the current input and previous hidden state. The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ are the cell input and cell output activation functions and `tanh` is usually -used for them. $\overline{act_h}$ is the activation function for the +used for them. $\overline{act_h}$ is the activation function for the projection output, usually using `identity` or same as $act_h$. Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ diff --git a/paddle/fluid/operators/lu_op.cc b/paddle/fluid/operators/lu_op.cc index c6831f975c..923c14f3db 100644 --- a/paddle/fluid/operators/lu_op.cc +++ b/paddle/fluid/operators/lu_op.cc @@ -24,7 +24,7 @@ namespace operators { class LUOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddComment(R"DOC(LU decomposition, + AddComment(R"DOC(LU decomposition, Computes the LU factorization of a matrix or batches of matrices A. )DOC"); AddInput("X", "(Tensor) The input tensor, shape of (*,m,n)"); diff --git a/paddle/fluid/operators/lu_unpack_op.cc b/paddle/fluid/operators/lu_unpack_op.cc index 988cba4398..9f631a60c1 100644 --- a/paddle/fluid/operators/lu_unpack_op.cc +++ b/paddle/fluid/operators/lu_unpack_op.cc @@ -24,7 +24,7 @@ namespace operators { class LU_UnpackOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddComment(R"DOC(Unpack L U and P to single matrix tensor, + AddComment(R"DOC(Unpack L U and P to single matrix tensor, unpack L and U matrix from LU, unpack permutation matrix Pmat from Pivtos . )DOC"); AddInput("X", "(Tensor) The input LU tensor, shape of (*,m,n)"); diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc index 44f77afee0..47ed77cbfb 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -102,19 +102,19 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker { MarginRankLoss Operator. This operator measures the loss given a pair of training sample -{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` -indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss +{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` +indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss is calculated as: $loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$ The attribute `margin` here helps make the predictions more robust. -Denote the item ranked higher as the positive sample, otherwise the negative -sample. If the score of the two samples satisfies +Denote the item ranked higher as the positive sample, otherwise the negative +sample. If the score of the two samples satisfies $positive sample - negative sample < margin$ -the pair of samples will contribute to the final loss, which will backpropagate +the pair of samples will contribute to the final loss, which will backpropagate and train the ranking model to enlarge the difference between the two scores. 
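The loss equation above can be evaluated element-wise over a batch in NumPy (a sketch of the formula, not the operator):

    import numpy as np

    margin = 0.1
    x1 = np.array([0.8, 0.2])
    x2 = np.array([0.5, 0.6])
    label = np.array([1.0, 1.0])  # X1 should rank higher than X2

    loss = np.maximum(0.0, -label * (x1 - x2) + margin)
    print(loss)  # [0.  0.5] -- only the mis-ranked pair contributes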
For batch input with size `batch_size`, `X1`, `X2` and `Label` diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index 80313f156f..820e754049 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -230,9 +230,9 @@ void MatchMatrixTensorOpMaker::Make() { Match Matrix Tensor Operator This operator calculate X * W * Y, only support 2-D for X and Y. - the output is a level-1 LodTensor: + the output is a level-1 LodTensor: level_0: dim_t - + NOTE: only support 'float32' data type now. )DOC"); diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 8f233d7650..3b32acc8d7 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -193,8 +193,8 @@ class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker { "doing multiplication") .SetDefault(false); AddComment( - R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K), - B has shape (d0, d1 ... K, N), Out has shape ((d0, d1 ... M, N)). + R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K), + B has shape (d0, d1 ... K, N), Out has shape ((d0, d1 ... M, N)). In addition, it also follows the broadcast rule which is similar as numpy.matmul. )DOC"); diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc index 315708cc05..0e75629f71 100644 --- a/paddle/fluid/operators/mean_iou_op.cc +++ b/paddle/fluid/operators/mean_iou_op.cc @@ -87,10 +87,10 @@ class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker { mean-IOU Operator. Mean Intersection-Over-Union is a common evaluation metric for semantic image segmentation, which first computes the IOU for each -semantic class and then computes the average over classes. -IOU is defined as follows: +semantic class and then computes the average over classes. +IOU is defined as follows: IOU = true_positive / (true_positive + false_positive + false_negative). -It is based on pixel level area while "IOU Similarity Operator" +It is based on pixel level area while "IOU Similarity Operator" is based on area of rectangle. )DOC"); diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc index 9fb06c5968..273b1fe7c9 100644 --- a/paddle/fluid/operators/memcpy_op.cc +++ b/paddle/fluid/operators/memcpy_op.cc @@ -118,7 +118,7 @@ class MemcpyOpProtoMaker : public framework::OpProtoAndCheckerMaker { "6: dst is on CustomDevicePlace"); AddComment(R"DOC( Memcpy Operator. - By now, it ONLY supports the memcopy between CUDAPinnedPlace <-> CUDAPlace or + By now, it ONLY supports the memcopy between CUDAPinnedPlace <-> CUDAPlace or NPUPlace <-> CPUPlace, and used as an internal op by Recompute-Offload. You would have to update it if you want other more capacities. diff --git a/paddle/fluid/operators/meshgrid_op.cc b/paddle/fluid/operators/meshgrid_op.cc index 924711d9a3..0b95200c12 100644 --- a/paddle/fluid/operators/meshgrid_op.cc +++ b/paddle/fluid/operators/meshgrid_op.cc @@ -65,7 +65,7 @@ Take: N tensors, each of which can be either scalr or 1-dimensional vector, and N-dimensional grids. Args: - tensors (list of tensor): if the input k tensors has (N1,), (N2,),..., (Nk,), then + tensors (list of tensor): if the input k tensors has (N1,), (N2,),..., (Nk,), then the output tensors are all of size (N1, N2, ...., Nk). 
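The mean-IOU definition quoted above (IOU = TP / (TP + FP + FN), averaged over classes) in a pixel-level NumPy sketch (a reference for the metric, not the kernel):

    import numpy as np

    num_classes = 2
    pred  = np.array([0, 0, 1, 1, 1])
    label = np.array([0, 1, 1, 1, 0])

    ious = []
    for c in range(num_classes):
        tp = np.sum((pred == c) & (label == c))
        fp = np.sum((pred == c) & (label != c))
        fn = np.sum((pred != c) & (label == c))
        ious.append(tp / float(tp + fp + fn))
    print(np.mean(ious))  # (1/3 + 2/4) / 2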
Example:: diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 3665e035d4..f8e57adc70 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -44,7 +44,7 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Total", "The samples count of current batch"); AddComment(R"DOC( -Accuracy Operator. +Accuracy Operator. It will print accuracy rate for classification. The accuracy is calculated as follows: @@ -52,7 +52,7 @@ The accuracy is calculated as follows: $$accuracy = \frac{NumOfCorrectPredicts}{NumOfAllSamples}$$ Both the input Out and Label can carry the LoD (Level of Details) -information, or not. But the output only shares the LoD information +information, or not. But the output only shares the LoD information with the input Out(Inference). )DOC"); diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc index 75a55f377e..fce3028ab7 100644 --- a/paddle/fluid/operators/mode_op.cc +++ b/paddle/fluid/operators/mode_op.cc @@ -51,7 +51,7 @@ class ModeOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(-1); AddAttr("keepdim", "Keep the dim that to reduce.").SetDefault(false); AddComment(R"DOC( -This operator finds the mode of input Tensor. And outputs their values and indices as vectors. +This operator finds the mode of input Tensor. And outputs their values and indices as vectors. )DOC"); } }; diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index 17f323d0bc..e0e64bb0c2 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -86,7 +86,7 @@ Since target Y is not differentiable, calculating gradient for Y is illegal. The formula of modified huber loss is: $$ -L(y, f(x)) = +L(y, f(x)) = \begin{cases} (\max(0, 1 - yf(x)))^2, \text{if} \ yf(x) >= -1 \\ -4yf(x), \quad \text{otherwise} diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index 14191d018b..782b67d90e 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -82,10 +82,10 @@ The loss can be described as: $Out[i] = -X[Label[i]]*Weight[Label[i]]$ -It can also be used for higher dimension inputs, such as 2D images, by -providing an input of shape (batch_size, C, d1, d2, ..., dK), with -K >= 1, where K is the number of dimensions, and a Label of -appropriate shape. In the case of images, it computes NLL loss +It can also be used for higher dimension inputs, such as 2D images, by +providing an input of shape (batch_size, C, d1, d2, ..., dK), with +K >= 1, where K is the number of dimensions, and a Label of +appropriate shape. In the case of images, it computes NLL loss per-pixel. )DOC"); diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 76737f2bc3..9754628b1b 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -54,7 +54,7 @@ y = \frac{x}{ \sqrt{\sum {x^2} + epsion }} $$ where, $\sum {x^2}$ is calculated along the `axis` dimension. 
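The normalization formula directly above (where "epsion" is presumably epsilon), applied along the `axis` dimension, in NumPy (a sketch of the equation, not the norm kernel):

    import numpy as np

    def l2_normalize(x, axis=1, epsilon=1e-10):
        return x / np.sqrt(np.sum(np.square(x), axis=axis, keepdims=True) + epsilon)

    x = np.array([[3.0, 4.0], [6.0, 8.0]])
    print(l2_normalize(x))  # rows [0.6 0.8], each (nearly) unit length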
- + )DOC"); } }; diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc index ad1262a7d2..d058b890cb 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.cc +++ b/paddle/fluid/operators/optimizers/dpsgd_op.cc @@ -116,7 +116,7 @@ class DpsgdOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Dpsgd Optimizer. -We implement the Dpsgd optimizer according to CCS16 paper - +We implement the Dpsgd optimizer according to CCS16 paper - Deep Learning with Differential Privacy. Dpsgd updates: diff --git a/paddle/fluid/operators/optimizers/lamb_op.cc b/paddle/fluid/operators/optimizers/lamb_op.cc index cc3c99f9b1..e9d6ab77f4 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cc +++ b/paddle/fluid/operators/optimizers/lamb_op.cc @@ -101,8 +101,8 @@ class LambOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer. -LAMB Optimizer is designed to scale up the batch size of training without losing -accuracy, which supports adaptive element-wise updating and accurate layer-wise +LAMB Optimizer is designed to scale up the batch size of training without losing +accuracy, which supports adaptive element-wise updating and accurate layer-wise correction. For more information, please refer to https://arxiv.org/abs/1904.00962. The updating of parameters follows: @@ -121,7 +121,7 @@ r_t &= \frac{m_t}{\sqrt{v_t}+\epsilon} \\ w_t &= w_{t-1} -\eta_t \frac{\left \| w_{t-1}\right \|}{\left \| r_t + \lambda w_{t-1}\right \|} (r_t + \lambda w_{t-1}) $$ -where $m$ is the 1st moment, and $v$ the 2nd moment, $\eta$ the +where $m$ is the 1st moment, and $v$ the 2nd moment, $\eta$ the learning rate, $\lambda$ the weight decay rate. )DOC"); } diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc index f576827f9c..d3d45ad3c6 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc @@ -62,11 +62,11 @@ class Pow2DecayWithLinearWarmupOpMaker AddComment(R"DOC( The Pow2DecayWithLinearWarmup learning rate scheduler. 
-When step_num < warmup_steps, lr = base_lr * step_num / warmup_steps +When step_num < warmup_steps, lr = base_lr * step_num / warmup_steps -When warmup_steps <= step_num <= total_steps, - factor = 1 - (step_num - warmup_steps) / (total_steps - warmup_steps) - lr = (base_lr - end_lr) * factor * factor + end_lr +When warmup_steps <= step_num <= total_steps, + factor = 1 - (step_num - warmup_steps) / (total_steps - warmup_steps) + lr = (base_lr - end_lr) * factor * factor + end_lr When step_num > total_steps, lr = end_lr diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc index 072e39dd91..de280a6788 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc @@ -119,9 +119,9 @@ param = sign(prox\_param) / (1 + learning\_rate * l2) * \max(|prox\_param| - learning\_rate * l1 , 0) $$ -The paper that proposed Proximal GD: +The paper that proposed Proximal GD: (http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf) -Here, we use the adagrad learning rate as specified here: +Here, we use the adagrad learning rate as specified here: (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) )DOC"); diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index 5067686367..2460b30fa2 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -92,7 +92,7 @@ $$ prox\_param = param - learning\_rate * grad \\ param = sign(prox\_param) / (1 + learning\_rate * l2) * \max(|prox\_param| - learning\_rate * l1, 0) -$$ +$$ The paper that proposed Proximal Gradient Descent: (http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf) diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc index d80a5d8900..3e923d34a0 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op.cc @@ -66,7 +66,7 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("centered", "(bool, default false) use centered rmsprop.") .SetDefault(false); AddComment(R"DOC( -Rmsprop Optimizer. +Rmsprop Optimizer. $$ MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad \\ diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 0f2873d73e..66aef5fe4e 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -775,7 +775,7 @@ class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault("NCHW"); AddComment(R"DOC( Pad2d Operator. -Pad 2-d images according to 'paddings' and 'mode'. +Pad 2-d images according to 'paddings' and 'mode'. If mode is 'reflect', paddings[0] and paddings[1] must be no greater than height-1. And the width dimension has the same condition. diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index 301c21b2fc..6141e6e98b 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -113,7 +113,7 @@ class Pad3dOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault("NCDHW"); AddComment(R"DOC( Pad3d Operator. -Pad 3-d images according to 'paddings' and 'mode'. +Pad 3-d images according to 'paddings' and 'mode'. If mode is 'reflect', paddings[0] and paddings[1] must be no greater than width-1. 
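The three branches of the Pow2DecayWithLinearWarmup schedule quoted above fit in one function (a sketch of the documented formulas):

    def pow2_warmup_lr(step_num, warmup_steps, total_steps, base_lr, end_lr):
        if step_num < warmup_steps:
            return base_lr * step_num / warmup_steps
        if step_num <= total_steps:
            factor = 1 - (step_num - warmup_steps) / (total_steps - warmup_steps)
            return (base_lr - end_lr) * factor * factor + end_lr
        return end_lr

    for s in (50, 100, 550, 1000, 1200):
        print(s, pow2_warmup_lr(s, 100, 1000, 0.1, 0.0))
    # 0.05 at mid-warmup, 0.1 at the boundary, quadratic decay to 0.0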
The height and depth dimension have the same condition. diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 962feb2ee9..fb4a90ebd8 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -59,7 +59,7 @@ class PadOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Pad Operator. -Pad input into output, as specified by paddings and pad_value. +Pad input into output, as specified by paddings and pad_value. The input should be a k-D tensor(k > 0 and k < 7). As an example: Given: diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index 148eb8806b..eb8271edcc 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -161,9 +161,9 @@ class PartialSumOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(-1); AddComment(R"DOC( PartialSum Operator. -This Op can sum the vars by specifying the initial position(start_index) and length(length). +This Op can sum the vars by specifying the initial position(start_index) and length(length). This OP exists in contrib, which means that it is not shown to the public. -Only 2-D Tensor or LodTensor input is supported. Slice and concat can only be +Only 2-D Tensor or LodTensor input is supported. Slice and concat can only be performed along the second dimension. Examples: diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index e73abba60d..64e7321706 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -60,9 +60,9 @@ class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { with a stride of :math:`1/r`. Please refer to the paper: - `Real-Time Single Image and Video Super-Resolution Using an Efficient + `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network `_ - by Shi et. al (2016) for more details. + by Shi et. al (2016) for more details. )DOC"); } diff --git a/paddle/fluid/operators/pixel_unshuffle_op.cc b/paddle/fluid/operators/pixel_unshuffle_op.cc index 9e31a8567a..e0ac5283ab 100644 --- a/paddle/fluid/operators/pixel_unshuffle_op.cc +++ b/paddle/fluid/operators/pixel_unshuffle_op.cc @@ -55,9 +55,9 @@ class PixelUnshuffleOpMaker : public framework::OpProtoAndCheckerMaker { This operation is the reversion of PixelShuffle operation. Please refer to the paper: - `Real-Time Single Image and Video Super-Resolution Using an Efficient + `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network `_ - by Shi et. al (2016) for more details. + by Shi et. al (2016) for more details. )DOC"); } diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index 4ae7b2f170..57aef714a0 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -119,7 +119,7 @@ MaxPool2d Operator. The maxPooling2d with index operation calculates the output and the mask based on the input, ksize, strides, and paddings parameters. Input(X) and output(Out, Mask) are in NCHW format, where N is batch size, C is the -number of channels, H is the height of the feature, +number of channels, H is the height of the feature, and W is the width of the feature. Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively. 
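The PartialSum behaviour described above (slice each 2-D input along the second dimension at start_index with the given length, then sum the slices) in NumPy (a sketch of the semantics, not the operator):

    import numpy as np

    start_index, length = 1, 2
    xs = [np.arange(8.0).reshape(2, 4), np.ones((2, 4))]

    out = sum(x[:, start_index:start_index + length] for x in xs)
    print(out)  # [[2. 3.] [6. 7.]], shape (2, 2)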
@@ -136,12 +136,12 @@ Example: H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ - + For adaptive = true: $$ H_{out} = ksize[0] W_{out} = ksize[1] $$ - + )DOC"); } @@ -210,7 +210,7 @@ The maxpooling3d with index operation calculates the output and the mask based on the input and ksize, strides, paddings parameters. Input(X) and output(Out, Mask) are in NCDHW format, where N is batch size, C is the number of channels, and D, H and W are the depth, height and -width of the feature, respectively. +width of the feature, respectively. Parameters(ksize, strides, paddings) are three elements. These three elements represent depth, height and width, respectively. The input(X) size and output(Out, Mask) size may be different. @@ -227,7 +227,7 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 $$ - + For adaptive = true: $$ D_{out} = ksize[0] H_{out} = ksize[1] W_{out} = ksize[2] diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index f3a4af3b67..e85e51d9eb 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -70,8 +70,8 @@ class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(1); AddComment(R"Doc( Position sensitive region of interest pooling (also known as PSROIPooling) is to perform -position-sensitive average pooling on regions of interest specified by input, takes as -input N position-sensitive score maps and a list of num_rois regions of interest. +position-sensitive average pooling on regions of interest specified by input, takes as +input N position-sensitive score maps and a list of num_rois regions of interest. PSROIPooling for R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details. )Doc"); diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index b86cd9538a..6736cb4c87 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -79,7 +79,7 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator takes a batch of instance, and do random cropping on each instance. It means that cropping positions differs on each instance, which is determined - by an uniform random generator. All cropped instances have the same shape, which + by an uniform random generator. All cropped instances have the same shape, which is determined by the operator's attribute 'shape'. )DOC"); } diff --git a/paddle/fluid/operators/randperm_op.cc b/paddle/fluid/operators/randperm_op.cc index 565707853e..78366efc53 100644 --- a/paddle/fluid/operators/randperm_op.cc +++ b/paddle/fluid/operators/randperm_op.cc @@ -71,7 +71,7 @@ class RandpermOpMaker : public framework::OpProtoAndCheckerMaker { "Default: 0.") .SetDefault(0); - AddComment(R"DOC( + AddComment(R"DOC( This operator returns a random permutation of integers from 0 to n-1. )DOC"); } diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc index 716fc58d41..f68e1668aa 100644 --- a/paddle/fluid/operators/rank_attention_op.cc +++ b/paddle/fluid/operators/rank_attention_op.cc @@ -146,7 +146,7 @@ class RankAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0); AddComment(R"DOC( RankAttention Operator. 
-This Op can calculate rank attention between input and rank_param, +This Op can calculate rank attention between input and rank_param, and rank_param gives the organization of data. Notice: It currently supports GPU device. This Op exists in contrib, which means that it is not shown to the public. )DOC"); diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index b353b2992c..2daf8c5d6b 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -155,7 +155,7 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { RankLoss Operator. RankLoss operator for RankNet -(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf). +(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf). RankNet is a pairwise ranking model with one training sample consisting of a pair of doc A and B, and the label P indicating that A is ranked higher than B or not: @@ -164,8 +164,8 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of the input pair. The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label -(P_{i,j}), which represent the output score of RankNet for the two docs and -the label respectively, and yields the rank loss C_{i,j} using the following +(P_{i,j}), which represent the output score of RankNet for the two docs and +the label respectively, and yields the rank loss C_{i,j} using the following equation: $$ diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 32e772374b..617c47530c 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -30,10 +30,10 @@ class RealOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "(Tensor), The input tensor of real op."); AddOutput("Out", "(Tensor), The output tensor of real op."); - AddComment(R"DOC( -Real Operator. + AddComment(R"DOC( +Real Operator. -This operator is used to get a new tensor containing real values +This operator is used to get a new tensor containing real values from a tensor with complex data type. )DOC"); diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc index c3f61d4d2b..6778855bcb 100644 --- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc @@ -70,7 +70,7 @@ X = [Slice0, Slice1, Slice2, Slice3] and its LoD information is empty. The indices in RankTable are [3, 0, 2, 1]. Out = [Slice3, Slice0, Slice2, Slice1] with no LoD information is appended. -**NOTE**: +**NOTE**: This operator sorts Input(X) according to a given LoDRankTable which does not need to be calculated according to Input(X). It can be calculated according to another different sequence, and then this operator sorts Input(X) according diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc index 4e99be4e52..810a73d89d 100644 --- a/paddle/fluid/operators/reverse_op.cc +++ b/paddle/fluid/operators/reverse_op.cc @@ -69,7 +69,7 @@ class ReverseOpMaker : public framework::OpProtoAndCheckerMaker { Out = [[11, 12, 13, 14, 15] [6, 7, 8, 9, 10] [1, 2, 3, 4, 5]]. 
- + Case 2: Given X = [[[1, 2, 3, 4] diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index c02dc492c3..922d255bbe 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -116,17 +116,17 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker { **RoIAlign Operator** Region of interest align (also known as RoI align) is to perform -bilinear interpolation on inputs of nonuniform sizes to obtain +bilinear interpolation on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7). It divides each region proposal into equal-sized sections with the pooled_width and pooled_height. The location remains the original result. -In each ROI bin, the value of the four regularly sampled locations +In each ROI bin, the values of the four regularly sampled locations are computed directly through bilinear interpolation. The output is the mean of the four locations. -Thus avoid the misaligned problem. +This avoids the misalignment problem. )DOC"); } }; diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 0878c33247..c95e235aff 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -124,7 +124,7 @@ The operator has three steps: 3. Copying these max values to the output buffer -ROI Pooling for Faster-RCNN. The link below is a further introduction: +ROI Pooling for Faster-RCNN. The link below is a further introduction: https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn )DOC"); } diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index 9f66073c36..7ac1d4b8d4 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -85,10 +85,10 @@ class RollOpMaker : public framework::OpProtoAndCheckerMaker { "with shifts or size == 0") .SetDefault({}); AddComment(R"DOC( - Roll the tensor along the given dimension(s). + Roll the tensor along the given dimension(s). Elements that are shifted beyond the last position are re-introduced at the first position. If a dimension - is not specified, the tensor will be flattened before + is not specified, the tensor will be flattened before rolling and then restored to the original shape. )DOC"); } diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index fc39d174c9..1bf471641d 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -100,20 +100,20 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( :strong:`Row-convolution operator` -The row convolution is called lookahead convolution. This operator was +The row convolution is called lookahead convolution. This operator was introduced in the following paper for DeepSpeech2: -http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf +http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf -The main motivation is that a bidirectional RNN, useful in DeepSpeech -like speech models, learns representation for a sequence by performing a -forward and a backward pass through the entire sequence. However, unlike +The main motivation is that a bidirectional RNN, useful in DeepSpeech +like speech models, learns representation for a sequence by performing a +forward and a backward pass through the entire sequence. However, unlike unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online -and low-latency setting.
The lookahead convolution incorporates information -from future subsequences in a computationally efficient manner to improve -unidirectional recurrent neural networks. The row convolution operator is +and low-latency setting. The lookahead convolution incorporates information +from future subsequences in a computationally efficient manner to improve +unidirectional recurrent neural networks. The row convolution operator is different from the 1D sequence convolution, and is computed as follows: -Given an input sequence $X$ of length $t$ and input dimension $D$, +Given an input sequence $X$ of length $t$ and input dimension $D$, and a filter ($W$) of size $context \times D$, the output sequence is convolved as: diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index 0d384eef8a..45fee045cb 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -133,14 +133,14 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( RunProgram operator. -The RunProgram operator receives a program's feed targets, fetch targets, -and parameters, and receives the forward and backward program desc +The RunProgram operator receives a program's feed targets, fetch targets, +and parameters, and receives the forward and backward program desc as attributes, and then executes the program by executor. -NOTE: This operator is added so that the inference model stored by -`fluid.io.save_inference_model` under the static graph mode can be loaded +NOTE: This operator is added so that the inference model stored by +`fluid.io.save_inference_model` under the static graph mode can be loaded under the dynamic graph mode for fine-tuning or inferencing. - + )DOC"); } }; diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc index 6f23eeebaf..ee9abf6f35 100644 --- a/paddle/fluid/operators/sample_logits_op.cc +++ b/paddle/fluid/operators/sample_logits_op.cc @@ -102,7 +102,7 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( """ Computes sampled output training logits and labels suitable for implementing - sampled softmax. + sampled softmax. """ )DOC"); diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc index afe43a337b..1beb06366e 100644 --- a/paddle/fluid/operators/searchsorted_op.cc +++ b/paddle/fluid/operators/searchsorted_op.cc @@ -54,7 +54,7 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { Searchsorted Operator. This OP is used to find the index of the corresponding sorted_sequence in the innermost dimension based on the given values. - + )DOC"); } }; diff --git a/paddle/fluid/operators/select_output_op.cc b/paddle/fluid/operators/select_output_op.cc index 0cb7e058a6..ad9cbf22cf 100644 --- a/paddle/fluid/operators/select_output_op.cc +++ b/paddle/fluid/operators/select_output_op.cc @@ -83,7 +83,7 @@ class SelectOutputOpProtoMaker : public framework::OpProtoAndCheckerMaker { // (minimal viable product) here. AddComment(R"DOC( Split input variable into one output branch. The mask is an integer tensor to -specify which output branch should copy the input. +specify which output branch should copy the input. 
)DOC"); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index de55f1ab52..337ea46b26 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -59,10 +59,10 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { Sequence Enumerate Operator. Generate a new sequence for the input index sequence, which enumerates all the -sub-sequences with length `win_size` of the input. +sub-sequences with length `win_size` of the input. The enumerated sequence has the same 1st dimension with variable `input`, and the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. - + Examples: Case 1: Input: diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc index c64b568e53..2943b88959 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc @@ -62,17 +62,17 @@ class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Sequence Erase Operator. -Sequence erase operator erases tokens specified by Attr(tokens) from the input -sequences Input(X), and outputs the remaining data and modifies the LoD +Sequence erase operator erases tokens specified by Attr(tokens) from the input +sequences Input(X), and outputs the remaining data and modifies the LoD information at the same time. For example, given a 2-D LoDTensor X = [[2, 2, 6, 1, 3, 9, 6, 1, 0, 1]]^T with lod = [[0, 3, 6, 10]], there are three sequences in the input: - + X1 = [[2, 2, 6]]^T, X2 = [[1, 3, 9]]^T and X3 = [[6, 1, 0, 1]]^T. -If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing +If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing operation, the three sequences become X1' = [[6]]^T, X2' = [[1, 9]]^T and X3' = [[6, 1, 0, 1]]^T. @@ -83,8 +83,8 @@ Hence the LoDTensor Output(Out) should be with lod = [[0, 1, 3, 7]]. -An example usage for this operator is to remove the special tokens when -computing the edit distance between two strings, such as blank, start token, +An example usage for this operator is to remove the special tokens when +computing the edit distance between two strings, such as blank, start token, and end token. )DOC"); } diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index 2ed9c44f59..8ea756e455 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -85,7 +85,7 @@ This operator outputs a Mask according to Input(X) and Attr(maxlen). 
Supposing Input(X) is a Tensor with shape [d_1, d_2, ..., d_n], the Output(Y) is a mask with shape [d_1, d_2, ..., d_n, maxlen], where: -Y(i_1, i_2, ..., i_n, j) = (j < X(i_1, i_2, ..., i_n)) +Y(i_1, i_2, ..., i_n, j) = (j < X(i_1, i_2, ..., i_n)) If maxlen < 0, maxlen = max(X) )DOC"); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index e5c84d45d5..d427e339fb 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -170,9 +170,9 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Sequence Pad Operator - This operator pads sequences in a same batch to a consistent length. - The length is specified by attribute 'padded_length'. New elements, - whose values are specified by input 'PadValue', will be appended to + This operator pads sequences in the same batch to a consistent length. + The length is specified by attribute 'padded_length'. New elements, + whose values are specified by input 'PadValue', will be appended to the end of each sequence, to make their final lengths consistent. Following are cases to better explain how this works: @@ -186,10 +186,10 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { PadValue.data = [0] and attribute 'padded_length' = 4, then we get LoDTensor: - Out.data = [[a, b, 0, 0], + Out.data = [[a, b, 0, 0], [c, d, e, 0]] Length.data = [2, 3] - + Case 2: Given a 1-level LoDTensor input(X): X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]] and Input(PadValue): PadValue.data = [0] - and attribite 'padded_length' = -1, which mean using the length + and attribute 'padded_length' = -1, which means using the length of the longest input sequence (3 in this case), then we get LoDTensor: - Out.data = [[[a1, a2], [b1, b2], [0, 0]], + Out.data = [[[a1, a2], [b1, b2], [0, 0]], [[c1, c2], [d1, d2], [e1, e2]]] Length.data = [2, 3] - + Case 3: Given a 1-level LoDTensor input(X): X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]] and Input(PadValue): PadValue.data = [p1, p2] - and attribite 'padded_length' = -1, which mean using the length + and attribute 'padded_length' = -1, which means using the length of the longest input sequence (3 in this case), then we get LoDTensor: - Out.data = [[[a1, a2], [b1, b2], [p1, p2]], + Out.data = [[[a1, a2], [b1, b2], [p1, p2]], [[c1, c2], [d1, d2], [e1, e2]]] Length.data = [2, 3] diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 613dc8bfbc..fe91dd00d4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -108,8 +108,8 @@ class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Sequence Unpad Operator - This operator removes the padding data in the input sequences and convert - them into sequences with actual length as output, identitied by lod + This operator removes the padding data in the input sequences and converts + them into sequences with actual length as output, identified by lod information.
Example: @@ -117,9 +117,9 @@ class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { Given input tensor Input(X): X.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], [ 6.0, 7.0, 8.0, 9.0, 10.0], - [11.0, 12.0, 13.0, 14.0, 15.0]], -` - in which there are 3 sequences padded to length 5, and the actual length + [11.0, 12.0, 13.0, 14.0, 15.0]], +` + in which there are 3 sequences padded to length 5, and the actual length is specified by Input(Length): Length.data = [2, 3, 4], @@ -127,7 +127,7 @@ class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { after unpadding, Output(Out) will be: Out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] - Out.lod = [[0, 2, 5, 9]] + Out.lod = [[0, 2, 5, 9]] )DOC"); } diff --git a/paddle/fluid/operators/shard_index_op.cc b/paddle/fluid/operators/shard_index_op.cc index bd09ff9921..e601a50409 100644 --- a/paddle/fluid/operators/shard_index_op.cc +++ b/paddle/fluid/operators/shard_index_op.cc @@ -55,10 +55,10 @@ class ShardIndexOpMaker : public framework::OpProtoAndCheckerMaker { This layer creates the sharded index for input. This layer is used in model- and data-parallel mixed training generally, in which the index data (usually the label) should be recalculated in each trainer according -to +to .. math:: - + assert index_num % nshards == 0 shard_size = index_num / nshards @@ -76,13 +76,13 @@ Examples: X is a Tensor of integer values: X.shape = [4, 1] X.data = [[1], [6], [12], [19]] - + suppose index_num = 20 and nshards = 2, then we get shard_size = 10 - + if shard_id == 0, we get the Out: Out.shape = [4, 1] Out.data = [[1], [6], [-1], [-1]] - + if shard_id == 1, we get the Out: Out.shape = [4, 1] Out.data = [[-1], [-1], [2], [9]] diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc index f390e99da6..5c5343bf42 100644 --- a/paddle/fluid/operators/similarity_focus_op.cc +++ b/paddle/fluid/operators/similarity_focus_op.cc @@ -35,17 +35,17 @@ class SimilarityFocusOpMaker : public framework::OpProtoAndCheckerMaker { SimilarityFocus Operator. Generate a similarity focus mask with the same shape of input using the following method: -1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding - to the axis according to the indexes. For example, if axis=1 and indexes=[a], - it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X +1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding + to the axis according to the indexes. For example, if axis=1 and indexes=[a], + it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C). -2. For each index, find the largest numbers in the tensor T, so that the same - row and same column has at most one number(what it means is that if the - largest number has been found in the i-th row and the j-th column, then - the numbers in the i-th row or j-th column will be skipped. And then the - next largest number will be selected from the remaining numbers. Obviously - there will be min(B, C) numbers), and mark the corresponding position of the - 3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for +2. For each index, find the largest numbers in the tensor T, so that the same + row and same column have at most one number (this means that if the + largest number has been found in the i-th row and the j-th column, then + the numbers in the i-th row or j-th column will be skipped.
And then the + next largest number will be selected from the remaining numbers. Obviously + there will be min(B, C) numbers), and mark the corresponding position of the + 3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for each index. 3. Broadcast the 3-D similarity focus mask to the same shape of input X. diff --git a/paddle/fluid/operators/sparse_attention_op.cc b/paddle/fluid/operators/sparse_attention_op.cc index 6f867c05e2..48dc3d7824 100644 --- a/paddle/fluid/operators/sparse_attention_op.cc +++ b/paddle/fluid/operators/sparse_attention_op.cc @@ -66,8 +66,8 @@ class SparseAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddComment(R"DOC( Compute the value of the sparse attention module. Its input value includes five tensors. - Q, K, and V represent query, key, and value in the Attention module, respectively. - The CSR format is used to represent the sparsity feature in the Attention module. + Q, K, and V represent query, key, and value in the Attention module, respectively. + The CSR format is used to represent the sparsity feature in the Attention module. The CSR format contains two tensors, offset and columns. )DOC"); } diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc index 1d47a10d56..19a846afd4 100644 --- a/paddle/fluid/operators/spectral_norm_op.cc +++ b/paddle/fluid/operators/spectral_norm_op.cc @@ -114,7 +114,7 @@ class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker { $$\sigma{\mathbf{W}} = \mathbf{u}^{T} \mathbf{W} \mathbf{v}$$ - For details of spectral normalization, please refer to paper: + For details of spectral normalization, please refer to the paper: `Spectral Normalization `_ . )DOC"); } diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc index 55d307cf08..dc1848b3ee 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.cc +++ b/paddle/fluid/operators/squared_l2_distance_op.cc @@ -140,15 +140,15 @@ class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( SquaredL2Distance operator -This operator will cacluate the squared L2 distance for the input and -the target. Number of distance value will be equal to the first dimension -of input. First dimension of the target could be equal to the input or to 1. -If the first dimension of target is 1, the operator will broadcast target's -first dimension to input's first dimension. During backward propagation, -the user can decide whether to calculate the gradient of the input or +This operator will calculate the squared L2 distance for the input and +the target. The number of distance values will be equal to the first dimension +of the input. The first dimension of the target can equal that of the input or be 1. +If the first dimension of the target is 1, the operator will broadcast the target's +first dimension to the input's first dimension. During backward propagation, +the user can decide whether to calculate the gradient of the input or the target or both. -Both the input X and Y can carry the LoD (Level of Details) information. +Both inputs X and Y can carry the LoD (Level of Details) information. However, the output only shares the LoD information with input X.
)DOC"); } diff --git a/paddle/fluid/operators/tdm_child_op.cc b/paddle/fluid/operators/tdm_child_op.cc index 1e98035039..c91f0b989e 100644 --- a/paddle/fluid/operators/tdm_child_op.cc +++ b/paddle/fluid/operators/tdm_child_op.cc @@ -49,7 +49,7 @@ class TDMChildOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(2); AddComment(R"DOC(" **Tdm Child** - According to the input node_id on the given tree, return the corresponding child node_id and + According to the input node_id on the given tree, return the corresponding child node_id and whether child is a leaf node by LeafMask.")DOC"); } }; diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index 37d9897f72..4525d431ff 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -226,7 +226,7 @@ It's similarity to SigmoidCrossEntropyWithLogits Operator. The difference is tha we add another label(z') to original. loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) z is click or not - z' is teacher value + z' is teacher value label = {-2, -1, [0, 2]} when z' is not exist, clk = 0 : label = -2; when z' is not exist, clk = 1 : label = -1; diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc index 82f14a2691..ca446fcb97 100644 --- a/paddle/fluid/operators/temporal_shift_op.cc +++ b/paddle/fluid/operators/temporal_shift_op.cc @@ -74,20 +74,20 @@ class TemporalShiftOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator calculates the temporal shifting features for Input(X). - Input(X) should be in shape of [N*T, C, H, W] or [N*T, H, W, C], while - N is the batch size, T is the temporal segment number specified by - :attr:`seg_num`, C is the channel number, H and W is the height and + Input(X) should be in shape of [N*T, C, H, W] or [N*T, H, W, C], while + N is the batch size, T is the temporal segment number specified by + :attr:`seg_num`, C is the channel number, H and W is the height and width of features. Temporal Shifting is calculated as follows when data format is NCHW: - + Step 1: Reshape Input(X) to [N, T, C, H, W]. - Step 2: Pad 0 to reshaping result in the 2nd(T) dimension with - padding width as 1 on each side, padding result will be in shape + Step 2: Pad 0 to reshaping result in the 2nd(T) dimension with + padding width as 1 on each side, padding result will be in shape of [N, T+2, C, H, W]. - Step 3: Assume :attr:`shift_ratio` is :math:`1/4`, slice padding + Step 3: Assume :attr:`shift_ratio` is :math:`1/4`, slice padding result as follows: $$ @@ -100,10 +100,10 @@ class TemporalShiftOpMaker : public framework::OpProtoAndCheckerMaker { slice3 = x[:, 1:T+1, C/2:, :, :] $$ - Step 4: Concatenate three slices along the 3rd(C) dimension and + Step 4: Concatenate three slices along the 3rd(C) dimension and reshape result to [N*T, C, H, W]. - For details of temporal shifting, please refer to paper: + For details of temporal shifting, please refer to paper: `Temporal Shift Module `_ . 
)DOC"); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 4b56300374..afc18010bb 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -90,8 +90,8 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Top K operator -If the input is a vector (1d tensor), this operator finds the k largest -entries in the vector and outputs their values and indices as vectors. +If the input is a vector (1d tensor), this operator finds the k largest +entries in the vector and outputs their values and indices as vectors. Thus values[j] is the j-th largest entry in input, and its index is indices[j]. For matrices, this operator computes the top k entries in each row. )DOC"); diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index 2f915c959f..b1b68eb1ed 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -51,8 +51,8 @@ class TopkV2OpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Top K operator -If the input is a vector (1d tensor), this operator finds the k largest -entries in the vector and outputs their values and indices as vectors. +If the input is a vector (1d tensor), this operator finds the k largest +entries in the vector and outputs their values and indices as vectors. Thus values[j] is the j-th largest entry in input, and its index is indices[j]. For matrices, this operator computes the top k entries in each row. )DOC"); diff --git a/paddle/fluid/operators/tril_indices_op.cc b/paddle/fluid/operators/tril_indices_op.cc index c8123dfdf8..bae34fa5f5 100644 --- a/paddle/fluid/operators/tril_indices_op.cc +++ b/paddle/fluid/operators/tril_indices_op.cc @@ -61,9 +61,9 @@ class TrilIndicesOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( TrilIndices Operator. - The tril_indices operator returns the indices of the lower triangular part of the matrix - whose rows and cols is knowed. It is a 2-by-x tensor,where the first row contains row coordinates - of all indices and the second row contains column coordinates. Indices are ordered based on + The tril_indices operator returns the indices of the lower triangular part of the matrix + whose rows and cols is knowed. It is a 2-by-x tensor,where the first row contains row coordinates + of all indices and the second row contains column coordinates. Indices are ordered based on rows and then columns. The lower triangular part of the matrix is defined as the elements on and below the diagonal. diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index e81a734c16..5d2c3c0797 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -41,9 +41,9 @@ class TrilTriuOpMaker : public framework::OpProtoAndCheckerMaker { TrilTriu Operator. The tril operator returns the lower triangular part of the matrix (2-D tensor) -or batch of matrices $input$. The lower triangular part of the matrix is defined +or batch of matrices $input$. The lower triangular part of the matrix is defined as the elements on and below the diagonal. -The triu operator returns the upper triangular part of a matrix (2-D tensor) +The triu operator returns the upper triangular part of a matrix (2-D tensor) or batch of matrices $input$. The upper triangular part of the matrix is defined as the elements on and above the diagonal. The other elements of the result tensor out are set to 0. 
diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 8bb0b20402..b8de9df202 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -54,7 +54,7 @@ class UnfoldOpMaker : public framework::OpProtoAndCheckerMaker { This Operator is used to extract sliding local blocks from a batched input tensor, also known as im2col when operated on batched 2D image tensor. For each block under the convolution filter, all elements will be rearranged as a column. As the convolution filter slides over the input -feature map, a series of such columns will be formed. +feature map, a series of such columns will be formed. )DOC"); } }; diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc index 3f5c24fe4b..4d772e50e6 100644 --- a/paddle/fluid/operators/unique_op.cc +++ b/paddle/fluid/operators/unique_op.cc @@ -153,9 +153,9 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddComment(R"DOC( 1. Return a unique subsequence for 1-D input tensor, and an index tensor - pointing to this unique subsequence when Attr(is_sorted) is false. This + pointing to this unique subsequence when Attr(is_sorted) is false. This means paddle.unique is called. - + 2. Returns the unique elements of X in ascending order when Attr(is_sorted) is true. This means fluid.layers.unique is called. )DOC"); diff --git a/paddle/fluid/operators/unique_with_counts_op.cc b/paddle/fluid/operators/unique_with_counts_op.cc index b86eb72e7d..6e60078f6a 100644 --- a/paddle/fluid/operators/unique_with_counts_op.cc +++ b/paddle/fluid/operators/unique_with_counts_op.cc @@ -64,7 +64,7 @@ class UniqueWithCountsOpMaker : public framework::OpProtoAndCheckerMaker { "the attr `dtype`"); AddOutput("Count", "A subsequence for the count of unique index"); AddComment(R"DOC( - Return a unique subsequence for 1-D input tensor, index tensor pointing to this unique subsequence, + Return a unique subsequence for the 1-D input tensor, an index tensor pointing to this unique subsequence, and the subsequence for the count of unique index. )DOC"); } diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index 3aec6b8356..eb7421019b 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -51,9 +51,9 @@ void VarConv2dOpMaker::Make() { AddComment(R"DOC( Var Size Conv Operator - This operator calculate Out = \sigma \left ( W * X + b \right ), + This operator calculates Out = \sigma \left ( W * X + b \right ), and only supports 2-D for X. - + NOTE: only the 'float32' data type is supported now.
)DOC"); diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 287628c85e..d2e097994f 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -130,10 +130,10 @@ void ChromeTracingLogger::LogMemTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "[memory]", "pid": %lld, "tid": "%lld(C++)", - "ts": %lld, - "ph": "i", "cat": "%s", + "ts": %lld, + "ph": "i", "cat": "%s", "args": { "place": "%s", "addr": "%llu", @@ -196,10 +196,10 @@ void ChromeTracingLogger::LogHostTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "%s[%s]", "pid": %lld, "tid": "%lld(Python)", "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "thread_state_runnable", "args": { "start_time": "%.3f us", @@ -223,10 +223,10 @@ void ChromeTracingLogger::LogHostTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "thread_state_runnable", "args": { "start_time": "%.3f us", @@ -263,10 +263,10 @@ void ChromeTracingLogger::LogHostTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "thread_state_runnable", "args": { "start_time": "%.3f us", @@ -304,10 +304,10 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "thread_state_running", "args": { "correlation id": %d, @@ -331,9 +331,9 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "launch", "id": %d, "pid": %lld, "tid": "%lld(C++)", - "ts": %lld, + "ts": %lld, "ph": "s", "cat": "async" }, )JSON"), @@ -365,9 +365,9 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( if (nsToUs(device_node.Duration()) == 0) { output_file_stream_ << string_format(std::string( R"JSON( - { + { "name": "launch", "id": %d, "pid": %lld, "tid": %lld, - "ts": %lld, + "ts": %lld, "ph": "f", "cat": "async" }, )JSON"), @@ -381,9 +381,9 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "launch", "id": %d, "pid": %lld, "tid": %lld, - "ts": %lld, + "ts": %lld, "ph": "f", "cat": "async", "bp": "e" }, )JSON"), @@ -410,10 +410,10 @@ void ChromeTracingLogger::HandleTypeKernel( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "%s[%s]", "pid": %lld, "tid": %lld, "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "cq_build_failed", "args": { "start_time": "%.3f us", @@ -476,7 +476,7 @@ void ChromeTracingLogger::HandleTypeMemcpy( { "name": "%s[%s]", "pid": %lld, "tid": %lld, "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "cq_build_failed", "args": { "start_time": "%.3f us", @@ -517,7 +517,7 @@ void ChromeTracingLogger::HandleTypeMemset( { "name": "%s[%s]", "pid": %lld, "tid": %lld, "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": 
"cq_build_failed", "args": { "start_time": "%.3f us", @@ -548,7 +548,7 @@ void ChromeTracingLogger::HandleTypeMemset( void ChromeTracingLogger::StartLog() { output_file_stream_ << std::string( R"JSON( - { + { "displayTimeUnit": "ms",)JSON"); } @@ -717,49 +717,49 @@ void ChromeTracingLogger::RefineDisplayName( R"JSON( { "name": "process_name", "pid": %lld, "tid": "%lld(Python)", - "ph": "M", + "ph": "M", "args": { "name": "Process %lld (CPU)" } }, { "name": "process_name", "pid": %lld, "tid": "%lld(C++)", - "ph": "M", + "ph": "M", "args": { "name": "Process %lld (CPU)" } }, { "name": "thread_name", "pid": %lld, "tid": "%lld(Python)", - "ph": "M", + "ph": "M", "args": { "name": "thread %lld:%s(Python)" } }, { "name": "thread_name", "pid": %lld, "tid": "%lld(C++)", - "ph": "M", + "ph": "M", "args": { "name": "thread %lld:%s(C++)" } }, { "name": "process_sort_index", "pid": %lld, "tid": %lld, - "ph": "M", + "ph": "M", "args": { "sort_index": %lld } - }, + }, { "name": "thread_sort_index", "pid": %lld, "tid": "%lld(Python)", - "ph": "M", + "ph": "M", "args": { "sort_index": %lld } }, { "name": "thread_sort_index", "pid": %lld, "tid": "%lld(C++)", - "ph": "M", + "ph": "M", "args": { "sort_index": %lld } @@ -803,32 +803,32 @@ void ChromeTracingLogger::RefineDisplayName( R"JSON( { "name": "process_name", "pid": %lld, "tid": %lld, - "ph": "M", + "ph": "M", "args": { "name": "Deivce %lld (%s)" } }, { "name": "thread_name", "pid": %lld, "tid": %lld, - "ph": "M", + "ph": "M", "args": { "name": "stream %lld" } }, { "name": "process_sort_index", "pid": %lld, "tid": %lld, - "ph": "M", + "ph": "M", "args": { "sort_index": %lld } - }, + }, { "name": "thread_sort_index", "pid": %lld, "tid": %lld, - "ph": "M", + "ph": "M", "args": { "sort_index": %lld } - }, + }, )JSON"), (*it).first, (*it).second, diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 66cd20340c..65e759d3b2 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -98,12 +98,12 @@ void BindCudaStream(py::module *m_ptr) { The handle of the CUDA stream. Parameters: - device(paddle.CUDAPlace()|int|None, optional): The device which wanted to allocate the stream. - If device is None or negative integer, device will be the current device. - If device is positive integer, it must less than the device count. Default: None. + device(paddle.CUDAPlace()|int|None, optional): The device which wanted to allocate the stream. + If device is None or negative integer, device will be the current device. + If device is positive integer, it must less than the device count. Default: None. priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal). - If priority is None, the priority is 2(normal). Default: None. + If priority is None, the priority is 2(normal). Default: None. Examples: .. code-block:: python @@ -126,7 +126,7 @@ void BindCudaStream(py::module *m_ptr) { Parameters: event(CUDAEvent): The event to wait on. - + Examples: .. code-block:: python @@ -149,7 +149,7 @@ void BindCudaStream(py::module *m_ptr) { Parameters: stream(CUDAStream): The stream to synchronize with. - + Examples: .. code-block:: python @@ -207,7 +207,7 @@ void BindCudaStream(py::module *m_ptr) { Parameters: event(CUDAEvent, optional): The event to be record. If event is None, a new event is created. Default: None. - + Returns: The recored event. 
@@ -238,7 +238,7 @@ void BindCudaStream(py::module *m_ptr) { import ctypes cuda_stream = paddle.device.cuda.current_stream().cuda_stream print(cuda_stream) - + ptr = ctypes.c_void_p(cuda_stream) # convert back to void* print(ptr) @@ -322,7 +322,7 @@ void BindCudaStream(py::module *m_ptr) { enable_timing(bool, optional): Whether the event will measure time. Default: False. blocking(bool, optional): Whether the wait() func will be blocking. Default: False. interprocess(bool, optional): Whether the event can be shared between processes. Default: False. - + Examples: .. code-block:: python @@ -345,7 +345,7 @@ void BindCudaStream(py::module *m_ptr) { Parameters: stream(CUDAStream, optional): The handle of CUDA stream. If None, the stream is the current stream. Default: None. - + Examples: .. code-block:: python @@ -353,7 +353,7 @@ void BindCudaStream(py::module *m_ptr) { import paddle event = paddle.device.cuda.Event() event.record() - + )DOC", py::arg("stream") = nullptr) .def( diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index aeaa0dbff7..3dc87f0f7c 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1231,7 +1231,7 @@ void BindImperative(py::module *m_ptr) { }, R"DOC( Returns a numpy array that shows the value of the current Tensor. - + Returns: ndarray: The numpy value of current Tensor. @@ -1348,10 +1348,10 @@ void BindImperative(py::module *m_ptr) { # Due to sharing of data with origin Tensor, There are some unsafe operations: y = 2 * x detach_x[:] = 5.0 - y.backward() + y.backward() # It will raise Error: # one of the variables needed for gradient computation has been modified by an inplace operation. - + )DOC") .def("clear_gradient", &imperative::VarBase::ClearGradient, @@ -1618,7 +1618,7 @@ void BindImperative(py::module *m_ptr) { import paddle x = paddle.to_tensor(1.0, place=paddle.CUDAPlace(0)) print(x.place) # CUDAPlace(0) - + y = x.cpu() print(y.place) # CPUPlace @@ -1708,12 +1708,12 @@ void BindImperative(py::module *m_ptr) { R"DOC( Returns a copy of this Tensor in GPU memory. - If this Tensor is already in GPU memory and device_id is default, + If this Tensor is already in GPU memory and device_id is default, then no copy is performed and the original Tensor is returned. - + Args: device_id(int, optional): The destination GPU device id. Default: None, means current device. - blocking(bool, optional): If False and the source is in pinned memory, the copy will be + blocking(bool, optional): If False and the source is in pinned memory, the copy will be asynchronous with respect to the host. Otherwise, the argument has no effect. Default: False. Examples: @@ -1726,7 +1726,7 @@ void BindImperative(py::module *m_ptr) { y = x.cuda() print(y.place) # Place(gpu:0) - + y = x.cuda(None) print(y.place) # Place(gpu:0) @@ -2011,7 +2011,7 @@ void BindImperative(py::module *m_ptr) { }) .def("element_size", &imperative::VarBase::ElementSize, R"DOC( Returns the size in bytes of an element in the Tensor. - + Examples: .. code-block:: python @@ -2076,8 +2076,8 @@ void BindImperative(py::module *m_ptr) { R"DOC( Whether a Tensor is leaf Tensor. - For the Tensor whose stop_gradient is ``True`` , it will be leaf Tensor. - + For the Tensor whose stop_gradient is ``True`` , it will be leaf Tensor. + For the Tensor whose stop_gradient is ``False`` , it will be leaf Tensor too if it is created by user.
Returns: @@ -2721,7 +2721,7 @@ void BindImperative(py::module *m_ptr) { Returns: - new_tensor(paddle.Tensor): Return the UVA Tensor with the sample dtype and + new_tensor(paddle.Tensor): Return the UVA Tensor with the same dtype and shape as the input numpy array. Examples: .. code-block:: python # required: gpu import numpy as np import paddle - + data = np.random.randint(10, size=(3, 4)) tensor = paddle.fluid.core.to_uva_tensor(data) print(tensor) @@ -2834,38 +2834,38 @@ void BindImperative(py::module *m_ptr) { } }, R"DOC( - This api provides a way to write pieces of source tensor to destination tensor - inplacely and asynchronously. In which, we use `offset` and `count` to determine - where to copy. `offset` means the begin points of the copy pieces of `src`, and - `count` means the lengths of the copy pieces of `src`. To be noted, the copy process - will run asynchronously from cuda to pin memory. We can simply remember this as + This api provides a way to write pieces of a source tensor to a destination tensor + in place and asynchronously. Here, we use `offset` and `count` to determine + where to copy. `offset` means the begin points of the copy pieces of `src`, and + `count` means the lengths of the copy pieces of `src`. Note that the copy process + runs asynchronously from cuda to pin memory. We can simply remember this as "gpu async_write to pin_memory". - + Arguments: - - src (Tensor): The source tensor, and the data type should be `float32` currently. + + src (Tensor): The source tensor, and the data type should be `float32` currently. Besides, `src` should be placed on CUDAPlace. - dst (Tensor): The destination tensor, and the data type should be `float32` currently. - Besides, `dst` should be placed on CUDAPinnedPlace. The shape of `dst` - should be the same with `src` except for the first dimension. + dst (Tensor): The destination tensor, and the data type should be `float32` currently. + Besides, `dst` should be placed on CUDAPinnedPlace. The shape of `dst` + should be the same with `src` except for the first dimension. - offset (Tensor): The offset tensor, and the data type should be `int64` currently. - Besides, `offset` should be placed on CPUPlace. The shape of `offset` - should be one-dimensional. - - count (Tensor): The count tensor, and the data type should be `int64` currently. - Besides, `count` should be placed on CPUPlace. The shape of `count` - should be one-dimensinal. + offset (Tensor): The offset tensor, and the data type should be `int64` currently. + Besides, `offset` should be placed on CPUPlace. The shape of `offset` + should be one-dimensional. + + count (Tensor): The count tensor, and the data type should be `int64` currently. + Besides, `count` should be placed on CPUPlace. The shape of `count` + should be one-dimensional. Examples: .. code-block:: python import numpy as np import paddle - from paddle.fluid import core + from paddle.fluid import core from paddle.device import cuda - + if core.is_compiled_with_cuda(): src = paddle.rand(shape=[100, 50, 50]) dst = paddle.empty(shape=[200, 50, 50]).pin_memory() @@ -3058,38 +3058,38 @@ void BindImperative(py::module *m_ptr) { stream); }, R"DOC( - This api provides a way to read from pieces of source tensor to destination tensor - asynchronously. In which, we use `index`, `offset` and `count` to determine where - to read. `index` means the index position of src tensor we want to read.
`offset` - and count means the begin points and length of pieces of src tensor we want to read. - To be noted, the copy process will run asynchronously from pin memory to cuda place. + This api provides a way to read pieces of a source tensor into a destination tensor + asynchronously. Here, we use `index`, `offset` and `count` to determine where + to read. `index` means the index positions of the src tensor we want to read. `offset` + and `count` mean the begin points and lengths of the pieces of the src tensor we want to read. + Note that the copy process runs asynchronously from pin memory to cuda place. We can simply remember this as "cuda async_read from pin_memory". Arguments: - - src (Tensor): The source tensor, and the data type should be `float32` currently. + + src (Tensor): The source tensor, and the data type should be `float32` currently. Besides, `src` should be placed on CUDAPinnedPlace. - - dst (Tensor): The destination tensor, and the data type should be `float32` currently. - Besides, `dst` should be placed on CUDAPlace. The shape of `dst` should + + dst (Tensor): The destination tensor, and the data type should be `float32` currently. + Besides, `dst` should be placed on CUDAPlace. The shape of `dst` should be the same with `src` except for the first dimension. - index (Tensor): The index tensor, and the data type should be `int64` currently. - Besides, `index` should be on CPUplace. The shape of `index` should + index (Tensor): The index tensor, and the data type should be `int64` currently. + Besides, `index` should be on CPUPlace. The shape of `index` should be one-dimensional. - buffer (Tensor): The buffer tensor, used to buffer index copy tensor temporarily. - The data type should be `float32` currently, and should be placed + buffer (Tensor): The buffer tensor, used to buffer the index copy tensor temporarily. + The data type should be `float32` currently, and should be placed on CUDAPinnedPlace. The shape of `buffer` should be the same with `src` except for the first dimension. - offset (Tensor): The offset tensor, and the data type should be `int64` currently. - Besides, `offset` should be placed on CPUPlace. The shape of `offset` + offset (Tensor): The offset tensor, and the data type should be `int64` currently. + Besides, `offset` should be placed on CPUPlace. The shape of `offset` should be one-dimensional. - count (Tensor): The count tensor, and the data type should be `int64` currently. - Besides, `count` should be placed on CPUPlace. The shape of `count` + count (Tensor): The count tensor, and the data type should be `int64` currently. + Besides, `count` should be placed on CPUPlace. The shape of `count` should be one-dimensional. - + Examples: .. code-block:: python @@ -3108,11 +3108,11 @@ void BindImperative(py::module *m_ptr) { buffer = paddle.empty(shape=[50, 50, 50], dtype="float32").pin_memory() index = paddle.to_tensor( np.array([1, 3, 5, 7, 9], dtype="int64")).cpu() - + stream = cuda.Stream() with cuda.stream_guard(stream): core.async_read(src, dst, index, buffer, offset, count) - + )DOC"); #endif } diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc index 0b44dc5d2a..02be0e9693 100644 --- a/paddle/fluid/pybind/parallel_executor.cc +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -296,9 +296,9 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Default 100. .. note:: - 1.
If you fetch data when calling the 'run', the ParallelExecutor - will clean up the temp variables at the end of the current iteration. - 2. In some NLP model, it may cause the GPU memory is insufficient, + 1. If you fetch data when calling the 'run', the ParallelExecutor + will clean up the temp variables at the end of the current iteration. + 2. In some NLP models, it may cause the GPU memory to be insufficient; in this case, you should reduce `num_iteration_per_drop_scope`. Examples: @@ -859,7 +859,7 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT synchronous batch normalization which synchronizes the mean and variance through multi-devices in training phase. Current implementation doesn't support FP16 training and CPU. - And only synchronous on one machine, not all machines. + It is only synchronous on one machine, not across all machines. Default is False. Examples: @@ -897,9 +897,9 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT R"DOC((bool, optional): memory optimize aims to save total memory consumption, set to True to enable it. - Default None. None means framework would choose to use or not use - this strategy automatically. Currently, None means that it is - enabled when GC is disabled, and disabled when GC is enabled. + Default None. None means the framework would choose whether to use + this strategy automatically. Currently, None means that it is + enabled when GC is disabled, and disabled when GC is enabled. True means enabling and False means disabling. Default is None. Examples: @@ -912,7 +912,7 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT build_strategy = static.BuildStrategy() build_strategy.memory_optimize = True - + )DOC") .def_property( "is_distribution", diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0044f037fc..d69e670cc6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1038,7 +1038,7 @@ All parameter, weight, gradient are variables in Paddle. py::arg("name"), R"DOC( Find variable named :code:`name` in the current scope or - its parent scope. Return None if not found. + its parent scope. Return None if not found. Args: name (str): the variable name. @@ -1053,7 +1053,7 @@ All parameter, weight, gradient are variables in Paddle. py::arg("names"), R"DOC( Find variable named :code:`name` in the current scope or - its parent scope. Return None if not found. + its parent scope. Return None if not found. Args: name (str): the variable names to be erased. @@ -1248,12 +1248,12 @@ All parameter, weight, gradient are variables in Paddle. R"DOC( Prune the backward part of a program, mostly called in program.clone(for_test=True). - + Args: program (ProgramDesc): The original program. Returns: - tuple(ProgramDesc, map): The first part is + tuple(ProgramDesc, map): The first part is the pruned program desc, and the second part is a map which contains the id pair of pruned block and corresponding origin block. @@ -1873,7 +1873,7 @@ All parameter, weight, gradient are variables in Paddle. py::arg("tensor"), R"DOC( Append a LoDTensor to LoDTensorArray. - + Args: tensor (LoDTensor): The LoDTensor to be appended. diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 8396a970bd..8152a11c81 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -430,7 +430,7 @@ void BindTensor(pybind11::module &m) { // NOLINT py::arg("zero_copy") = false, R"DOC( Set the data of the Tensor on the given place with the given numpy array.
- + Args: lod (numpy.ndarray): The data to set. place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the @@ -613,7 +613,7 @@ void BindTensor(pybind11::module &m) { // NOLINT Args: recursive_sequence_lengths (list[list[int]]): The recursive sequence lengths. - + Returns: None. @@ -644,7 +644,7 @@ void BindTensor(pybind11::module &m) { // NOLINT Returns: list[list[int]]: The lod of the Tensor. - + Examples: .. code-block:: python @@ -668,7 +668,7 @@ void BindTensor(pybind11::module &m) { // NOLINT return new_lod; }, R"DOC( - Return the recursive sequence lengths corresponding to of the LodD + Return the recursive sequence lengths corresponding to the LoD of the Tensor. Returns: diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 8da2623bb2..251916d8c1 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -425,7 +425,7 @@ struct EnforceNotMet : public std::exception { * __ROLE: (string), Input or Output * __NAME: (string), Input or Output name * __OP_TYPE: (string), the op type - *   + * * Return: The data pointed to by the pointer. * * Examples: -- GitLab
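[Editorial aside, not part of the patch: the tensor.cc hunk above documents how length-based recursive sequence lengths relate to the offset-based LoD. A minimal sketch, assuming the fluid.core.LoDTensor API named in the diffed docstrings:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    t = fluid.core.LoDTensor()
    t.set(np.arange(9, dtype="float32").reshape(9, 1), fluid.core.CPUPlace())
    # lengths [[2, 3, 4]] correspond to offset-based LoD [[0, 2, 5, 9]]
    t.set_recursive_sequence_lengths([[2, 3, 4]])
    print(t.recursive_sequence_lengths())  # [[2, 3, 4]]
    print(t.lod())                         # [[0, 2, 5, 9]]

This mirrors the sequence_unpad example earlier in the patch, where Length.data = [2, 3, 4] yields Out.lod = [[0, 2, 5, 9]].]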