From de8c0ba53236e2ea19f85153f0e606acc7efed78 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Wed, 14 Sep 2022 21:56:19 +0800 Subject: [PATCH] [CodeStyle][W291] trim trailing whitespace in python file (#45937) * trim trailing whitespace * fix `.cmake-format.py` * revert npu ut changes, avoid npu ci error --- .cmake-format.py | 6 +- .../generate_file_structures.py | 6 +- .../generator/eager_gen.py | 6 +- .../inference/api/demo_ci/untar_model.py | 2 +- .../api/full_pascalvoc_test_preprocess.py | 78 +-- paddle/infrt/tests/models/abs_model.py | 12 +- .../tests/models/efficientnet-b4/model.py | 11 +- .../models/efficientnet-b4/net/__init__.py | 6 +- .../efficientnet-b4/net/efficientnet.py | 120 +++-- .../tests/models/efficientnet-b4/net/utils.py | 70 +-- paddle/infrt/tests/models/linear.py | 15 +- paddle/infrt/tests/models/resnet50_model.py | 11 +- paddle/phi/api/yaml/generator/api_base.py | 24 +- paddle/scripts/conda_build.py | 18 +- python/paddle/amp/auto_cast.py | 28 +- python/paddle/amp/grad_scaler.py | 94 ++-- python/paddle/audio/functional/functional.py | 2 +- python/paddle/autograd/backward_mode.py | 6 +- python/paddle/autograd/py_layer.py | 68 +-- python/paddle/batch.py | 16 +- python/paddle/compat.py | 8 +- python/paddle/device/__init__.py | 16 +- python/paddle/device/cuda/__init__.py | 50 +- .../distributed/auto_parallel/cluster_v2.py | 8 +- .../distributed/auto_parallel/completion.py | 2 +- .../distributed/auto_parallel/converter.py | 10 +- .../distributed/auto_parallel/cost_model.py | 12 +- .../distributed/auto_parallel/dist_tensor.py | 6 +- .../distributed/auto_parallel/interface.py | 14 +- .../auto_parallel/operators/common.py | 22 +- .../distributed/auto_parallel/parallelizer.py | 4 +- .../distributed/auto_parallel/partitioner.py | 2 +- .../distributed/auto_parallel/process_mesh.py | 10 +- .../auto_parallel/process_mesh_v2.py | 8 +- .../auto_parallel/tuner/algorithms.py | 14 +- .../distributed/auto_parallel/tuner/config.py | 2 +- .../auto_parallel/tuner/optimization_tuner.py | 10 +- .../paddle/distributed/auto_parallel/utils.py | 44 +- python/paddle/distributed/collective.py | 60 +-- .../fleet/base/distributed_strategy.py | 78 +-- .../fleet/base/private_helper_function.py | 4 +- .../distributed/fleet/base/role_maker.py | 4 +- .../fleet/base/strategy_compiler.py | 4 +- .../paddle/distributed/fleet/base/topology.py | 2 +- .../distributed/fleet/base/util_factory.py | 2 +- .../fleet/data_generator/data_generator.py | 12 +- .../distributed/fleet/dataset/dataset.py | 66 +-- .../distributed/fleet/elastic/manager.py | 2 +- python/paddle/distributed/fleet/fleet.py | 26 +- .../distributed/fleet/fleet_executor_utils.py | 2 +- python/paddle/distributed/fleet/launch.py | 32 +- .../dygraph_sharding_optimizer.py | 6 +- .../sharding_optimizer_stage2.py | 2 +- .../fleet/meta_optimizers/sharding/utils.py | 4 +- .../meta_optimizers/sharding_optimizer.py | 8 +- .../parallel_layers/pp_layers.py | 18 +- .../fleet/meta_parallel/pp_utils/utils.py | 2 +- .../group_sharded_optimizer_stage2.py | 2 +- .../sharding/group_sharded_stage2.py | 4 +- .../sharding/group_sharded_stage3.py | 6 +- .../meta_parallel/sharding/sharding_stage2.py | 4 +- .../meta_parallel/sharding/sharding_stage3.py | 4 +- .../distributed/fleet/metrics/metric.py | 2 +- python/paddle/distributed/fleet/utils/fs.py | 26 +- .../fleet/utils/hybrid_parallel_inference.py | 46 +- .../distributed/fleet/utils/recompute.py | 14 +- python/paddle/distributed/launch/main.py | 30 +- python/paddle/distributed/models/moe/utils.py | 16 +- 
python/paddle/distributed/parallel.py | 6 +- ...uto_parallel_data_parallel_optimization.py | 6 +- .../distributed/passes/auto_parallel_fp16.py | 2 +- .../passes/auto_parallel_recompute.py | 2 +- .../paddle/distributed/passes/pass_utils.py | 4 +- .../distributed/passes/ps_trainer_pass.py | 2 +- python/paddle/distributed/ps/utils/public.py | 4 +- .../distributed/sharding/group_sharded.py | 6 +- python/paddle/distributed/utils.py | 26 +- python/paddle/distribution/beta.py | 30 +- python/paddle/distribution/categorical.py | 10 +- python/paddle/distribution/dirichlet.py | 20 +- python/paddle/distribution/distribution.py | 30 +- .../paddle/distribution/exponential_family.py | 16 +- python/paddle/distribution/independent.py | 6 +- python/paddle/distribution/kl.py | 12 +- python/paddle/distribution/multinomial.py | 26 +- python/paddle/distribution/normal.py | 4 +- python/paddle/distribution/transform.py | 122 ++--- .../distribution/transformed_distribution.py | 10 +- python/paddle/distribution/variable.py | 6 +- python/paddle/fft.py | 486 +++++++++--------- python/paddle/fluid/average.py | 8 +- python/paddle/fluid/backward.py | 22 +- python/paddle/fluid/clip.py | 90 ++-- python/paddle/fluid/compiler.py | 82 +-- .../paddle/fluid/contrib/layers/metric_op.py | 2 +- python/paddle/fluid/contrib/layers/nn.py | 90 ++-- .../paddle/fluid/contrib/layers/rnn_impl.py | 24 +- .../fluid/contrib/mixed_precision/amp_nn.py | 26 +- .../contrib/mixed_precision/bf16/decorator.py | 32 +- .../contrib/mixed_precision/decorator.py | 56 +- python/paddle/fluid/contrib/model_stat.py | 6 +- python/paddle/fluid/contrib/optimizer.py | 6 +- .../slim/quantization/imperative/ptq.py | 8 +- .../quantization/imperative/ptq_config.py | 2 +- .../slim/quantization/imperative/qat.py | 22 +- .../post_training_quantization.py | 100 ++-- .../slim/quantization/quantization_pass.py | 46 +- .../quantization/quantize_transpiler_v2.py | 10 +- python/paddle/fluid/contrib/sparsity/asp.py | 48 +- .../contrib/sparsity/supported_layer_list.py | 4 +- python/paddle/fluid/contrib/sparsity/utils.py | 26 +- python/paddle/fluid/data.py | 6 +- python/paddle/fluid/data_feed_desc.py | 4 +- python/paddle/fluid/data_feeder.py | 80 +-- .../paddle/fluid/dataloader/batch_sampler.py | 38 +- python/paddle/fluid/dataloader/collate.py | 12 +- python/paddle/fluid/dataloader/dataset.py | 46 +- python/paddle/fluid/dataloader/sampler.py | 28 +- python/paddle/fluid/dataset.py | 32 +- python/paddle/fluid/distributed/downpour.py | 2 +- python/paddle/fluid/distributed/fleet.py | 2 +- python/paddle/fluid/distributed/helper.py | 2 +- python/paddle/fluid/distributed/node.py | 16 +- .../paddle/fluid/distributed/ps_instance.py | 10 +- python/paddle/fluid/dygraph/amp/auto_cast.py | 28 +- .../paddle/fluid/dygraph/amp/loss_scaler.py | 26 +- python/paddle/fluid/dygraph/base.py | 78 +-- python/paddle/fluid/dygraph/checkpoint.py | 26 +- .../dygraph_to_static/base_transformer.py | 12 +- .../break_continue_transformer.py | 12 +- .../dygraph_to_static/convert_operators.py | 6 +- .../dygraph_to_static/function_spec.py | 8 +- .../dygraph_to_static/ifelse_transformer.py | 2 +- .../dygraph_to_static/partial_program.py | 2 +- .../dygraph_to_static/program_translator.py | 22 +- .../dygraph_to_static/static_analysis.py | 4 +- .../fluid/dygraph/dygraph_to_static/utils.py | 40 +- .../dygraph_to_static/variable_trans_func.py | 2 +- python/paddle/fluid/dygraph/io.py | 20 +- python/paddle/fluid/dygraph/jit.py | 4 +- python/paddle/fluid/dygraph/layers.py | 6 +- 
.../fluid/dygraph/learning_rate_scheduler.py | 168 +++--- python/paddle/fluid/dygraph/nn.py | 92 ++-- python/paddle/fluid/dygraph/parallel.py | 78 +-- python/paddle/fluid/dygraph/rnn.py | 12 +- python/paddle/fluid/dygraph/static_runner.py | 2 +- python/paddle/fluid/dygraph/tracer.py | 8 +- python/paddle/fluid/dygraph_utils.py | 4 +- python/paddle/fluid/evaluator.py | 2 +- python/paddle/fluid/executor.py | 54 +- python/paddle/fluid/framework.py | 158 +++--- python/paddle/fluid/generator.py | 2 +- .../fluid/incubate/data_generator/__init__.py | 20 +- .../parameter_server/ir/heter_trainer_pass.py | 2 +- .../fleet/parameter_server/ir/pserver_pass.py | 2 +- .../fleet/parameter_server/ir/trainer_pass.py | 12 +- .../fleet/parameter_server/pslib/__init__.py | 2 +- .../fluid/incubate/fleet/utils/fleet_util.py | 4 +- .../paddle/fluid/incubate/fleet/utils/hdfs.py | 4 +- python/paddle/fluid/initializer.py | 18 +- python/paddle/fluid/input.py | 48 +- python/paddle/fluid/install_check.py | 2 +- python/paddle/fluid/io.py | 14 +- python/paddle/fluid/layers/control_flow.py | 102 ++-- python/paddle/fluid/layers/detection.py | 438 ++++++++-------- python/paddle/fluid/layers/distributions.py | 16 +- python/paddle/fluid/layers/io.py | 48 +- .../fluid/layers/layer_function_generator.py | 2 +- .../fluid/layers/learning_rate_scheduler.py | 50 +- python/paddle/fluid/layers/loss.py | 182 +++---- python/paddle/fluid/layers/math_op_patch.py | 12 +- python/paddle/fluid/layers/metric_op.py | 4 +- python/paddle/fluid/layers/nn.py | 180 +++---- python/paddle/fluid/layers/ops.py | 58 +-- python/paddle/fluid/layers/rnn.py | 190 +++---- python/paddle/fluid/layers/sequence_lod.py | 102 ++-- python/paddle/fluid/layers/tensor.py | 38 +- python/paddle/fluid/layers/utils.py | 16 +- python/paddle/fluid/lazy_init.py | 4 +- python/paddle/fluid/metrics.py | 70 +-- python/paddle/fluid/nets.py | 26 +- python/paddle/fluid/optimizer.py | 250 ++++----- python/paddle/fluid/param_attr.py | 26 +- python/paddle/fluid/reader.py | 234 ++++----- python/paddle/fluid/regularizer.py | 38 +- .../tests/custom_op/test_multi_out_jit.py | 2 +- .../test_auto_parallel_relaunch.py | 10 +- .../unittests/auto_parallel/test_cluster.py | 2 +- .../test_autograd_functional_static.py | 6 +- .../fluid/tests/unittests/autograd/utils.py | 6 +- .../fleet/parallel_dygraph_transformer.py | 2 +- .../test_imperative_auto_mixed_precision.py | 2 +- ...perative_auto_mixed_precision_for_eager.py | 2 +- .../tests/unittests/distribution/mock_data.py | 2 +- .../dygraph_to_static/predictor_utils.py | 2 +- .../dygraph_to_static/simnet_dygraph_model.py | 4 +- .../simnet_dygraph_model_v2.py | 4 +- .../test_closure_analysis.py | 2 +- .../dygraph_to_static/test_multi_forward.py | 8 +- .../fluid/tests/unittests/gradient_checker.py | 14 +- .../unittests/ir/inference/auto_scan_test.py | 4 +- .../ir/inference/inference_pass_test.py | 24 +- .../unittests/ir/inference/program_config.py | 2 +- .../ir/inference/quant_dequant_test.py | 16 +- .../test_conv_act_mkldnn_fuse_pass.py | 4 +- .../test_conv_bias_mkldnn_fuse_pass.py | 2 +- ...est_conv_elementwise_add2_act_fuse_pass.py | 4 +- ...test_conv_elementwise_add_act_fuse_pass.py | 2 +- .../test_conv_eltwiseadd_bn_fuse_pass.py | 4 +- .../test_conv_transpose_bn_fuse_pass.py | 2 +- ..._conv_transpose_eltwiseadd_bn_fuse_pass.py | 4 +- ...test_fc_elementwise_layernorm_fuse_pass.py | 4 +- .../test_flatten2_matmul_fuse_pass.py | 8 +- .../test_layernorm_shift_partition_pass.py | 4 +- .../inference/test_map_matmul_to_mul_pass.py | 2 +- 
.../test_map_matmul_v2_to_matmul_pass.py | 2 +- .../test_map_matmul_v2_to_mul_pass.py | 2 +- .../inference/test_matmul_scale_fuse_pass.py | 2 +- .../test_matmul_v2_scale_fuse_pass.py | 2 +- .../test_mkldnn_depthwise_conv_pass.py | 4 +- .../test_reshape2_matmul_fuse_pass.py | 8 +- .../test_squeeze2_matmul_fuse_pass.py | 8 +- ...test_transpose_flatten_concat_fuse_pass.py | 4 +- .../test_trt_flatten2_matmul_fuse_pass.py | 8 +- .../inference/test_trt_multiclass_nms3_op.py | 4 +- .../test_trt_reshape2_matmul_fuse_pass.py | 8 +- .../test_trt_squeeze2_matmul_fuse_pass.py | 8 +- .../test_unsqueeze2_eltwise_fuse_pass.py | 8 +- .../mkldnn/test_mul_int8_mkldnn_op.py | 6 +- .../unittests/mlu/test_tril_triu_op_mlu.py | 2 +- .../paddle/fluid/tests/unittests/op_test.py | 4 +- .../tests/unittests/ps/ps_dnn_trainer.py | 6 +- .../unittests/test_auto_parallel_cluster.py | 2 +- .../unittests/test_auto_parallel_mapper.py | 20 +- .../paddle/fluid/tests/unittests/test_cond.py | 2 +- .../tests/unittests/test_cuda_stream_event.py | 4 +- .../unittests/test_deprecated_decorator.py | 8 +- .../tests/unittests/test_dist_fleet_base.py | 2 +- .../test_dist_sparse_tensor_load_adagrad.py | 2 +- .../test_dist_sparse_tensor_load_adam.py | 2 +- .../test_dist_sparse_tensor_load_ftrl.py | 2 +- .../test_dist_sparse_tensor_load_momentum.py | 2 +- .../test_dist_sparse_tensor_load_rmsprop.py | 2 +- .../test_dist_sparse_tensor_load_sgd.py | 2 +- .../test_eager_deletion_padding_rnn.py | 2 +- .../fluid/tests/unittests/test_einsum_v2.py | 4 +- .../unittests/test_faster_tokenizer_op.py | 10 +- .../unittests/test_generate_proposals_op.py | 2 +- .../tests/unittests/test_group_norm_op.py | 4 +- .../tests/unittests/test_launch_coverage.py | 24 +- .../fluid/tests/unittests/test_lu_op.py | 4 +- .../tests/unittests/test_lu_unpack_op.py | 4 +- .../fluid/tests/unittests/test_prune.py | 14 +- .../fluid/tests/unittests/test_svd_op.py | 22 +- .../tests/unittests/test_tril_triu_op.py | 2 +- .../unittests/tokenizer/bert_tokenizer.py | 16 +- .../unittests/tokenizer/tokenizer_utils.py | 10 +- .../xpu/test_generate_proposals_v2_op_xpu.py | 2 +- .../fluid/transpiler/distribute_transpiler.py | 12 +- python/paddle/fluid/unique_name.py | 28 +- python/paddle/fluid/variable_index.py | 2 +- python/paddle/framework/dtype.py | 2 +- python/paddle/framework/framework.py | 6 +- python/paddle/framework/io.py | 72 +-- python/paddle/framework/random.py | 4 +- python/paddle/geometric/math.py | 18 +- .../geometric/message_passing/send_recv.py | 32 +- python/paddle/geometric/reindex.py | 26 +- python/paddle/geometric/sampling/neighbors.py | 16 +- python/paddle/hapi/callbacks.py | 46 +- python/paddle/hapi/dynamic_flops.py | 6 +- python/paddle/hapi/hub.py | 10 +- python/paddle/hapi/model.py | 70 +-- python/paddle/hapi/model_summary.py | 12 +- python/paddle/hapi/static_flops.py | 2 +- python/paddle/incubate/autograd/functional.py | 76 +-- python/paddle/incubate/autograd/primapi.py | 16 +- python/paddle/incubate/autograd/primreg.py | 30 +- python/paddle/incubate/autograd/primx.py | 38 +- python/paddle/incubate/autograd/utils.py | 18 +- .../distributed/models/moe/grad_clip.py | 20 +- .../distributed/models/moe/moe_layer.py | 12 +- .../nn/functional/fused_matmul_bias.py | 40 +- .../paddle/incubate/nn/layer/fused_linear.py | 8 +- .../incubate/operators/graph_khop_sampler.py | 18 +- .../incubate/operators/graph_reindex.py | 24 +- .../operators/graph_sample_neighbors.py | 22 +- .../incubate/operators/graph_send_recv.py | 16 +- .../incubate/optimizer/functional/bfgs.py | 10 +- 
.../incubate/optimizer/functional/lbfgs.py | 10 +- .../optimizer/functional/line_search.py | 26 +- .../incubate/optimizer/functional/utils.py | 8 +- python/paddle/incubate/optimizer/lookahead.py | 20 +- .../paddle/incubate/optimizer/modelaverage.py | 22 +- python/paddle/incubate/sparse/binary.py | 60 +-- python/paddle/incubate/sparse/creation.py | 44 +- python/paddle/incubate/sparse/multiary.py | 12 +- .../sparse/nn/functional/activation.py | 22 +- .../incubate/sparse/nn/functional/conv.py | 62 +-- .../incubate/sparse/nn/functional/pooling.py | 4 +- .../sparse/nn/functional/transformer.py | 22 +- .../incubate/sparse/nn/layer/activation.py | 18 +- .../paddle/incubate/sparse/nn/layer/conv.py | 24 +- .../paddle/incubate/sparse/nn/layer/norm.py | 26 +- python/paddle/incubate/sparse/unary.py | 62 +-- python/paddle/incubate/tensor/math.py | 18 +- python/paddle/metric/metrics.py | 78 +-- python/paddle/nn/functional/activation.py | 56 +- python/paddle/nn/functional/common.py | 112 ++-- python/paddle/nn/functional/conv.py | 164 +++--- python/paddle/nn/functional/extension.py | 12 +- python/paddle/nn/functional/input.py | 4 +- python/paddle/nn/functional/loss.py | 166 +++--- python/paddle/nn/functional/pooling.py | 80 +-- .../paddle/nn/functional/sparse_attention.py | 78 +-- python/paddle/nn/functional/vision.py | 50 +- python/paddle/nn/initializer/assign.py | 2 +- python/paddle/nn/initializer/dirac.py | 16 +- python/paddle/nn/initializer/kaiming.py | 2 +- python/paddle/nn/initializer/normal.py | 2 +- python/paddle/nn/initializer/orthogonal.py | 8 +- python/paddle/nn/initializer/uniform.py | 2 +- python/paddle/nn/layer/activation.py | 12 +- python/paddle/nn/layer/common.py | 20 +- python/paddle/nn/layer/container.py | 14 +- python/paddle/nn/layer/conv.py | 64 +-- python/paddle/nn/layer/loss.py | 106 ++-- python/paddle/nn/layer/norm.py | 68 +-- python/paddle/nn/layer/pooling.py | 34 +- python/paddle/nn/layer/rnn.py | 294 +++++------ python/paddle/nn/layer/transformer.py | 206 ++++---- python/paddle/nn/layer/vision.py | 16 +- python/paddle/nn/quant/quant_layers.py | 4 +- python/paddle/nn/utils/spectral_norm_hook.py | 8 +- .../paddle/nn/utils/transform_parameters.py | 4 +- python/paddle/nn/utils/weight_norm_hook.py | 22 +- python/paddle/onnx/export.py | 16 +- python/paddle/optimizer/adadelta.py | 6 +- python/paddle/optimizer/adagrad.py | 10 +- python/paddle/optimizer/adam.py | 6 +- python/paddle/optimizer/adamax.py | 12 +- python/paddle/optimizer/adamw.py | 10 +- python/paddle/optimizer/lamb.py | 2 +- python/paddle/optimizer/lr.py | 2 +- python/paddle/optimizer/momentum.py | 6 +- python/paddle/optimizer/optimizer.py | 56 +- python/paddle/optimizer/rmsprop.py | 22 +- python/paddle/optimizer/sgd.py | 4 +- python/paddle/profiler/profiler.py | 14 +- python/paddle/profiler/timer.py | 2 +- python/paddle/profiler/utils.py | 2 +- python/paddle/reader/__init__.py | 2 +- python/paddle/reader/decorator.py | 90 ++-- python/paddle/regularizer.py | 30 +- python/paddle/signal.py | 44 +- python/paddle/static/nn/common.py | 4 +- python/paddle/tensor/attribute.py | 2 +- python/paddle/tensor/creation.py | 130 ++--- python/paddle/tensor/einsum.py | 66 +-- .../paddle/tensor/layer_function_generator.py | 2 +- python/paddle/tensor/linalg.py | 126 ++--- python/paddle/tensor/logic.py | 30 +- python/paddle/tensor/manipulation.py | 272 +++++----- python/paddle/tensor/math.py | 274 +++++----- python/paddle/tensor/ops.py | 12 +- python/paddle/tensor/random.py | 42 +- python/paddle/tensor/search.py | 116 ++--- 
python/paddle/tensor/stat.py | 10 +- python/paddle/tensor/to_string.py | 6 +- python/paddle/text/datasets/conll05.py | 8 +- python/paddle/text/datasets/uci_housing.py | 2 +- python/paddle/text/datasets/wmt14.py | 8 +- python/paddle/text/datasets/wmt16.py | 4 +- python/paddle/text/viterbi_decode.py | 18 +- .../utils/cpp_extension/cpp_extension.py | 64 +-- .../utils/cpp_extension/extension_utils.py | 12 +- python/paddle/utils/deprecated.py | 8 +- python/paddle/utils/dlpack.py | 8 +- python/paddle/vision/datasets/flowers.py | 2 +- python/paddle/vision/datasets/folder.py | 2 +- python/paddle/vision/datasets/mnist.py | 8 +- python/paddle/vision/image.py | 18 +- python/paddle/vision/models/alexnet.py | 4 +- python/paddle/vision/models/densenet.py | 2 +- python/paddle/vision/models/googlenet.py | 6 +- python/paddle/vision/models/inceptionv3.py | 2 +- python/paddle/vision/models/lenet.py | 2 +- python/paddle/vision/models/mobilenetv1.py | 4 +- python/paddle/vision/models/mobilenetv2.py | 4 +- python/paddle/vision/models/mobilenetv3.py | 4 +- python/paddle/vision/models/resnet.py | 14 +- python/paddle/vision/models/shufflenetv2.py | 2 +- python/paddle/vision/models/squeezenet.py | 2 +- python/paddle/vision/models/vgg.py | 10 +- python/paddle/vision/ops.py | 214 ++++---- python/paddle/vision/transforms/functional.py | 104 ++-- .../vision/transforms/functional_cv2.py | 50 +- .../vision/transforms/functional_pil.py | 60 +-- .../vision/transforms/functional_tensor.py | 56 +- python/paddle/vision/transforms/transforms.py | 240 ++++----- r/example/mobilenet.py | 6 +- tools/check_ut.py | 6 +- tools/codestyle/test_docstring_checker.py | 18 +- tools/count_api_without_core_ops.py | 4 +- tools/coverage/coverage_diff.py | 4 +- tools/coverage/coverage_diff_list.py | 4 +- tools/coverage/pull_request.py | 4 +- tools/diff_api.py | 2 +- tools/diff_unittest.py | 2 +- tools/final_ut_parallel_rule.py | 2 +- tools/gen_ut_cmakelists.py | 6 +- tools/jetson_infer_op.py | 2 +- tools/parallel_UT_rule.py | 2 +- tools/prune_for_jetson.py | 2 +- tools/remove_grad_op_and_kernel.py | 2 +- tools/sampcd_processor.py | 12 +- 416 files changed, 5999 insertions(+), 5980 deletions(-) diff --git a/.cmake-format.py b/.cmake-format.py index 62f5651fb1c..8527ad66f0d 100644 --- a/.cmake-format.py +++ b/.cmake-format.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py index 431bbdea7e0..19ea5982f58 100644 --- a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py +++ b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py @@ -22,7 +22,7 @@ def GenerateFileStructureForFinalDygraph(eager_dir): |- generated | |- CMakeLists.txt | | "add_subdirectory(forwards), add_subdirectory(backwards)" - | + | | |- forwards | |- "dygraph_functions.cc" | |- "dygraph_functions.h" @@ -59,7 +59,7 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): |- generated | |- CMakeLists.txt | | "add_subdirectory(forwards), add_subdirectory(nodes)" - | + | | |- forwards | |- "dygraph_forward_functions.cc" | |- CMakeLists.txt @@ -70,7 +70,7 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): | |- "nodes.h" | |- CMakeLists.txt | | "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps})" - | + | | |- dygraph_forward_api.h """ # Directory Generation diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 100dfd57405..d16ed62f5bf 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -403,9 +403,9 @@ LAYOUT_LOGIC_TEMPLATE=\ if (paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune()) {{ VLOG(5) << "Check and Prepare For LAYOUT"; paddle::small_vector, egr::kSlotSmallVectorSize> tensors_vector = {}; - {} {} - paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune(); + {} + paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune(); {} {} paddle::imperative::LayoutAutoTune::Instance().EnableLayoutAutoTune(); @@ -1772,7 +1772,7 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase): autograd_api = self.grad_api_contents['invoke'].replace( forward_api_name, forward_api_name + '_dygraph_function', 1) grad_function_call_str = f""" - if (trace_backward) {{ + if (trace_backward) {{ {indent}{autograd_api_out} api_output = {autograd_api}; {out_assign_str}}} else {{ {indent}{autograd_api_out} api_output = paddle::experimental::{self.namespace}{self.grad_api_contents['invoke']}; diff --git a/paddle/fluid/inference/api/demo_ci/untar_model.py b/paddle/fluid/inference/api/demo_ci/untar_model.py index 1400f3bfc0d..ed56a4fc037 100644 --- a/paddle/fluid/inference/api/demo_ci/untar_model.py +++ b/paddle/fluid/inference/api/demo_ci/untar_model.py @@ -20,7 +20,7 @@ def untar(fname, dirs): """ extract the tar.gz file :param fname: the name of tar.gz file - :param dirs: the path of decompressed file + :param dirs: the path of decompressed file :return: bool """ try: diff --git a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py index 84c4eb7e5e8..a8671be62f2 100644 --- a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py +++ b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py @@ -106,8 +106,8 @@ def convert_pascalvoc_local2bin(args): for object in objects: bbox_sample = [] # start from 1 - bbox_sample.append( - float(label_list.index(object.find('name').text))) + bbox_sample.append(float(label_list.index( + object.find('name').text))) bbox = object.find('bndbox') difficult = float(object.find('difficult').text) 
bbox_sample.append(float(bbox.find('xmin').text) / im_width) @@ -131,7 +131,7 @@ def convert_pascalvoc_local2bin(args): f1.close() object_nums_sum = sum(object_nums) - # The data should be contains + # The data should be contains # number of images + all images data + an array that represent object numbers of each image # + labels of all objects in images + bboxes of all objects + difficulties of all objects # so the target size should be as follows: @@ -210,8 +210,8 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path): for object in objects: bbox_sample = [] - bbox_sample.append( - float(label_list.index(object.find('name').text))) + bbox_sample.append(float(label_list.index( + object.find('name').text))) bbox = object.find('bndbox') difficult = float(object.find('difficult').text) bbox_sample.append(float(bbox.find('xmin').text) / im_width) @@ -230,7 +230,7 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path): if line_idx % per_percentage: print_processbar(line_idx / per_percentage) - # The data should be stored in binary in following sequence: + # The data should be stored in binary in following sequence: # number of images->all images data->an array that represent object numbers in each image # ->labels of all objects in images->bboxes of all objects->difficulties of all objects f1.write(np.array(object_nums).astype('uint64').tobytes()) @@ -258,9 +258,9 @@ def download_pascalvoc(data_url, data_dir, tar_targethash, tar_path): def run_convert(): try_limit = 2 retry = 0 - while not (os.path.exists(DATA_OUT_PATH) and - os.path.getsize(DATA_OUT_PATH) == BIN_FULLSIZE and BIN_TARGETHASH - == hashlib.md5(open(DATA_OUT_PATH, 'rb').read()).hexdigest()): + while not (os.path.exists(DATA_OUT_PATH) and os.path.getsize(DATA_OUT_PATH) + == BIN_FULLSIZE and BIN_TARGETHASH == hashlib.md5( + open(DATA_OUT_PATH, 'rb').read()).hexdigest()): if os.path.exists(DATA_OUT_PATH): sys.stderr.write( "The existing binary file is broken. 
It is being removed...\n") @@ -275,52 +275,52 @@ def run_convert(): def main_pascalvoc_preprocess(args): parser = argparse.ArgumentParser( - description="Convert the full pascalvoc val set or local data to binary file.", + description= + "Convert the full pascalvoc val set or local data to binary file.", usage=None, add_help=True) parser.add_argument( '--local', action="store_true", help="If used, user need to set --data_dir and then convert file") - parser.add_argument( - "--data_dir", default="", type=str, help="Dataset root directory") + parser.add_argument("--data_dir", + default="", + type=str, + help="Dataset root directory") parser.add_argument( "--img_annotation_list", type=str, default="test_100.txt", - help="A file containing the image file path and corresponding annotation file path" + help= + "A file containing the image file path and corresponding annotation file path" ) parser.add_argument( "--label_file", type=str, default="label_list", - help="List of object labels with same sequence as denoted in the annotation file" + help= + "List of object labels with same sequence as denoted in the annotation file" ) - parser.add_argument( - "--output_file", - type=str, - default="pascalvoc_small.bin", - help="File path of the output binary file") - parser.add_argument( - "--resize_h", - type=int, - default=RESIZE_H, - help="Image preprocess with resize_h") - parser.add_argument( - "--resize_w", - type=int, - default=RESIZE_W, - help="Image prerocess with resize_w") - parser.add_argument( - "--mean_value", - type=str, - default=MEAN_VALUE, - help="Image preprocess with mean_value") - parser.add_argument( - "--ap_version", - type=str, - default=AP_VERSION, - help="Image preprocess with ap_version") + parser.add_argument("--output_file", + type=str, + default="pascalvoc_small.bin", + help="File path of the output binary file") + parser.add_argument("--resize_h", + type=int, + default=RESIZE_H, + help="Image preprocess with resize_h") + parser.add_argument("--resize_w", + type=int, + default=RESIZE_W, + help="Image prerocess with resize_w") + parser.add_argument("--mean_value", + type=str, + default=MEAN_VALUE, + help="Image preprocess with mean_value") + parser.add_argument("--ap_version", + type=str, + default=AP_VERSION, + help="Image preprocess with ap_version") args = parser.parse_args() if args.local: convert_pascalvoc_local2bin(args) diff --git a/paddle/infrt/tests/models/abs_model.py b/paddle/infrt/tests/models/abs_model.py index dd1632bc9d4..185aade58aa 100644 --- a/paddle/infrt/tests/models/abs_model.py +++ b/paddle/infrt/tests/models/abs_model.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -20,6 +20,7 @@ import sys class AbsNet(paddle.nn.Layer): + def __init__(self): super(AbsNet, self).__init__() @@ -32,7 +33,6 @@ if __name__ == '__main__': # build network model = AbsNet() # save inferencing format model - net = to_static( - model, input_spec=[InputSpec( - shape=[None, 1, 28, 28], name='x')]) + net = to_static(model, + input_spec=[InputSpec(shape=[None, 1, 28, 28], name='x')]) paddle.jit.save(net, sys.argv[1]) diff --git a/paddle/infrt/tests/models/efficientnet-b4/model.py b/paddle/infrt/tests/models/efficientnet-b4/model.py index c660c3a4674..0c6163f3df2 100644 --- a/paddle/infrt/tests/models/efficientnet-b4/model.py +++ b/paddle/infrt/tests/models/efficientnet-b4/model.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,7 +20,6 @@ import paddle import sys model = EfficientNet.from_name('efficientnet-b4') -net = to_static( - model, input_spec=[InputSpec( - shape=[None, 3, 256, 256], name='x')]) +net = to_static(model, + input_spec=[InputSpec(shape=[None, 3, 256, 256], name='x')]) paddle.jit.save(net, sys.argv[1]) diff --git a/paddle/infrt/tests/models/efficientnet-b4/net/__init__.py b/paddle/infrt/tests/models/efficientnet-b4/net/__init__.py index d4e557829ae..cb2b62444de 100644 --- a/paddle/infrt/tests/models/efficientnet-b4/net/__init__.py +++ b/paddle/infrt/tests/models/efficientnet-b4/net/__init__.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/infrt/tests/models/efficientnet-b4/net/efficientnet.py b/paddle/infrt/tests/models/efficientnet-b4/net/efficientnet.py index a9956fcdc88..75f6780484d 100644 --- a/paddle/infrt/tests/models/efficientnet-b4/net/efficientnet.py +++ b/paddle/infrt/tests/models/efficientnet-b4/net/efficientnet.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -38,8 +38,8 @@ class MBConvBlock(nn.Layer): self._block_args = block_args self._bn_mom = global_params.batch_norm_momentum self._bn_eps = global_params.batch_norm_epsilon - self.has_se = (self._block_args.se_ratio is not None) and ( - 0 < self._block_args.se_ratio <= 1) + self.has_se = (self._block_args.se_ratio + is not None) and (0 < self._block_args.se_ratio <= 1) self.id_skip = block_args.id_skip # skip connection and drop connect # Get static or dynamic convolution depending on image size @@ -49,13 +49,13 @@ class MBConvBlock(nn.Layer): inp = self._block_args.input_filters # number of input channels oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels if self._block_args.expand_ratio != 1: - self._expand_conv = Conv2d( - in_channels=inp, - out_channels=oup, - kernel_size=1, - bias_attr=False) - self._bn0 = nn.BatchNorm2D( - num_features=oup, momentum=self._bn_mom, epsilon=self._bn_eps) + self._expand_conv = Conv2d(in_channels=inp, + out_channels=oup, + kernel_size=1, + bias_attr=False) + self._bn0 = nn.BatchNorm2D(num_features=oup, + momentum=self._bn_mom, + epsilon=self._bn_eps) # Depthwise convolution phase k = self._block_args.kernel_size @@ -67,32 +67,31 @@ class MBConvBlock(nn.Layer): kernel_size=k, stride=s, bias_attr=False) - self._bn1 = nn.BatchNorm2D( - num_features=oup, momentum=self._bn_mom, epsilon=self._bn_eps) + self._bn1 = nn.BatchNorm2D(num_features=oup, + momentum=self._bn_mom, + epsilon=self._bn_eps) # Squeeze and Excitation layer, if desired if self.has_se: - num_squeezed_channels = max(1, - int(self._block_args.input_filters * - self._block_args.se_ratio)) - self._se_reduce = Conv2d( - in_channels=oup, - out_channels=num_squeezed_channels, - kernel_size=1) - self._se_expand = Conv2d( - in_channels=num_squeezed_channels, - out_channels=oup, - kernel_size=1) + num_squeezed_channels = max( + 1, + int(self._block_args.input_filters * self._block_args.se_ratio)) + self._se_reduce = Conv2d(in_channels=oup, + out_channels=num_squeezed_channels, + kernel_size=1) + self._se_expand = Conv2d(in_channels=num_squeezed_channels, + out_channels=oup, + kernel_size=1) # Output phase final_oup = self._block_args.output_filters - self._project_conv = Conv2d( - in_channels=oup, - out_channels=final_oup, - kernel_size=1, - bias_attr=False) - self._bn2 = nn.BatchNorm2D( - num_features=final_oup, momentum=self._bn_mom, epsilon=self._bn_eps) + self._project_conv = Conv2d(in_channels=oup, + out_channels=final_oup, + kernel_size=1, + bias_attr=False) + self._bn2 = nn.BatchNorm2D(num_features=final_oup, + momentum=self._bn_mom, + epsilon=self._bn_eps) self._swish = nn.Hardswish() def forward(self, inputs, drop_connect_rate=None): @@ -121,8 +120,9 @@ class MBConvBlock(nn.Layer): input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: if drop_connect_rate: - x = drop_connect( - x, prob=drop_connect_rate, training=self.training) + x = drop_connect(x, + prob=drop_connect_rate, + training=self.training) x = x + inputs # skip connection return x @@ -162,10 +162,14 @@ class EfficientNet(nn.Layer): in_channels = 3 # rgb out_channels = round_filters( 32, self._global_params) # number of output channels - self._conv_stem = Conv2d( - in_channels, out_channels, kernel_size=3, stride=2, bias_attr=False) - self._bn0 = nn.BatchNorm2D( - num_features=out_channels, momentum=bn_mom, epsilon=bn_eps) + self._conv_stem = Conv2d(in_channels, + 
out_channels, + kernel_size=3, + stride=2, + bias_attr=False) + self._bn0 = nn.BatchNorm2D(num_features=out_channels, + momentum=bn_mom, + epsilon=bn_eps) # Build blocks self._blocks = nn.LayerList([]) @@ -186,16 +190,19 @@ class EfficientNet(nn.Layer): block_args = block_args._replace( input_filters=block_args.output_filters, stride=1) for _ in range(block_args.num_repeat - 1): - self._blocks.append( - MBConvBlock(block_args, self._global_params)) + self._blocks.append(MBConvBlock(block_args, + self._global_params)) # Head in_channels = block_args.output_filters # output of final block out_channels = round_filters(1280, self._global_params) - self._conv_head = Conv2d( - in_channels, out_channels, kernel_size=1, bias_attr=False) - self._bn1 = nn.BatchNorm2D( - num_features=out_channels, momentum=bn_mom, epsilon=bn_eps) + self._conv_head = Conv2d(in_channels, + out_channels, + kernel_size=1, + bias_attr=False) + self._bn1 = nn.BatchNorm2D(num_features=out_channels, + momentum=bn_mom, + epsilon=bn_eps) # Final linear layer self._avg_pooling = nn.AdaptiveAvgPool2D(1) @@ -253,20 +260,21 @@ class EfficientNet(nn.Layer): advprop=False, num_classes=1000, in_channels=3): - model = cls.from_name( - model_name, override_params={'num_classes': num_classes}) - load_pretrained_weights( - model, model_name, load_fc=(num_classes == 1000), advprop=advprop) + model = cls.from_name(model_name, + override_params={'num_classes': num_classes}) + load_pretrained_weights(model, + model_name, + load_fc=(num_classes == 1000), + advprop=advprop) if in_channels != 3: Conv2d = get_same_padding_conv2d( image_size=model._global_params.image_size) out_channels = round_filters(32, model._global_params) - model._conv_stem = Conv2d( - in_channels, - out_channels, - kernel_size=3, - stride=2, - bias_attr=False) + model._conv_stem = Conv2d(in_channels, + out_channels, + kernel_size=3, + stride=2, + bias_attr=False) return model @classmethod @@ -280,5 +288,5 @@ class EfficientNet(nn.Layer): """ Validates model name. """ valid_models = ['efficientnet-b' + str(i) for i in range(9)] if model_name not in valid_models: - raise ValueError('model_name should be one of: ' + ', '.join( - valid_models)) + raise ValueError('model_name should be one of: ' + + ', '.join(valid_models)) diff --git a/paddle/infrt/tests/models/efficientnet-b4/net/utils.py b/paddle/infrt/tests/models/efficientnet-b4/net/utils.py index 3bf8b4eb730..f4a39b47882 100644 --- a/paddle/infrt/tests/models/efficientnet-b4/net/utils.py +++ b/paddle/infrt/tests/models/efficientnet-b4/net/utils.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -96,15 +96,14 @@ class Conv2dDynamicSamePadding(nn.Conv2D): dilation=1, groups=1, bias_attr=None): - super().__init__( - in_channels, - out_channels, - kernel_size, - stride, - 0, - dilation, - groups, - bias_attr=bias_attr) + super().__init__(in_channels, + out_channels, + kernel_size, + stride, + 0, + dilation, + groups, + bias_attr=bias_attr) self.stride = self._stride if len( self._stride) == 2 else [self._stride[0]] * 2 @@ -113,10 +112,12 @@ class Conv2dDynamicSamePadding(nn.Conv2D): kh, kw = self.weight.shape[-2:] sh, sw = self.stride oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) - pad_h = max((oh - 1) * self.stride[0] + - (kh - 1) * self._dilation[0] + 1 - ih, 0) - pad_w = max((ow - 1) * self.stride[1] + - (kw - 1) * self._dilation[1] + 1 - iw, 0) + pad_h = max( + (oh - 1) * self.stride[0] + (kh - 1) * self._dilation[0] + 1 - ih, + 0) + pad_w = max( + (ow - 1) * self.stride[1] + (kw - 1) * self._dilation[1] + 1 - iw, + 0) if pad_h > 0 or pad_w > 0: x = F.pad(x, [ pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 @@ -142,15 +143,18 @@ class Conv2dStaticSamePadding(nn.Conv2D): # Calculate padding based on image size and save it assert image_size is not None - ih, iw = image_size if type( - image_size) == list else [image_size, image_size] + ih, iw = image_size if type(image_size) == list else [ + image_size, image_size + ] kh, kw = self.weight.shape[-2:] sh, sw = self.stride oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) - pad_h = max((oh - 1) * self.stride[0] + - (kh - 1) * self._dilation[0] + 1 - ih, 0) - pad_w = max((ow - 1) * self.stride[1] + - (kw - 1) * self._dilation[1] + 1 - iw, 0) + pad_h = max( + (oh - 1) * self.stride[0] + (kh - 1) * self._dilation[0] + 1 - ih, + 0) + pad_w = max( + (ow - 1) * self.stride[1] + (kw - 1) * self._dilation[1] + 1 - iw, + 0) if pad_h > 0 or pad_w > 0: self.static_padding = nn.Pad2D([ pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 @@ -166,6 +170,7 @@ class Conv2dStaticSamePadding(nn.Conv2D): class Identity(nn.Layer): + def __init__(self, ): super().__init__() @@ -225,9 +230,12 @@ class BlockDecoder(object): def _encode_block_string(block): """Encodes a block to a string.""" args = [ - 'r%d' % block.num_repeat, 'k%d' % block.kernel_size, 's%d%d' % - (block.strides[0], block.strides[1]), 'e%s' % block.expand_ratio, - 'i%d' % block.input_filters, 'o%d' % block.output_filters + 'r%d' % block.num_repeat, + 'k%d' % block.kernel_size, + 's%d%d' % (block.strides[0], block.strides[1]), + 'e%s' % block.expand_ratio, + 'i%d' % block.input_filters, + 'o%d' % block.output_filters ] if 0 < block.se_ratio <= 1: args.append('se%s' % block.se_ratio) @@ -291,7 +299,8 @@ def efficientnet(width_coefficient=None, depth_coefficient=depth_coefficient, depth_divisor=8, min_depth=None, - image_size=image_size, ) + image_size=image_size, + ) return blocks_args, global_params @@ -300,11 +309,10 @@ def get_model_params(model_name, override_params): """ Get the block args and global params for a given model """ if model_name.startswith('efficientnet'): w, d, s, p = efficientnet_params(model_name) - blocks_args, global_params = efficientnet( - width_coefficient=w, - depth_coefficient=d, - dropout_rate=p, - image_size=s) + blocks_args, global_params = efficientnet(width_coefficient=w, + depth_coefficient=d, + dropout_rate=p, + image_size=s) else: raise NotImplementedError('model name is not pre-defined: %s' % model_name) diff --git a/paddle/infrt/tests/models/linear.py b/paddle/infrt/tests/models/linear.py index 602e067365b..1a6c6f095c7 
100644 --- a/paddle/infrt/tests/models/linear.py +++ b/paddle/infrt/tests/models/linear.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,6 +28,7 @@ CLASS_NUM = 10 # define a random dataset class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): self.num_samples = num_samples @@ -41,6 +42,7 @@ class RandomDataset(paddle.io.Dataset): class LinearNet(nn.Layer): + def __init__(self): super(LinearNet, self).__init__() self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) @@ -69,8 +71,11 @@ adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) # create data loader dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) -loader = paddle.io.DataLoader( - dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2) +loader = paddle.io.DataLoader(dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) # train train(layer, loader, loss_fn, adam) diff --git a/paddle/infrt/tests/models/resnet50_model.py b/paddle/infrt/tests/models/resnet50_model.py index 6edd75116e8..af45de0c8a6 100644 --- a/paddle/infrt/tests/models/resnet50_model.py +++ b/paddle/infrt/tests/models/resnet50_model.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -19,7 +19,6 @@ from paddle.static import InputSpec import sys model = resnet50(True) -net = to_static( - model, input_spec=[InputSpec( - shape=[None, 3, 256, 256], name='x')]) +net = to_static(model, + input_spec=[InputSpec(shape=[None, 3, 256, 256], name='x')]) paddle.jit.save(net, sys.argv[1]) diff --git a/paddle/phi/api/yaml/generator/api_base.py b/paddle/phi/api/yaml/generator/api_base.py index cbee67aaa5c..26cb5c6de3e 100644 --- a/paddle/phi/api/yaml/generator/api_base.py +++ b/paddle/phi/api/yaml/generator/api_base.py @@ -706,29 +706,29 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d {code_indent} std::vector>> input_shapes{{""" for input_name in single_tensor_names[:-1]: if input_name in self.optional_vars: - input_tensor_code = input_tensor_code + f""" + input_tensor_code = input_tensor_code + f""" {code_indent} {{"{input_name}", {input_name}_record_shapes}},""" else: - input_tensor_code = input_tensor_code + f""" + input_tensor_code = input_tensor_code + f""" {code_indent} {{"{input_name}", {{""" input_tensors = input_name_tensor_map[input_name] for input_tensor, _ in input_tensors[:-1]: - input_tensor_code = input_tensor_code + f""" + input_tensor_code = input_tensor_code + f""" {code_indent} (*{input_tensor}).dims(),""" - input_tensor_code = input_tensor_code + f""" + input_tensor_code = input_tensor_code + f""" {code_indent} (*{input_tensors[-1][0]}).dims()}}}},""" if single_tensor_names[-1] in self.optional_vars: - input_tensor_code = input_tensor_code + f""" -{code_indent} {{"{single_tensor_names[-1]}", + input_tensor_code = input_tensor_code + f""" +{code_indent} {{"{single_tensor_names[-1]}", {code_indent} {single_tensor_names[-1]}_record_shapes}}}};""" else: - input_tensor_code = input_tensor_code + f""" + input_tensor_code = input_tensor_code + f""" {code_indent} {{"{single_tensor_names[-1]}", {{""" input_tensors = input_name_tensor_map[single_tensor_names[-1]] for input_tensor, _ in input_tensors[:-1]: - input_tensor_code = input_tensor_code + f""" + input_tensor_code = input_tensor_code + f""" {code_indent} (*{input_tensor}).dims(),""" - input_tensor_code = input_tensor_code + f""" + input_tensor_code = input_tensor_code + f""" {code_indent} (*{input_tensors[-1][0]}).dims()}}}}}};""" if list_tensor_names: input_tensor_code = input_tensor_code + f""" @@ -757,14 +757,14 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d {code_indent} ddims_vec.emplace_back((*{input_tensor_truncate}[i]).dims()); {code_indent} }}""" else: - input_tensor_code = input_tensor_code + f""" + input_tensor_code = input_tensor_code + f""" ddims_vec.emplace_back((*{input_tensor}).dims()); {code_indent} """ input_tensor_code = input_tensor_code + f""" {code_indent} input_shapes.emplace_back("{input_name}", ddims_vec);""" - input_tensor_code = input_tensor_code + f""" -{code_indent} platform::RecordOpInfoSupplement("{self.api}", input_shapes); + input_tensor_code = input_tensor_code + f""" +{code_indent} platform::RecordOpInfoSupplement("{self.api}", input_shapes); {code_indent} }}""" kernel_args = ["*dev_ctx"] for param in kernel_param: diff --git a/paddle/scripts/conda_build.py b/paddle/scripts/conda_build.py index 2fe02dc51bf..a514c8b5231 100644 --- a/paddle/scripts/conda_build.py +++ b/paddle/scripts/conda_build.py @@ -1,13 +1,13 @@ #!/bin/python # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,17 +24,17 @@ import time def parse_args(): parser = argparse.ArgumentParser("conda build for paddlepaddle version") - parser.add_argument( - "--paddle_version", - type=str, - required=True, - help="paddle version for conda build.") + parser.add_argument("--paddle_version", + type=str, + required=True, + help="paddle version for conda build.") args = parser.parse_args() return args class ConstantVar: + def __init__(self): self.build = r""" build: @@ -89,7 +89,7 @@ about: self.build_const = r""" """ - self.blt_const = r""" + self.blt_const = r""" """ self.python36 = r" - python>=3.6, <3.7" diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 4cf628abe05..ffb16486342 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -25,21 +25,21 @@ def auto_cast(enable=True, dtype='float16'): """ Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode. - If enabled, the input data type (float32 or float16) of each operator is decided - by autocast algorithm for better performance. - - Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in + If enabled, the input data type (float32 or float16) of each operator is decided + by autocast algorithm for better performance. + + Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in imperative mode. It is used together with `decorator` to achieve Pure fp16 in imperative mode. Args: enable(bool, optional): Enable auto-mixed-precision or not. Default is True. custom_white_list(set|list|tuple, optional): The custom white_list. It's the set of ops that support - fp16 calculation and are considered numerically-safe and performance-critical. These ops + fp16 calculation and are considered numerically-safe and performance-critical. These ops will be converted to fp16. custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16 - calculation and are considered numerically-dangerous and whose effects may also be + calculation and are considered numerically-dangerous and whose effects may also be observed in downstream ops. These ops will not be converted to fp16. - level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the input data type of each operator will be casted by white_list and black_list; + level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the input data type of each operator will be casted by white_list and black_list; O2 represent Pure fp16, all operators parameters and input data will be casted to fp16, except operators in black_list, don't support fp16 kernel and batchnorm. Default is O1(amp) dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. 
@@ -69,7 +69,7 @@ def auto_cast(enable=True, with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}): c = a + b print(c.dtype) # paddle.float32 - + with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}, level='O2'): d = a + b print(d.dtype) # paddle.float32 @@ -85,15 +85,15 @@ def decorate(models, master_weight=None, save_dtype=None): """ - Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing. + Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing. When level is O2(pure float16/bfloat16), the decorate will cast all parameters of models to float16/bfloat16, except BatchNorm and LayerNorm. - + Commonly, it is used together with `auto_cast` to achieve Pure float16/bfloat16 in imperative mode. Args: models(Layer|list of Layer, optional): The defined models by user, models must be either a single model or a list of models. Default is None. optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None. - level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the decorator will do nothing; + level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the decorator will do nothing; O2 represent Pure float16/bfloat16, the decorator will cast all parameters of models to float16/bfloat16, except BatchNorm and LayerNorm. Default is O1(amp) dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. @@ -102,7 +102,7 @@ def decorate(models, Examples: - .. code-block:: python + .. code-block:: python # required: gpu # Demo1: single model and optimizer: @@ -118,7 +118,7 @@ def decorate(models, with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'): output = model(data) print(output.dtype) # FP16 - + # required: gpu # Demo2: multi models and optimizers: model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False) @@ -133,7 +133,7 @@ def decorate(models, output2 = models[1](data) print(output.dtype) # FP16 print(output2.dtype) # FP16 - + # required: gpu # Demo3: optimizers is None: model3 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 46582b1770b..9fe7f71d2be 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -25,7 +25,7 @@ def _refresh_optimizer_state(): class GradScaler(AmpScaler): """ - GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode. + GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode. It controls the scaling of loss, helps avoiding numerical overflow. The object of this class has nineteen methods `scale()`, `unscale_()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters. @@ -36,19 +36,19 @@ class GradScaler(AmpScaler): `update` is used to update the loss_scaling. - Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in + Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in dynamic graph mode. 
Args: enable(bool, optional): Enable loss scaling or not. Default is True. init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15. - incr_ratio(float, optional): The multiplier to use when increasing the loss + incr_ratio(float, optional): The multiplier to use when increasing the loss scaling. Default is 2.0. - decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing + decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing the loss scaling. Default is 0.5. - incr_every_n_steps(int, optional): Increases loss scaling every n consecutive + incr_every_n_steps(int, optional): Increases loss scaling every n consecutive steps with finite gradients. Default is 1000. - decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n + decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n accumulated steps with nan or inf gradients. Default is 2. use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. Returns: @@ -57,7 +57,7 @@ class GradScaler(AmpScaler): Examples: .. code-block:: python - + import paddle model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) @@ -68,10 +68,10 @@ class GradScaler(AmpScaler): with paddle.amp.auto_cast(): conv = model(data) loss = paddle.mean(conv) - - scaled = scaler.scale(loss) # scale the loss + + scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward - scaler.minimize(optimizer, scaled) # update parameters + scaler.minimize(optimizer, scaled) # update parameters optimizer.clear_grad() """ @@ -90,18 +90,18 @@ class GradScaler(AmpScaler): def scale(self, var): """ - Multiplies a Tensor by the scale factor and returns scaled outputs. + Multiplies a Tensor by the scale factor and returns scaled outputs. If this instance of :class:`GradScaler` is not enabled, output are returned unmodified. Args: var (Tensor): The tensor to scale. Returns: The scaled tensor or original tensor. - + Examples: .. code-block:: python - + import paddle model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) @@ -113,9 +113,9 @@ class GradScaler(AmpScaler): conv = model(data) loss = paddle.mean(conv) - scaled = scaler.scale(loss) # scale the loss + scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward - scaler.minimize(optimizer, scaled) # update parameters + scaler.minimize(optimizer, scaled) # update parameters optimizer.clear_grad() """ return super(GradScaler, self).scale(var) @@ -123,7 +123,7 @@ class GradScaler(AmpScaler): def minimize(self, optimizer, *args, **kwargs): """ This function is similar as `optimizer.minimize()`, which performs parameters updating. - + If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. 
@@ -149,9 +149,9 @@ class GradScaler(AmpScaler): conv = model(data) loss = paddle.mean(conv) - scaled = scaler.scale(loss) # scale the loss + scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward - scaler.minimize(optimizer, scaled) # update parameters + scaler.minimize(optimizer, scaled) # update parameters optimizer.clear_grad() """ return super(GradScaler, self).minimize(optimizer, *args, **kwargs) @@ -159,7 +159,7 @@ class GradScaler(AmpScaler): def step(self, optimizer): """ This function is similar as `optimizer.step()`, which performs parameters updating. - + If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. @@ -169,7 +169,7 @@ class GradScaler(AmpScaler): Examples: .. code-block:: python - + # required: gpu import paddle @@ -180,7 +180,7 @@ class GradScaler(AmpScaler): with paddle.amp.auto_cast(): conv = model(data) loss = paddle.mean(conv) - scaled = scaler.scale(loss) # scale the loss + scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward scaler.step(optimizer) # update parameters scaler.update() # update the loss scaling ratio @@ -212,11 +212,11 @@ class GradScaler(AmpScaler): def update(self): """ Updates the loss_scaling. - + Examples: .. code-block:: python - + # required: gpu import paddle @@ -227,11 +227,11 @@ class GradScaler(AmpScaler): with paddle.amp.auto_cast(): conv = model(data) loss = paddle.mean(conv) - scaled = scaler.scale(loss) # scale the loss + scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward scaler.step(optimizer) # update parameters scaler.update() # update the loss scaling ratio - optimizer.clear_grad() + optimizer.clear_grad() """ if not self._enable: return @@ -242,7 +242,7 @@ class GradScaler(AmpScaler): def unscale_(self, optimizer): """ - Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio). + Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio). If this instance of :class:`GradScaler` is not enabled, output are returned unmodified. Args: @@ -250,7 +250,7 @@ class GradScaler(AmpScaler): Returns: The unscaled parameters or original parameters. - + Examples: .. code-block:: python @@ -265,12 +265,12 @@ class GradScaler(AmpScaler): with paddle.amp.auto_cast(): conv = model(data) loss = paddle.mean(conv) - scaled = scaler.scale(loss) # scale the loss + scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward scaler.unscale_(optimizer) # unscale the parameter scaler.step(optimizer) - scaler.update() - optimizer.clear_grad() + scaler.update() + optimizer.clear_grad() """ return super(GradScaler, self)._unscale(optimizer) @@ -280,7 +280,7 @@ class GradScaler(AmpScaler): Returns: bool: enable loss scaling return True else return False. - + Examples: .. code-block:: python @@ -304,11 +304,11 @@ class GradScaler(AmpScaler): Returns: bool: if fixed loss_scaling is used return False, if the loss scaling is updated dynamicly return true. - + Examples: .. code-block:: python - # required: gpu,xpu + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -328,7 +328,7 @@ class GradScaler(AmpScaler): Reurns: float: the initial loss scaling factor. - + Examples: .. 
code-block:: python @@ -352,10 +352,10 @@ class GradScaler(AmpScaler): Args: new_init_loss_scaling(float): The new_init_loss_scaling used to update initial loss scaling factor. - + Examples: .. code-block:: python - + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, @@ -378,7 +378,7 @@ class GradScaler(AmpScaler): Reurns: float: the multiplier to use when increasing the loss scaling. - + Examples: .. code-block:: python @@ -402,7 +402,7 @@ class GradScaler(AmpScaler): Args: new_incr_ratio(float): The new_incr_ratio used to update the multiplier to use when increasing the loss scaling. - + Examples: .. code-block:: python @@ -428,7 +428,7 @@ class GradScaler(AmpScaler): Reurns: float: the less-than-one-multiplier to use when decreasing the loss scaling. - + Examples: .. code-block:: python @@ -452,7 +452,7 @@ class GradScaler(AmpScaler): Args: new_decr_ratio(float): The new_decr_ratio used to update the less-than-one-multiplier to use when decreasing the loss scaling. - + Examples: .. code-block:: python @@ -478,7 +478,7 @@ class GradScaler(AmpScaler): Reurns: int: the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients. - + Examples: .. code-block:: python @@ -502,7 +502,7 @@ class GradScaler(AmpScaler): Args: new_incr_every_n_steps(int): The new_incr_every_n_steps used to update the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients. - + Examples: .. code-block:: python @@ -528,7 +528,7 @@ class GradScaler(AmpScaler): Reurns: int: the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients. - + Examples: .. code-block:: python @@ -552,7 +552,7 @@ class GradScaler(AmpScaler): Args: new_decr_every_n_nan_or_inf(int): The new_decr_every_n_nan_or_inf used to update the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients. - + Examples: .. code-block:: python @@ -588,7 +588,7 @@ class GradScaler(AmpScaler): decr_count(int): The number of recent consecutive skipped steps. use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. - + Examples: .. code-block:: python @@ -610,10 +610,10 @@ class GradScaler(AmpScaler): def load_state_dict(self, state_dict): """ Loads the scaler state. - + Args: state_dict(dict): scaler state. Should be an object returned from a call to `GradScaler.state_dict()`. - + Examples: .. code-block:: python diff --git a/python/paddle/audio/functional/functional.py b/python/paddle/audio/functional/functional.py index 26c095a6e9a..071fa6cac71 100644 --- a/python/paddle/audio/functional/functional.py +++ b/python/paddle/audio/functional/functional.py @@ -247,7 +247,7 @@ def create_dct(n_mfcc: int, """Create a discrete cosine transform(DCT) matrix. Args: - n_mfcc (int): Number of mel frequency cepstral coefficients. + n_mfcc (int): Number of mel frequency cepstral coefficients. n_mels (int): Number of mel filterbanks. norm (Optional[str], optional): Normalizaiton type. Defaults to 'ortho'. dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. 
diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index d2c2beadf38..549f859b9cc 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -24,12 +24,12 @@ __all__ = [] def backward(tensors, grad_tensors=None, retain_graph=False): """ Compute the backward gradients of given tensors. - + Args: tensors(list of Tensors): the tensors which the gradient to be computed. The tensors can not contain the same tensor. grad_tensors(list of Tensors of None, optional): the init gradients of the `tensors`` .If not None, it must have the same length with ``tensors`` , - and if any of the elements is None, then the init gradient is the default value which is filled with 1.0. + and if any of the elements is None, then the init gradient is the default value which is filled with 1.0. If None, all the gradients of the ``tensors`` is the default value which is filled with 1.0. Defaults to None. @@ -37,7 +37,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False): like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient. Defaults to False. - + Returns: NoneType: None diff --git a/python/paddle/autograd/py_layer.py b/python/paddle/autograd/py_layer.py index 673b047d5a3..6ba9acf382b 100644 --- a/python/paddle/autograd/py_layer.py +++ b/python/paddle/autograd/py_layer.py @@ -54,16 +54,16 @@ class LegacyPyLayerContext(object): def save_for_backward(self, *tensors): """ Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors. - + .. note:: - This API should be called at most once, and only inside `forward`. + This API should be called at most once, and only inside `forward`. Args: tensors(list of Tensors): Tensors to be stored. Returns: None - + Examples: .. code-block:: python @@ -94,7 +94,7 @@ class LegacyPyLayerContext(object): Get the tensors stored by ``save_for_backward``. Returns: - list of Tensors or None: If context contains tensors stored by `save_for_backward`, + list of Tensors or None: If context contains tensors stored by `save_for_backward`, then return these tensors, otherwise return None. Examples: @@ -147,7 +147,7 @@ class CPyLayer(object): Returns: tensors or other types : output of PyLayer. - + Examples: .. code-block:: python @@ -210,15 +210,15 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)): Build a custom `Layer` by creating subclasses. Subclasses need to follow the following rules: 1. Subclasses contain `forward` and `backward` function. Both forward and backward are @staticmethod. Their first argument should be a context and `None` can not be included in the returned result. - 2. Input of backward contains a context as the first argument, and the rest arguments are the - gradient of forward's output tensors. so the number of backward's input tensors equal to - the number of forward output tensors. If you need the forward's inputs or outputs in `backward`, + 2. Input of backward contains a context as the first argument, and the rest arguments are the + gradient of forward's output tensors. so the number of backward's input tensors equal to + the number of forward output tensors. If you need the forward's inputs or outputs in `backward`, you can use `save_for_backward` to store the required tensors, and then use them in the backward. 3. 
Output of backward function can only be `Tensor` or tuple/list of `Tensor`. - Output tensors of backward are the gradient of forward's input tensors, + Output tensors of backward are the gradient of forward's input tensors, so the number of backward's output tensors equal to the number of forward input tensors. After building the custom Layer, run it through the `apply` method. - + Examples: .. code-block:: python @@ -259,8 +259,8 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)): @staticmethod def forward(ctx, *args, **kwargs): """ - It is to be overloaded by subclasses. It must accept a object of `PyLayerContext` as - the first argument, followed by any number of arguments (tensors or other types). + It is to be overloaded by subclasses. It must accept a object of `PyLayerContext` as + the first argument, followed by any number of arguments (tensors or other types). `None` can not be included in the returned result. Args: @@ -269,7 +269,7 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)): Returns: tensors or other types : output of PyLayer. - + Examples: .. code-block:: python @@ -297,9 +297,9 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)): @staticmethod def backward(ctx, *args, **kwargs): """ - This is a function to calculate the gradient. It is to be overloaded by subclasses. - It must accept a object of `PyLayerContext` as the first argument, and the rest - arguments are the gradient of forward's output tensors. Output tensors of backward + This is a function to calculate the gradient. It is to be overloaded by subclasses. + It must accept a object of `PyLayerContext` as the first argument, and the rest + arguments are the gradient of forward's output tensors. Output tensors of backward are the gradient of forward's input tensors. Args: @@ -308,7 +308,7 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)): Returns: Tensor or list of Tensors: The gradient of forward's input tensor(s). - + Examples: .. code-block:: python @@ -340,16 +340,16 @@ class EagerPyLayerContext(object): def save_for_backward(self, *tensors): """ Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors. - + .. note:: - This API should be called at most once, and only inside `forward`. + This API should be called at most once, and only inside `forward`. Args: tensors(list of Tensors): Tensors to be stored. Returns: None - + Examples: .. code-block:: python @@ -380,7 +380,7 @@ class EagerPyLayerContext(object): Get the tensors stored by ``save_for_backward``. Returns: - list of Tensors or None: If context contains tensors stored by `save_for_backward`, + list of Tensors or None: If context contains tensors stored by `save_for_backward`, then return these tensors, otherwise return None. Examples: @@ -410,11 +410,11 @@ class EagerPyLayerContext(object): def mark_not_inplace(self, *args): """ Marks inputs as not inplace. - This should be called at most once, only from inside the `forward` method, + This should be called at most once, only from inside the `forward` method, and all arguments should be Tensor inputs. - If the Tensor returned by `forward` method is the same as the Tensor input of forward, - and this Tensor is marked as not_inplace, then Paddle will help the user create a new Tensor as output. + If the Tensor returned by `forward` method is the same as the Tensor input of forward, + and this Tensor is marked as not_inplace, then Paddle will help the user create a new Tensor as output. 
Thereby preventing the auto grad information of the input Tensor from being overwritten. Examples: @@ -427,7 +427,7 @@ class EagerPyLayerContext(object): def forward(ctx, x): ctx.mark_not_inplace(x) return x - + @staticmethod def backward(ctx, grad_output): out = grad_output.exp() @@ -438,7 +438,7 @@ class EagerPyLayerContext(object): attn_layers = [] for idx in range(0, 2): attn_layers.append(Exp()) - + for step in range(0, 2): a = x for j in range(0,2): @@ -450,7 +450,7 @@ class EagerPyLayerContext(object): def mark_non_differentiable(self, *args): """ Marks outputs as non-differentiable. - This should be called at most once, only from inside the `forward` method, + This should be called at most once, only from inside the `forward` method, and all arguments should be tensor outputs. This will mark outputs as not requiring gradients, increasing the @@ -564,8 +564,8 @@ class EagerPyLayer( @staticmethod def forward(ctx, *args, **kwargs): """ - It is to be overloaded by subclasses. It must accept a object of `PyLayerContext` as - the first argument, followed by any number of arguments (tensors or other types). + It is to be overloaded by subclasses. It must accept a object of `PyLayerContext` as + the first argument, followed by any number of arguments (tensors or other types). `None` can not be included in the returned result. Args: @@ -574,7 +574,7 @@ class EagerPyLayer( Returns: tensors or other types : output of PyLayer. - + Examples: .. code-block:: python @@ -602,9 +602,9 @@ class EagerPyLayer( @staticmethod def backward(ctx, *args): """ - This is a function to calculate the gradient. It is to be overloaded by subclasses. - It must accept a object of `PyLayerContext` as the first argument, and the rest - arguments are the gradient of forward's output tensors. Output tensors of backward + This is a function to calculate the gradient. It is to be overloaded by subclasses. + It must accept a object of `PyLayerContext` as the first argument, and the rest + arguments are the gradient of forward's output tensors. Output tensors of backward are the gradient of forward's input tensors. Args: @@ -613,7 +613,7 @@ class EagerPyLayer( Returns: Tensor or list of Tensors: The gradient of forward's input tensor(s). - + Examples: .. code-block:: python diff --git a/python/paddle/batch.py b/python/paddle/batch.py index f787f603f7e..b87ddc74858 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -17,30 +17,30 @@ __all__ = [] def batch(reader, batch_size, drop_last=False): """ - This operator creates a batched reader which combines the data from the + This operator creates a batched reader which combines the data from the input reader to batched data. - + Args: reader(generator): the data reader to read from. batch_size(int): size of each mini-batch. - drop_last(bool, optional): If set to True, the last batch is dropped when + drop_last(bool, optional): If set to True, the last batch is dropped when the size of last batch is not equal to batch_size, if set to False, it will not. Default: False. Returns: - The batched reader. - + The batched reader. + Return Type: - generator + generator Examples: .. 
code-block:: python - + import paddle def reader(): for i in range(10): yield i batch_reader = paddle.batch(reader, batch_size=2) - + for data in batch_reader(): print(data) diff --git a/python/paddle/compat.py b/python/paddle/compat.py index 82e6491b809..c94132e4398 100644 --- a/python/paddle/compat.py +++ b/python/paddle/compat.py @@ -25,7 +25,7 @@ long_type = int def to_text(obj, encoding='utf-8', inplace=False): """ All string in PaddlePaddle should be represented as a literal string. - + This function will convert object to a literal string without any encoding. Especially, if the object type is a list or set container, we will iterate all items in the object and convert them to literal string. @@ -43,7 +43,7 @@ def to_text(obj, encoding='utf-8', inplace=False): Returns: Decoded result of obj - + Examples: .. code-block:: python @@ -121,7 +121,7 @@ def _to_text(obj, encoding): def to_bytes(obj, encoding='utf-8', inplace=False): """ All string in PaddlePaddle should be represented as a literal string. - + This function will convert object to a bytes with specific encoding. Especially, if the object type is a list or set container, we will iterate all items in the object and convert them to bytes. @@ -140,7 +140,7 @@ def to_bytes(obj, encoding='utf-8', inplace=False): Returns: Decoded result of obj - + Examples: .. code-block:: python diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index aa959150cec..e9894309999 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -119,7 +119,7 @@ def XPUPlace(dev_id): .. code-block:: python # required: xpu - + import paddle place = paddle.device.XPUPlace(0) """ @@ -163,15 +163,15 @@ def MLUPlace(dev_id): def get_cudnn_version(): """ - This funciton return the version of cudnn. the retuen value is int which represents the + This funciton return the version of cudnn. the retuen value is int which represents the cudnn version. For example, if it return 7600, it represents the version of cudnn is 7.6. - + Returns: int: A int value which represents the cudnn version. If cudnn version is not installed, it return None. Examples: .. code-block:: python - + import paddle cudnn_version = paddle.device.get_cudnn_version() @@ -305,7 +305,7 @@ def set_device(device): Examples: .. code-block:: python - + import paddle paddle.device.set_device("cpu") @@ -322,13 +322,13 @@ def get_device(): """ This funciton can get the current global device of the program is running. It's a string which is like 'cpu', 'gpu:x', 'xpu:x', 'mlu:x' and 'npu:x'. if the global device is not - set, it will return a string which is 'gpu:x' when cuda is avaliable or it + set, it will return a string which is 'gpu:x' when cuda is avaliable or it will return a string which is 'cpu' when cuda is not avaliable. Examples: .. code-block:: python - + import paddle device = paddle.device.get_device() @@ -394,7 +394,7 @@ def get_all_custom_device_type(): """ Get all available custom device types. - Returns: + Returns: A list of all available custom device types. Examples: diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index d867f071229..dc80bb9b563 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -42,12 +42,12 @@ def current_stream(device=None): Return the current CUDA stream by the device. Parameters: - device(paddle.CUDAPlace()|int, optional): The device or the ID of the device which want to get stream from. 
+ device(paddle.CUDAPlace()|int, optional): The device or the ID of the device which want to get stream from. If device is None, the device is the current device. Default: None. - + Returns: CUDAStream: the stream to the device. - + Examples: .. code-block:: python @@ -82,7 +82,7 @@ def synchronize(device=None): Parameters: device(paddle.CUDAPlace()|int, optional): The device or the ID of the device. If device is None, the device is the current device. Default: None. - + Examples: .. code-block:: python @@ -111,7 +111,7 @@ def synchronize(device=None): def device_count(): ''' Return the number of GPUs available. - + Returns: int: the number of GPUs available. @@ -158,7 +158,7 @@ def extract_cuda_device_id(device, op_name): Return the id of the given cuda device. It is just a utility that will not be exposed to users. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device or + device(paddle.CUDAPlace or int or str): The device, the id of the device or the string name of device like 'gpu:x'. Default: None. @@ -197,12 +197,12 @@ def max_memory_allocated(device=None): Return the peak size of gpu memory that is allocated to tensor of the given device. .. note:: - The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may larger than the memory size that tensor actually need. + The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may larger than the memory size that tensor actually need. For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device or - the string name of device like 'gpu:x'. If device is None, the device is the current device. + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. Default: None. Return: @@ -232,8 +232,8 @@ def max_memory_reserved(device=None): Return the peak size of GPU memory that is held by the allocator of the given device. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device or - the string name of device like 'gpu:x'. If device is None, the device is the current device. + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. Default: None. Return: @@ -263,12 +263,12 @@ def memory_allocated(device=None): Return the current size of gpu memory that is allocated to tensor of the given device. .. note:: - The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need. - For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes. + The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need. + For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device or - the string name of device like 'gpu:x'. If device is None, the device is the current device. + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x'. 
If device is None, the device is the current device. Default: None. Return: @@ -298,14 +298,14 @@ def memory_reserved(device=None): Return the current size of GPU memory that is held by the allocator of the given device. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device or - the string name of device like 'gpu:x'. If device is None, the device is the current device. + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. Default: None. Return: int: The current size of GPU memory that is held by the allocator of the given device, in bytes. - Examples: + Examples: .. code-block:: python # required: gpu @@ -389,18 +389,18 @@ def get_device_properties(device=None): Return the properties of given device. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device or - the string name of device like 'gpu:x' which to get the properties of the - device from. If device is None, the device is the current device. + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x' which to get the properties of the + device from. If device is None, the device is the current device. Default: None. Returns: - _gpuDeviceProperties: The properties of the device which include ASCII string - identifying device, major compute capability, minor compute capability, global + _gpuDeviceProperties: The properties of the device which include ASCII string + identifying device, major compute capability, minor compute capability, global memory available and the number of multiprocessors on the device. Examples: - + .. code-block:: python # required: gpu @@ -484,7 +484,7 @@ def get_device_capability(device=None): Return the major and minor revision numbers defining the device's compute capability which are got from CUDA function `cudaDeviceProp `_. Parameters: - device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device. + device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device. Returns: tuple(int,int): the major and minor revision numbers defining the device's compute capability. diff --git a/python/paddle/distributed/auto_parallel/cluster_v2.py b/python/paddle/distributed/auto_parallel/cluster_v2.py index bdaf18ee650..29429a8ad69 100644 --- a/python/paddle/distributed/auto_parallel/cluster_v2.py +++ b/python/paddle/distributed/auto_parallel/cluster_v2.py @@ -49,14 +49,14 @@ class LinkType(IntEnum): class DeviceMesh(core.DeviceMesh): r""" - The class `DeviceMesh` describes the topology of physical devices. + The class `DeviceMesh` describes the topology of physical devices. Args: mesh (list|numpy.array): an N-dimensional array describes the toplogy of logical processes. dim_names (list, optional): the i-th element of this list gives the name of the i-th dimension. 
- + Returns: None @@ -65,9 +65,9 @@ class DeviceMesh(core.DeviceMesh): import paddle import paddle.distributed as dist - + paddle.enable_static() - + mesh = dist.DeviceMesh([[2, 4, 5], [0, 1, 3]]) assert mesh.shape == [2, 3] assert mesh.device_ids == [2, 4, 5, 0, 1, 3] diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 87122882041..c8633b4a730 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -901,7 +901,7 @@ class Completer: def _complete_high_order_grad_annotation(self, serial_main_program=None): """ - NOTE: + NOTE: [HighOrderGrad] Complete the annotation of vars and ops only for high order gradient. This function is temporary to support high order gradient, and will be removed in the future. """ diff --git a/python/paddle/distributed/auto_parallel/converter.py b/python/paddle/distributed/auto_parallel/converter.py index 69292ab1827..162ca135f37 100644 --- a/python/paddle/distributed/auto_parallel/converter.py +++ b/python/paddle/distributed/auto_parallel/converter.py @@ -21,18 +21,18 @@ from ..utils import get_logger class Converter(object): """ - Converter is a class object for auto parallel to convert tensors from - one parallel strategy to another one. Tensors will merge and slice value + Converter is a class object for auto parallel to convert tensors from + one parallel strategy to another one. Tensors will merge and slice value with their strategy when strategies are different. """ def __init__(self, tensors_dict, pre_strategy, cur_strategy): """ Args: - tensors_dict(dict): tensors' value of all ranks that to be converted. + tensors_dict(dict): tensors' value of all ranks that to be converted. key is tensor's name(str), value is all ranks' data(list(numpy.ndarray)) pre_strategy(dict): tensors' distributed attribute of last training process. - key is tensor's name(str), value is tensor's distributed attribute in last + key is tensor's name(str), value is tensor's distributed attribute in last training process. cur_strategy(dict): tensors' distributed attribute of current rank. key is tensor's name(str), value is tensor's distributed attribute in current @@ -432,7 +432,7 @@ class Converter(object): process_group = [0, 1, 2] slice_tensor = _slice_tensor(complete_tensor, [[], [], [2, 4]], 3) - # slice_tensor: + # slice_tensor: # [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])] index = _get_sliced_index(rank, complete_shape, dims_mapping diff --git a/python/paddle/distributed/auto_parallel/cost_model.py b/python/paddle/distributed/auto_parallel/cost_model.py index e35fae57cae..ac8f4d156bb 100644 --- a/python/paddle/distributed/auto_parallel/cost_model.py +++ b/python/paddle/distributed/auto_parallel/cost_model.py @@ -433,9 +433,9 @@ class CostModel(object): def merge_linear(self): r''' - This method does the following: + This method does the following: If X depends on Y only, they must be run sequentially. - [ e.g. A ->- C ->- D D and E depends on C only.] + [ e.g. A ->- C ->- D D and E depends on C only.] [ B ->-/ \->- E C depends on A and B. ] We merge X and Y into a new node and sum up their cost time. ''' @@ -453,7 +453,7 @@ class CostModel(object): r''' This method does the following: If a node has more than one successor, there is *branch*. - [ e.g. A ->- B ->- D ] + [ e.g. 
A ->- B ->- D ] [ \->- C ->- / , B and C can be run at the same time ] case 1: if B or C is null (or D is directly dependent on A), it's equivalent to A->C->D or A->B->D, fall back to self.merge_linear @@ -789,12 +789,12 @@ def estimate_cost(distributed_program, cluster, pipeline_config, standalone_cost_data, batch_size): """ Estimated cost from distributed program, cluster model and distributed settings. - + Args: distributed_program(list): list of paddle programs - cluster(Cluster): cluster model + cluster(Cluster): cluster model standalone_cost_data(CostData): cost data given by paddle.core - batch_size(int): batch size of the training workload + batch_size(int): batch size of the training workload pipeline_config(list): configuration of pipeline stage allocation """ # the following line is left for now, cluster model will be involved in the future diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py index 59a2d7a5823..b06e72aa9ae 100644 --- a/python/paddle/distributed/auto_parallel/dist_tensor.py +++ b/python/paddle/distributed/auto_parallel/dist_tensor.py @@ -25,11 +25,11 @@ from .utils import _linear_idx2coordinate class DistributedTensor: """ - DistributedTensor represents the distribution of tensor on the process group and + DistributedTensor represents the distribution of tensor on the process group and local tensors can be created by DistributedTensor. Only support even sharding now and uneven sharding will be supported in the future. - Local tensor information can be obtained from the DistributedTensor instance object, - or obtained by the static methods provided by DistributedTensor, + Local tensor information can be obtained from the DistributedTensor instance object, + or obtained by the static methods provided by DistributedTensor, including shard (i.e. the index in the serial tensor), offsets, and sizes. """ diff --git a/python/paddle/distributed/auto_parallel/interface.py b/python/paddle/distributed/auto_parallel/interface.py index 588d2b05b79..e06120a7e19 100644 --- a/python/paddle/distributed/auto_parallel/interface.py +++ b/python/paddle/distributed/auto_parallel/interface.py @@ -39,8 +39,8 @@ def shard_tensor(x, dist_attr=None): x (Tensor): the tensor to be sharded. dist_attr (dict): the tensor distributed attributes. The accepted attributes are as follow: "process_mesh": a nested list an to describe the mesh topology of logical processes. - "dims_mapping": a list to describe the mapping between `x` and `process_mesh`, the dimension - `i` of `x` is split across the dimension `dims_mapping[i]` of `process_mesh`, + "dims_mapping": a list to describe the mapping between `x` and `process_mesh`, the dimension + `i` of `x` is split across the dimension `dims_mapping[i]` of `process_mesh`, where -1 means that tensor dimension is not split. Both process_mesh and dims_mapping are optional and users can specify as need. @@ -52,7 +52,7 @@ def shard_tensor(x, dist_attr=None): import paddle import paddle.distributed as dist - + paddle.enable_static() x = paddle.ones([4, 6]) @@ -76,12 +76,12 @@ def shard_op(op_fn, dist_attr=None): Args: op_fn (callable): a callable operator or module to be sharded. - dist_attr (dict): the operator distributed attributes. The accepted attributes are classified into - two categories. The first category decsribes the distributed attributes shared by all inputs and + dist_attr (dict): the operator distributed attributes. The accepted attributes are classified into + two categories. 
The first category decsribes the distributed attributes shared by all inputs and outputs, and only `process_mesh` can be specified now. The second category describes distributed attributes for inputs or outputs same as the `dist_attr` of `shard_tensor`. All of them are optional and users can specify them as need. Note that `process_mesh` for operators must be the - same as these process_meshes for inputs and outputs. + same as these process_meshes for inputs and outputs. Returns: list: the outputs of the function `op_fn`, which are annotated with distributed attributes. @@ -93,7 +93,7 @@ def shard_op(op_fn, dist_attr=None): import paddle.distributed as dist paddle.enable_static() - + x = paddle.ones([4, 6]) y = paddle.zeros([4, 6]) dist_add = dist.shard_op(paddle.add, diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index e7e7ad1e0ea..247f8b9fac0 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -176,7 +176,7 @@ def register_distributed_operator_impl(op_type, dist_impl): def find_compatible_distributed_operator_impls(dist_op, fwd=True, partial=True): """ - Here just return the first compatible implemention. + Here just return the first compatible implemention. This will be improved by cost model in the future. """ op_type = dist_op.serial_op.type @@ -327,9 +327,9 @@ def get_data_parallel_group(dist_ctx, op, act_grad_names, rank): Args: dist_ctx (DistributedContext): dist context. - op (Operator): the current (backward) operator which might need. - act_grad_names (list): list of input activation grads variable name to the current operator. - out_grad_names (list): list of the output parameter's grads variable name of the current operator. + op (Operator): the current (backward) operator which might need. + act_grad_names (list): list of input activation grads variable name to the current operator. + out_grad_names (list): list of the output parameter's grads variable name of the current operator. rank (int): global ranks index for current process. """ dp_group = None @@ -360,13 +360,13 @@ def get_data_parallel_group(dist_ctx, op, act_grad_names, rank): def sync_and_scale_gradients(dist_ctx, op, dp_group, allreduce_var_names): """ - insert the allreudce and scale ops for gradients of model + insert the allreudce and scale ops for gradients of model parameters for operator in data parallelism. Args: dist_ctx (DistributedContext): dist context. - op (Operator): the current (backward) operator which might need. - allreduce_var_names (list): list of the parameter's grads variable name in the current operator output. + op (Operator): the current (backward) operator which might need. + allreduce_var_names (list): list of the parameter's grads variable name in the current operator output. """ op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op) @@ -417,14 +417,14 @@ def sync_and_scale_gradients(dist_ctx, op, dp_group, allreduce_var_names): def gradient_synchronization(dist_ctx, op, act_grad_names, out_grad_names, rank): """ - conduct the allreudce and scaling(dp size)for gradients of model + conduct the allreudce and scaling(dp size)for gradients of model parameters for operator in data parallelism. Args: dist_ctx (DistributedContext): dist context. - op (Operator): the current (backward) operator which might need. - act_grad_names (list): list of input activation grads variable name to the current operator. 
- out_grad_names (list): list of the output parameter's grads variable name of the current operator. + op (Operator): the current (backward) operator which might need. + act_grad_names (list): list of input activation grads variable name to the current operator. + out_grad_names (list): list of the output parameter's grads variable name of the current operator. rank (int): global ranks index for current process. """ diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 01f7207ab91..70d9a2d21b5 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -57,9 +57,9 @@ class AutoParallelizer: AutoParallelizer is the main controller class to do the auto parallel process. And the auto parallel process will be triggered in the wrapped parallelize function. To facilitate the auto parallelization, it will contain information about program, cluster and the - related context. In this basic version, the program information will be retrevied from + related context. In this basic version, the program information will be retrevied from Fleet object, and the cluster information can be retrevied in the new created Cluster object, - and the context information can be retrevied in the new created DistributedContext. + and the context information can be retrevied in the new created DistributedContext. """ def __init__(self, fleet): diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 3262505416b..d59b4bb6617 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -39,7 +39,7 @@ class Partitioner(object): warning:: Partitioner is experimental and subject to change. Partitioner convert a program into another program. - Given a serial program which has been auto completed with shard annotation, the Partitioner + Given a serial program which has been auto completed with shard annotation, the Partitioner convert the serial program into a "distributed" program. The Partitioner will modify the serial program in following two ways, which is also the major difference between serial and distributed program: 1. partition op: replace a serial op into its corresponding dist op infered from the shard annotation diff --git a/python/paddle/distributed/auto_parallel/process_mesh.py b/python/paddle/distributed/auto_parallel/process_mesh.py index f751087e29e..ab1d68bbf8e 100644 --- a/python/paddle/distributed/auto_parallel/process_mesh.py +++ b/python/paddle/distributed/auto_parallel/process_mesh.py @@ -38,7 +38,7 @@ def _flatten_nested_list(nested_list): class ProcessMesh(object): r""" - The class `Processmesh` describes the topology of logical processes. + The class `Processmesh` describes the topology of logical processes. A mesh is an N-dimensional array. The shape of the N-dimensional array represents the topology of logical processes and every element of the N-dimensional array represent a logical process. For @@ -52,9 +52,9 @@ class ProcessMesh(object): Args: mesh (list): an N-dimensional array (nested list) describes the toplogy of logical processes. The shape of the N-dimensional array - represents the topology of logical processes and every + represents the topology of logical processes and every element of the N-dimensional array represents a logical process. 
- + Returns: None @@ -66,9 +66,9 @@ class ProcessMesh(object): import paddle import paddle.distributed as dist - + paddle.enable_static() - + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]]) assert mesh.topology == [2, 3] assert mesh.processes == [2, 4, 5, 0, 1, 3] diff --git a/python/paddle/distributed/auto_parallel/process_mesh_v2.py b/python/paddle/distributed/auto_parallel/process_mesh_v2.py index 08a391e51eb..b57cecf41e2 100644 --- a/python/paddle/distributed/auto_parallel/process_mesh_v2.py +++ b/python/paddle/distributed/auto_parallel/process_mesh_v2.py @@ -19,14 +19,14 @@ from paddle.fluid import core class ProcessMesh(core.ProcessMesh): r""" - The class `Processmesh` describes the topology of logical processes. + The class `Processmesh` describes the topology of logical processes. Args: mesh (list|numpy.array): an N-dimensional array describes the toplogy of logical processes. dim_names (list, optional): the i-th element of this list gives the name of the i-th dimension. - + Returns: None @@ -35,9 +35,9 @@ class ProcessMesh(core.ProcessMesh): import paddle import paddle.distributed as dist - + paddle.enable_static() - + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]]) assert mesh.shape == [2, 3] assert mesh.processe_ids == [2, 4, 5, 0, 1, 3] diff --git a/python/paddle/distributed/auto_parallel/tuner/algorithms.py b/python/paddle/distributed/auto_parallel/tuner/algorithms.py index 8440ab91a81..63aa56f3e1f 100644 --- a/python/paddle/distributed/auto_parallel/tuner/algorithms.py +++ b/python/paddle/distributed/auto_parallel/tuner/algorithms.py @@ -23,12 +23,12 @@ from .trial import OptimizationTunerTrial as Trial class AlgorithmBase(ABC): """ - An Tuning alogrithm is a class to find out an optimal configuration - given the selected tuning optimization pass(es) and the arguments to be tuned. + An Tuning alogrithm is a class to find out an optimal configuration + given the selected tuning optimization pass(es) and the arguments to be tuned. Different optimization pass(es) will correspond to a different algorithm, where different search space **pruning rules** will applied. - In another word, the key "algorithm" for this class is the + In another word, the key "algorithm" for this class is the search space pruning rules specific for the given optimization scenario. """ _REGISTERED_ALGORITHMS = {} @@ -52,9 +52,9 @@ class AlgorithmBase(ABC): def collect_model_info(self, main_prog, startup_prog): """ - Collect the model static info (from programs) that could be used to - pruning candidate trials and saving tuning time.For instance, - model info like number of model parameters and activation memory could be + Collect the model static info (from programs) that could be used to + pruning candidate trials and saving tuning time.For instance, + model info like number of model parameters and activation memory could be used to prune candidated trial and decide the next trial. """ pass @@ -70,7 +70,7 @@ class AlgorithmBase(ABC): @abstractmethod def update(self, results): """ - Update the algorthim with the results of last trial. Using this information is used to + Update the algorthim with the results of last trial. Using this information is used to pruning the search space of the future trial. 
""" pass diff --git a/python/paddle/distributed/auto_parallel/tuner/config.py b/python/paddle/distributed/auto_parallel/tuner/config.py index 19818a3a655..151a9a8bc76 100644 --- a/python/paddle/distributed/auto_parallel/tuner/config.py +++ b/python/paddle/distributed/auto_parallel/tuner/config.py @@ -33,7 +33,7 @@ class TuningConfig(object): """ A uniform config wrap: distributed strategy: the user defined configuration for optimization pass - tuning config: configuration for the tuning process: mode (profile or cost model), log dir, extra tuning config for optimization like search range for specific + tuning config: configuration for the tuning process: mode (profile or cost model), log dir, extra tuning config for optimization like search range for specific """ def __init__(self, user_config, strategy): diff --git a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py b/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py index ec50371c7ec..261a382eb17 100644 --- a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py +++ b/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py @@ -161,7 +161,7 @@ def _copy_context(ref_dist_context): class OptimizationTuner: """ - OptimizationTuner is used to manage the tuning procedure of hyper-parameters (configs) + OptimizationTuner is used to manage the tuning procedure of hyper-parameters (configs) of Optimization Pass in AutoParallel. """ @@ -466,7 +466,7 @@ class OptimizationTuner: Return the best optimization configuration found in the tuning. Returns: - A object of fleet.DistributedStrategy with best configuration. + A object of fleet.DistributedStrategy with best configuration. """ assert self._best_iter >= 0, "The best configuration is not found yet !" best_trial = self._finished_trials[self._best_iter] @@ -481,7 +481,7 @@ class OptimizationTuner: summary_ = """ Tuning Result Summary Run total {} trials with {} min. -The best trial is: [{}], whose configuration is following: +The best trial is: [{}], whose configuration is following: """.format(len(self._finished_trials), (time.time() - self._tuning_start_time) / 60, best_trial.name) @@ -508,8 +508,8 @@ The best trial is: [{}], whose configuration is following: def tune(self): """ - Performs the search for best hyperparameter configuations - for the selected optimization pass(es). + Performs the search for best hyperparameter configuations + for the selected optimization pass(es). """ # step1: collect model info which might be used for diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 8813bbe5449..bc797530b75 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -171,7 +171,7 @@ def print_program_with_dist_attr(program, dist_context=None): def _get_comm_group(processes, shape, axis, rank): """ - Given a rank and the processes mesh the rank belongs to, + Given a rank and the processes mesh the rank belongs to, compute the communication peers of the rank based on the give axis in the mesh. Example: 16 processes managed in a 4-Dimensinal mesh with shape of [2, 2, 2, 2]. @@ -205,7 +205,7 @@ def _get_comm_group(processes, shape, axis, rank): def _get_idx_in_axis(processes, shape, axis, rank): """ - Given a rank and the processes mesh the rank belongs to, + Given a rank and the processes mesh the rank belongs to, compute the index of the rank in given axis. 
Example: 27 processes managed in a 3-Dimensinal mesh with shape of [3, 3, 3]. @@ -226,20 +226,20 @@ def _coordinate2linear_idx(mesh_shape, coordinate): """ convert a coordinate in multidimensional mesh space into a scala idx in linear space. - it use Row-major order for dimension conversion. + it use Row-major order for dimension conversion. so it has: [most_significant_dim, ..., least_significant_dim] - assume: + assume: the size of i-th dimension to be: S[i] the index of j-th dimension is: I[j] - linear_idx of a n dimensional coordinate is: + linear_idx of a n dimensional coordinate is: I[n-1] * (S[n-2] * S[n-3] * S[n-4] * .... S[0]) + - I[n-2] * ( S[n-3] * S[n-4] * .... S[0]) + - I[n-3] * ( S[n-4] * .... S[0]) + + I[n-2] * ( S[n-3] * S[n-4] * .... S[0]) + + I[n-3] * ( S[n-4] * .... S[0]) + ... - I[1] * ( S[0]) + + I[1] * ( S[0]) + I[0] """ @@ -279,7 +279,7 @@ def _linear_idx2coordinate(mesh_shape, linear_idx): mapping a linear scala into multidimensional mesh space, return it coordinate in that space. it is the inverse function of _coordinate2linear_idx. - assume: + assume: the size of i-th dimension to be: S[i] the index of j-th dimension is: I[j] @@ -460,8 +460,8 @@ def save_distributed_checkpoint(program, addition_info=None, is_integrated=False, dist_context=None): - """ - Save model parameter state, optimzer state, distributed attribute and + """ + Save model parameter state, optimzer state, distributed attribute and additional information of each rank. Args: @@ -502,7 +502,7 @@ def save_distributed_checkpoint(program, def load_distributed_checkpoint(checkpoint_path, dist_attr_path): - """ + """ Load parameter, optimizer, distributed attribute and addition_info. Args: @@ -512,7 +512,7 @@ def load_distributed_checkpoint(checkpoint_path, dist_attr_path): Returns: param_dict(dict): parameters' value of all ranks. dist_attr(dict): parameters' distributed attribute. - addition_info(dict): additional information user saved in last training. + addition_info(dict): additional information user saved in last training. Notes: The return, 'addition_info', is belonging to the first file of checkpoint_path by default. @@ -520,9 +520,9 @@ def load_distributed_checkpoint(checkpoint_path, dist_attr_path): Examples: .. code-block:: python - ckpt_path = ['./model_state_rank0.pdmodel', + ckpt_path = ['./model_state_rank0.pdmodel', './model_state_rank1.pdmodel'] - dist_attr_path = ['./dist_attr_rank0.pdattr', + dist_attr_path = ['./dist_attr_rank0.pdattr', './dist_attr_rank1.pdattr'] param_dict, dist_attr, add_info = load_distributed_checkpoint(ckpt_path, dist_attr_path) """ @@ -542,7 +542,7 @@ def load_checkpoint_into_program(checkpoint_path, dist_attr_path, program, dist_context=None): - """ + """ Load parameter, optimizer, distributed attribute and addition_info into model. Args: @@ -553,7 +553,7 @@ def load_checkpoint_into_program(checkpoint_path, Returns: addition_info(dict): user saved in last train. - + Notes: The return, 'addition_info', is belonging to the first file of checkpoint_path by default. @@ -561,9 +561,9 @@ def load_checkpoint_into_program(checkpoint_path, .. 
code-block:: python exe.run(startup_program) - ckpt_path = ['./model_state_rank0.pdmodel', + ckpt_path = ['./model_state_rank0.pdmodel', './model_state_rank1.pdmodel'] - dist_attr_path = ['./dist_attr_rank0.pdattr', + dist_attr_path = ['./dist_attr_rank0.pdattr', './dist_attr_rank1.pdattr'] load_checkpoint_into_program(ckpt_path, dist_attr_path, main_program) """ @@ -590,7 +590,7 @@ def load_checkpoint_into_program(checkpoint_path, def load_parameter_into_program(param_dict, program): - """ + """ Load parameters into program. Args: @@ -672,7 +672,7 @@ def _load_distributed_state_dict(checkpoint_path): def get_dist_attr(program, dist_context=None): - """ + """ Get distributed attribute of current rank. Args: @@ -935,7 +935,7 @@ def _get_sliced_param_index(rank, complete_shape, dims_mapping, process_shape, process_group = [0, 1, 2] slice_param = _slice_parameter(complete_param, [[], [], [2, 4]], 3) - # slice_param: + # slice_param: # [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])] index = _get_sliced_param_index(rank, complete_shape, dims_mapping diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 3a06c2f84af..57731b8ad0e 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -579,10 +579,10 @@ def destroy_process_group(group=None): Destroy a given group for communication Args: - group (ProcessGroup, optional): The group to be destroyed. All of process groups, including - the default group, will be destroyed and the distributed + group (ProcessGroup, optional): The group to be destroyed. All of process groups, including + the default group, will be destroyed and the distributed environment will be deinitialized. - + Returns : None Examples: @@ -776,7 +776,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True): Reduce a tensor over all ranks so that all get the result. As shown below, one process is started with a GPU and the data of this process is represented - by its group rank. The reduce operator is sum. Through all_reduce operator, + by its group rank. The reduce operator is sum. Through all_reduce operator, each GPU will have the sum of the data from all GPUs. .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/allreduce.png @@ -1662,10 +1662,10 @@ def _parallel_linear(x, """ Parallel Linear - axis the dimension of the parameter of linear layer. + axis the dimension of the parameter of linear layer. axis = 0: the row dimension axis = 1: the col dimension - + """ if group is not None and not group.is_member(): return @@ -1840,7 +1840,7 @@ def split(x, of which is a matrix with N/num_partitions rows and M column. The linear layer put on single card is shown as below, the input variable is represented by X, - the weight matrix is represented by W and the output vaiable is O. The linear layer on single card is + the weight matrix is represented by W and the output vaiable is O. The linear layer on single card is simple matrix multiplication operation, O = X * W. .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_single.png @@ -1863,14 +1863,14 @@ def split(x, of which is a matrix with N rows and M/num_partitions column. The linear layer put on single card has been illustrated on case 2 and Column Parallel Linear - is shown as below. 
The Column Parallel Linear splits the weight matrix W into [W_col1, W_col2] along the column and - these splitted matrices respectively multiply the input. Finally apply AllGather on the output from each card to get the final output. + is shown as below. The Column Parallel Linear splits the weight matrix W into [W_col1, W_col2] along the column and + these splitted matrices respectively multiply the input. Finally apply AllGather on the output from each card to get the final output. .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_col.png :width: 800 :alt: split_col :align: center - + As observed, the column parallel linear and row parallel linear can be combined to skip one ALLGATHER communication operator. Furthermore the Attention and MLP can be combined to imporve the performance as shown below. @@ -2019,10 +2019,10 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): data type of the input Tensors. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True. - + Returns: None. - + Examples: .. code-block:: python @@ -2116,16 +2116,16 @@ def alltoall_single(in_tensor, Args: in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor. - in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor`` + in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor`` must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None. - out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor`` + out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor`` must be divisible by group size and ``out_tensor`` will be gathered averagely from all participators. Default: None. group (Group, optional): The group instance return by ``new_group`` or None for global default group. Default: None. use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True. - + Returns: None, if ``use_calc_stream`` is set to ``True``; ``Task`` of ``group``, if ``use_calc_stream`` is set to ``False``. - + Examples: .. code-block:: python @@ -2207,7 +2207,7 @@ def send(tensor, dst=0, group=None, use_calc_stream=True): dst (int): The destination rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. use_calc_stream (bool, optional): Whether to use calculate stream or communication stream. Default: True. - + Returns: None. @@ -2272,7 +2272,7 @@ def recv(tensor, src=0, group=None, use_calc_stream=True): src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. use_calc_stream (bool, optional): Whether to use calculate stream or communication stream. Default: True. - + Returns: None. @@ -2353,11 +2353,11 @@ def isend(tensor, dst, group=None): should be float16, float32, float64, int32, int64, int8, uint8 or bool. 
dst (int): The destination rank. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. - + Returns: A distributed task object. - Warning: + Warning: This API only supports the dygraph mode. Examples: @@ -2407,7 +2407,7 @@ def irecv(tensor, src=None, group=None): Returns: A distributed task object. - Warning: + Warning: This API only supports the dygraph mode. Examples: @@ -2456,7 +2456,7 @@ class P2POp(object): The type of ``op`` is either ``paddle.distributed.isend`` or ``paddle.distributed.irecv``. tensor (Tensor): Tensor to send or receive. peer (int): The destination or source rank. - group (Group, optional): The group instance return by new_group or None for global + group (Group, optional): The group instance return by new_group or None for global default group. Default: None. """ @@ -2505,7 +2505,7 @@ def batch_isend_irecv(p2p_op_list): """ Send or Receive a batch of tensors asynchronously and return a list of requests. - Process each of the point-to-point operations in ``p2p_op_list`` and return the + Process each of the point-to-point operations in ``p2p_op_list`` and return the corresponding tasks. NCCL are currently supported. Args: @@ -2516,9 +2516,9 @@ def batch_isend_irecv(p2p_op_list): Returns: A list of distributed tasks returned by calling the corresponding - op in the op_list. + op in the op_list. - Warning: + Warning: This API only supports the dygraph mode. Examples: @@ -2546,7 +2546,7 @@ def batch_isend_irecv(p2p_op_list): for task in tasks: task.wait() - + print(recv_t) # paddle.tensor([1, 2]) # Rank-0 # paddle.tensor([0, 1]) # Rank-1 @@ -2587,15 +2587,15 @@ def reduce_scatter(tensor, tensor_list (list[Tensor]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. - group (Group, optional): The group instance return by new_group or None for global + group (Group, optional): The group instance return by new_group or None for global default group. Default: None. use_calc_stream (bool, optional): Whether this op should be an async op. Returns: Async task handle, if use_calc_stream is set to False. None, if use_calc_stream or if not part of the group. - - Warning: + + Warning: This API only supports the dygraph mode. @@ -2652,7 +2652,7 @@ def _reduce_scatter_base(output, Args: output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. - input (Tensor): Input tensor that is of size output tensor size times world size. Its data type + input (Tensor): Input tensor that is of size output tensor size times world size. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. group (ProcessGroup, optional): The process group to work on. If None, diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 2a11dd7eace..61ce3d6bb7d 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -114,12 +114,12 @@ class DistributedStrategy(object): """ DistributedStrategy is the main configuration entry for distributed training of Paddle. 
All of the distributed training configurations can be configured in DistributedStrategy, - such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS), + such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS), asynchronous update parameter server(ASGD), etc. DistributedStrategy can be serialized into protobuf file or deserialized from protobuf file - Users who run local training usually configure BuildStrategy and ExecutionStrategy, and + Users who run local training usually configure BuildStrategy and ExecutionStrategy, and DistributedStrategy supports configurations from BuildStrategy and ExecutionStrategy """ @@ -290,7 +290,7 @@ class DistributedStrategy(object): def a_sync(self): """ Indicating whether we are using asynchronous stocastic gradient descent updates - for training. This property is valid when we are using parameter server training, + for training. This property is valid when we are using parameter server training, which is implied by setting approperate RoleMaker Default value: True @@ -372,7 +372,7 @@ class DistributedStrategy(object): @property def trainer_desc_configs(self): """ - Set trainer desc configurations. + Set trainer desc configurations. **Notes**: dump_fields_path(str): the path of dump fields @@ -381,7 +381,7 @@ class DistributedStrategy(object): dump_param(list(str)): the param that you want to dump - stat_var_names(list(str)): + stat_var_names(list(str)): Examples: @@ -443,12 +443,12 @@ class DistributedStrategy(object): @property def fs_client_param(self): """ - Set fs client configurations. + Set fs client configurations. **Notes**: uri(str): the uri of fs client user(str): the user_name of fs client passwd(str): the passwd of fs client - hadoop_bin(str): + hadoop_bin(str): Examples: .. code-block:: python import paddle.distributed.fleet as fleet @@ -1001,15 +1001,15 @@ class DistributedStrategy(object): @property def last_comm_group_size_MB(self): """ - Specifying the size of gradient to fuse in Mega-Bytes when - the last group of each batch communicates. Making the last group - small is useful to improve performance. + Specifying the size of gradient to fuse in Mega-Bytes when + the last group of each batch communicates. Making the last group + small is useful to improve performance. Default value: 1 Examples: .. code-block:: python - + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.last_comm_group_size_MB = 2 @@ -1027,7 +1027,7 @@ class DistributedStrategy(object): @property def find_unused_parameters(self): """ - Indicating whether we are using find_unused_parameters to + Indicating whether we are using find_unused_parameters to find unused parameters in DataParallel. Default value: False @@ -1104,20 +1104,20 @@ class DistributedStrategy(object): @property def recompute_configs(self): """ - Set recompute configurations. - + Set recompute configurations. + **Note**: checkpoints(list): list of string name of checkpoints. In general, the recompute strategy of current implementation should have some manually assign checkpoints. - enable_offload(bool): enable recompute checkpoints offload feature. this feature + enable_offload(bool): enable recompute checkpoints offload feature. this feature will offload the checkpoint to host memory to allow even larger batch size. since the memcpy from host to device takes time, it is a trade off between larger batch size and training speed. checkpoint_shape(list): list of int that specific the shape of checkpoint. 
so far recompute-offload requires that all checkpoint to be same shape, and every dimension - specific here should be determined ("-1" is not allowed). + specific here should be determined ("-1" is not allowed). Examples: @@ -1145,7 +1145,7 @@ class DistributedStrategy(object): def sharding(self): """ Indicating whether we are using sharding Optimizer for memory - optimization. We implement the sharding optimizer following the ZeRO-DP + optimization. We implement the sharding optimizer following the ZeRO-DP idea from [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054). Model parameters and Optimizer State are sharded into different ranks allowing to fit larger model. @@ -1174,26 +1174,26 @@ class DistributedStrategy(object): @property def sharding_configs(self): """ - Set sharding configurations. + Set sharding configurations. **Note**: - sharding_segment_strategy(string, optional): strategy used to segment the program(forward & backward operations). two strategise are - available: "segment_broadcast_MB" and "segment_anchors". segment is a concept used in sharding to overlap computation and + sharding_segment_strategy(string, optional): strategy used to segment the program(forward & backward operations). two strategise are + available: "segment_broadcast_MB" and "segment_anchors". segment is a concept used in sharding to overlap computation and communication. Default is segment_broadcast_MB. - segment_broadcast_MB(float, optional): segment by the parameters broadcast volume. sharding will introduce parameter broadcast operations into program, and + segment_broadcast_MB(float, optional): segment by the parameters broadcast volume. sharding will introduce parameter broadcast operations into program, and after every segment_broadcast_MB size parameter being broadcasted, the program will be cutted into one segment. This configuration will affect the communication speed in sharding training, and should be an empirical value decided by your model size and network topology. Only enable when sharding_segment_strategy = segment_broadcast_MB. Default is 32.0 . - segment_anchors(list): list of anchors used to segment the program, which allows a finner control of program segmentation. + segment_anchors(list): list of anchors used to segment the program, which allows a finner control of program segmentation. this strategy is experimental by now. Only enable when sharding_segment_strategy = segment_anchors. sharding_degree(int, optional): specific the number of gpus within each sharding parallelism group; and sharding will be turn off if sharding_degree=1. Default is 8. gradient_merge_acc_step(int, optional): specific the accumulation steps in gradient merge; and gradient merge will be turn off if gradient_merge_acc_step=1. Default is 1. - optimize_offload(bool, optional): enable the optimizer offload which will offload the moment vars to Host memory in order to saving GPU memory for fitting larger model. + optimize_offload(bool, optional): enable the optimizer offload which will offload the moment vars to Host memory in order to saving GPU memory for fitting larger model. the moment var will be prefetch from and offloaded to Host memory during update stage. it is a stragtegy that trades off between training speed and GPU memory, and is recommened to be turn on only when gradient_merge_acc_step large, where the number of time of update stage will be relatively small compared with forward&backward's. Default is False. 
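The recompute and sharding knobs whose docstrings are rewrapped above are all driven through the same dictionary-style configs; a small sketch using only keys named in those docstrings (the checkpoint tensor names are hypothetical):

.. code-block:: python

    # Illustrative sketch only; key names follow the docstrings above.
    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()

    strategy.recompute = True
    strategy.recompute_configs = {
        "checkpoints": ["fc_0.tmp_0", "fc_1.tmp_0"],   # hypothetical tensor names
        "enable_offload": False,
    }

    strategy.sharding = True
    strategy.sharding_configs = {
        "sharding_segment_strategy": "segment_broadcast_MB",
        "segment_broadcast_MB": 32.0,
        "sharding_degree": 8,
        "gradient_merge_acc_step": 1,
    }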
@@ -1203,7 +1203,7 @@ class DistributedStrategy(object): pp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each pipeline parallelism group; and pipeline parallelism will turn be off if pp_degree=1. Default is 1. - pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from backward stage to update(optimize) stage when pipeline parallelsim is on. + pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from backward stage to update(optimize) stage when pipeline parallelsim is on. This configuration will affect the communication speed of Hybrid parallelism training depeneded on network topology. this strategy is experimental by now.. Default is False. optimize_cast(bool, optional): [Hybrid parallelism ONLY] Move the cast op of AMP which cast fp32 param to fp16 param to optimizer. optimize_cast will persist fp16 param, it @@ -1385,11 +1385,11 @@ class DistributedStrategy(object): """ Set pipeline parallelism configurations. In pipeline parallelism, different parts of neural networks are running on different GPUS. - There are Tensor queue buffer between each pair of neighborhood GPUS + There are Tensor queue buffer between each pair of neighborhood GPUS that are responsible for synchronizing hidden Tensor results between GPUs. Pipeline parallelism consists of serveral producer-consumer style hardware pairs, such as GPU-GPU, CPU-GPU, GPU-XPU. The best way to speedup - pipeline parallelism is to make the size of Tensor in Tensor queue smaller, + pipeline parallelism is to make the size of Tensor in Tensor queue smaller, so that we will have a faster producer for downstream consumers. **Notes**: @@ -1475,7 +1475,7 @@ class DistributedStrategy(object): @property def hybrid_configs(self): """ - Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism + Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism needs to meet the following relationships total_number_GPUs = dp_degree * mp_degree * pp_degree @@ -1483,7 +1483,7 @@ class DistributedStrategy(object): **Note**: dp_degree(int): set number of GPUs in a data parallel group. Default -1. This value should be an integer greater than 0. - If it is not set, or set to -1, its value will be inferred + If it is not set, or set to -1, its value will be inferred based on the total number of cards. mp_degree(int): set number of GPUs in a model parallel group. Default 1 pp_degree(int): set number of GPUs in a pipeline parallel group. Default 1 @@ -1567,7 +1567,7 @@ class DistributedStrategy(object): def adaptive_localsgd(self): """ Indicating whether we are using Adaptive Local SGD training. Default Value: False - For more details, please refer to `Adaptive Communication Strategies to Achieve + For more details, please refer to `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD `_. @@ -1770,8 +1770,8 @@ class DistributedStrategy(object): @property def lars(self): """ - Set lars configurations. lars is used to deal with the convergence problems when the global - batch size is larger than 8k. For more details, please refer to + Set lars configurations. lars is used to deal with the convergence problems when the global + batch size is larger than 8k. For more details, please refer to [Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888). 
Default Value: False @@ -1802,8 +1802,8 @@ class DistributedStrategy(object): **Notes**: **lars_coeff (float)**: trust ratio in lars formula. **lars_weight_decay** (float): weight decay coefficient in lars formula. - **epsilon (float)**: argument is used to avoid potential devision-by-zero - when compute the local lr; + **epsilon (float)**: argument is used to avoid potential devision-by-zero + when compute the local lr; **exclude_from_weight_decay ([string])**: is a list of name strings of layers which will be exclude from weight decay in lars formula. @@ -1832,9 +1832,9 @@ class DistributedStrategy(object): @property def lamb(self): """ - Set lamb configurations. lamb is used to deal with the convergence problems for large - batch size training, specially for attention-related model like BERT. For more details, - please refer to + Set lamb configurations. lamb is used to deal with the convergence problems for large + batch size training, specially for attention-related model like BERT. For more details, + please refer to [Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962). Default Value: False @@ -1908,7 +1908,7 @@ class DistributedStrategy(object): def auto(self): """ Indicating whether we are using auto-parallel configuration - This feature is currently an experimental feature. Currently, + This feature is currently an experimental feature. Currently, auto-parallelism can be used only when a user does not set any other strategy configs except auto. For details, please reference the following code example @@ -1943,7 +1943,7 @@ class DistributedStrategy(object): def semi_auto(self): """ Indicating whether we are using semi-auto parallel function - This feature is currently an experimental feature. Currently, + This feature is currently an experimental feature. Currently, auto-parallelism can be used only when a user does not set any other strategy configs except semi-auto. For details, please reference the following code example @@ -2047,7 +2047,7 @@ class DistributedStrategy(object): activation_bits(int): quantization bit number for activation. Default is 8. - not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope, + not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope, the corresponding op will not be quantized. algo(str): Other quantization training algorithm. diff --git a/python/paddle/distributed/fleet/base/private_helper_function.py b/python/paddle/distributed/fleet/base/private_helper_function.py index 8e2871272a9..223e3afbd59 100644 --- a/python/paddle/distributed/fleet/base/private_helper_function.py +++ b/python/paddle/distributed/fleet/base/private_helper_function.py @@ -24,11 +24,11 @@ def wait_server_ready(endpoints): """ Wait until parameter servers are ready, use connext_ex to detect port readiness. - + Args: endpoints (list|tuple): endpoints string list, like: ["127.0.0.1:8080", "127.0.0.1:8081"] - + Examples: .. 
code-block:: python diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 67350be6210..a0b39c3b7a3 100755 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -750,7 +750,7 @@ class PaddleCloudRoleMaker(RoleMakerBase): def _get_previous_trainers(self): """ - invoked by heter worker + invoked by heter worker """ if not self._role_is_generated: self._generate_role() @@ -761,7 +761,7 @@ class PaddleCloudRoleMaker(RoleMakerBase): def _get_next_trainers(self): """ - invoked by heter worker + invoked by heter worker """ if not self._role_is_generated: self._generate_role() diff --git a/python/paddle/distributed/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py index 823061f9035..fa9dc1e0cdc 100644 --- a/python/paddle/distributed/fleet/base/strategy_compiler.py +++ b/python/paddle/distributed/fleet/base/strategy_compiler.py @@ -116,7 +116,7 @@ class StrategyCompiler(StrategyCompilerBase): """ StrategyCompiler is responsible for meta optimizers combination Generally, a user can define serveral distributed strategies that - can generate serveral meta optimizer. The combination of these + can generate serveral meta optimizer. The combination of these meta optimizers should have the right order to apply the optimizers' minimize function. This class is responsible for the executable distributed optimizer @@ -162,7 +162,7 @@ class StrategyCompiler(StrategyCompilerBase): """ Meta Optimizer Type A: rewrite forward, backward. e.g. recompute, async, sync, pipeline. results will be splitted in async, sync, pipeline - Meta Optimizer Type B: rewrite forward, + Meta Optimizer Type B: rewrite forward, e.g. AMP and the corresponding backward is generated by rewritten forward Meta Opitmizer Type B: rewrite backward. e.g. gradient fusion Meta Optimizer Type D: rewrite optimize. e.g. lars, lamb, localsgd, gradient merge, dgc diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index bbaca895120..6f3f732f71a 100644 --- a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -32,7 +32,7 @@ class ParallelMode(object): - DATA_PARALLEL: Distribute input data to different devices. - TENSOR_PARALLEL: Shards tensors in the network to different devices. - PIPELINE_PARALLEL: Place different layers of the network on different devices. - - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states + - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states corresponding to the parameters to each device. Examples: diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py index 6705eb36bf3..d6fb8a7de71 100755 --- a/python/paddle/distributed/fleet/base/util_factory.py +++ b/python/paddle/distributed/fleet/base/util_factory.py @@ -286,7 +286,7 @@ class UtilBase(object): def print_on_rank(self, message, rank_id): """ - Woker of rank `rank_id` print some message. + Woker of rank `rank_id` print some message. Args: message(str): Log to be printed. 
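The ``wait_server_ready`` helper documented above takes a plain list of ``ip:port`` strings and blocks until every endpoint accepts connections; a minimal sketch, assuming the module path shown in the diff and hypothetical endpoints:

.. code-block:: python

    # Illustrative sketch only; blocks until both endpoints are reachable.
    from paddle.distributed.fleet.base.private_helper_function import \
        wait_server_ready

    wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])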
diff --git a/python/paddle/distributed/fleet/data_generator/data_generator.py b/python/paddle/distributed/fleet/data_generator/data_generator.py index 47d9e4cc8ef..af66cbdb04d 100644 --- a/python/paddle/distributed/fleet/data_generator/data_generator.py +++ b/python/paddle/distributed/fleet/data_generator/data_generator.py @@ -22,7 +22,7 @@ class DataGenerator(object): """ DataGenerator is a general Base class for user to inherit A user who wants to define his/her own python processing logic - with paddle.distributed.InMemoryDataset/QueueDataset should + with paddle.distributed.InMemoryDataset/QueueDataset should inherit this class. """ @@ -96,7 +96,7 @@ class DataGenerator(object): def run_from_stdin(self): ''' This function reads the data row from stdin, parses it with the - process function, and further parses the return value of the + process function, and further parses the return value of the process function with the _gen_str function. The parsed data will be wrote to stdout and the corresponding protofile will be generated. @@ -152,7 +152,7 @@ class DataGenerator(object): def generate_sample(self, line): ''' - This function needs to be overridden by the user to process the + This function needs to be overridden by the user to process the original data row into a list or tuple. Args: @@ -160,8 +160,8 @@ class DataGenerator(object): Returns: Returns the data processed by the user. - The data format is list or tuple: - [(name, [feasign, ...]), ...] + The data format is list or tuple: + [(name, [feasign, ...]), ...] or ((name, [feasign, ...]), ...) For example: @@ -290,7 +290,7 @@ class MultiSlotDataGenerator(DataGenerator): and updating proto_info information. The input line will be in this format: - >>> [(name, [feasign, ...]), ...] + >>> [(name, [feasign, ...]), ...] >>> or ((name, [feasign, ...]), ...) The output will be in this format: >>> [ids_num id1 id2 ...] ... diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index 3c6da4bd957..de8708855a6 100755 --- a/python/paddle/distributed/fleet/dataset/dataset.py +++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -46,7 +46,7 @@ class DatasetBase(object): fs_ugi="", download_cmd="cat"): """ - should be called only once in user's python scripts to initialize setings of dataset instance. + should be called only once in user's python scripts to initialize setings of dataset instance. Normally, it is called by InMemoryDataset or QueueDataset. Args: @@ -341,7 +341,7 @@ class DatasetBase(object): class InMemoryDataset(DatasetBase): """ :api_attr: Static Graph - + It will load data into memory and shuffle data before training. Examples: @@ -376,8 +376,8 @@ class InMemoryDataset(DatasetBase): Args: kwargs: Keyword arguments. Currently, we support following keys in **kwargs: - merge_size(int): ins size to merge, if merge_size > 0, set merge by line id, - instances of same line id will be merged after shuffle, + merge_size(int): ins size to merge, if merge_size > 0, set merge by line id, + instances of same line id will be merged after shuffle, you should parse line id in data generator. default is -1. parse_ins_id(bool): Set if Dataset need to parse ins_id. default is False. parse_content(bool): Set if Dataset need to parse content. default is False. 
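The ``DataGenerator`` contract described above (override ``generate_sample`` to return a generator, then drive it with ``run_from_stdin``) can be sketched as follows; the class name and slot names are hypothetical, and the import path is assumed from the file shown in the diff:

.. code-block:: python

    # Illustrative sketch only; slot names and script usage are hypothetical.
    from paddle.distributed.fleet.data_generator import MultiSlotDataGenerator

    class MyDataGenerator(MultiSlotDataGenerator):

        def generate_sample(self, line):
            def reader():
                ids = [int(tok) for tok in line.strip().split()]
                # expected sample format: [(name, [feasign, ...]), ...]
                yield [("words", ids), ("label", [ids[0] % 2])]
            return reader

    if __name__ == "__main__":
        # reads raw lines from stdin and writes parsed samples to stdout
        MyDataGenerator().run_from_stdin()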
@@ -404,7 +404,7 @@ class InMemoryDataset(DatasetBase): parse_content=True, fea_eval=True, candidate_size=10000) - + """ merge_size = kwargs.get("merge_size", -1) if merge_size > 0: @@ -449,8 +449,8 @@ class InMemoryDataset(DatasetBase): data_feed_type(str): data feed type used in c++ code. default is "MultiSlotInMemoryDataFeed". queue_num(int): Dataset output queue num, training threads get data from queues. default is-1, which is set same as thread number in c++. - merge_size(int): ins size to merge, if merge_size > 0, set merge by line id, - instances of same line id will be merged after shuffle, + merge_size(int): ins size to merge, if merge_size > 0, set merge by line id, + instances of same line id will be merged after shuffle, you should parse line id in data generator. default is -1. parse_ins_id(bool): Set if Dataset need to parse ins_id. default is False. parse_content(bool): Set if Dataset need to parse content. default is False. @@ -463,7 +463,7 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle + import paddle paddle.enable_static() dataset = paddle.distributed.InMemoryDataset() @@ -479,7 +479,7 @@ class InMemoryDataset(DatasetBase): fea_eval=True, candidate_size=10000) dataset.update_settings(batch_size=2) - + """ for key in kwargs: if key == "pipe_command": @@ -515,10 +515,10 @@ class InMemoryDataset(DatasetBase): :api_attr: Static Graph should be called only once in user's python scripts to initialize setings of dataset instance - + Args: kwargs: Keyword arguments. Currently, we support following keys in **kwargs: - + batch_size(int): batch size. It will be effective during training. default is 1. thread_num(int): thread num, it is the num of readers. default is 1. use_var(list): list of variables. Variables which you will use. default is []. @@ -561,7 +561,7 @@ class InMemoryDataset(DatasetBase): dataset.set_filelist( ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) dataset.load_into_memory() - + place = paddle.CPUPlace() exe = paddle.static.Executor(place) startup_program = paddle.static.Program() @@ -569,7 +569,7 @@ class InMemoryDataset(DatasetBase): exe.run(startup_program) exe.train_from_dataset(main_program, dataset) - + os.remove("./test_queue_dataset_run_a.txt") os.remove("./test_queue_dataset_run_b.txt") @@ -831,7 +831,7 @@ class InMemoryDataset(DatasetBase): def load_into_memory(self, is_shuffle=False): """ :api_attr: Static Graph - + Load data into memory Args: @@ -842,7 +842,7 @@ class InMemoryDataset(DatasetBase): import paddle paddle.enable_static() - + dataset = paddle.distributed.InMemoryDataset() slots = ["slot1", "slot2", "slot3", "slot4"] slots_vars = [] @@ -1035,7 +1035,7 @@ class InMemoryDataset(DatasetBase): def release_memory(self): """ :api_attr: Static Graph - + Release InMemoryDataset memory data, when data will not be used again. Examples: @@ -1043,7 +1043,7 @@ class InMemoryDataset(DatasetBase): import paddle paddle.enable_static() - + dataset = paddle.distributed.InMemoryDataset() slots = ["slot1", "slot2", "slot3", "slot4"] slots_vars = [] @@ -1144,7 +1144,7 @@ class InMemoryDataset(DatasetBase): import paddle paddle.enable_static() - + dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset() slots = ["slot1", "slot2", "slot3", "slot4"] @@ -1180,13 +1180,13 @@ class InMemoryDataset(DatasetBase): """ set fea eval mode for slots shuffle to debug the importance level of slots(features), fea_eval need to be set True for slots shuffle. 
- + Args: - record_candidate_size(int): size of instances candidate to shuffle + record_candidate_size(int): size of instances candidate to shuffle one slot fea_eval(bool): whether enable fea eval mode to enable slots shuffle. default is True. - + Examples: .. code-block:: python @@ -1202,12 +1202,12 @@ class InMemoryDataset(DatasetBase): def slots_shuffle(self, slots): """ - Slots Shuffle - Slots Shuffle is a shuffle method in slots level, which is usually used + Slots Shuffle + Slots Shuffle is a shuffle method in slots level, which is usually used in sparse feature with large scale of instances. To compare the metric, i.e. - auc while doing slots shuffle on one or several slots with baseline to + auc while doing slots shuffle on one or several slots with baseline to evaluate the importance level of slots(features). - + Args: slots(list[string]): the set of slots(string) to do slots shuffle. @@ -1216,7 +1216,7 @@ class InMemoryDataset(DatasetBase): import paddle paddle.enable_static() - + dataset = paddle.distributed.InMemoryDataset() dataset._init_distributed_settings(fea_eval=True) slots = ["slot1", "slot2", "slot3", "slot4"] @@ -1442,7 +1442,7 @@ class BoxPSDataset(InMemoryDataset): def begin_pass(self): """ Begin Pass - Notify BoxPS to load sparse parameters of next pass to GPU Memory + Notify BoxPS to load sparse parameters of next pass to GPU Memory Examples: .. code-block:: python @@ -1456,7 +1456,7 @@ class BoxPSDataset(InMemoryDataset): def end_pass(self, need_save_delta): """ End Pass - Notify BoxPS that current pass ended + Notify BoxPS that current pass ended Examples: .. code-block:: python @@ -1522,12 +1522,12 @@ class BoxPSDataset(InMemoryDataset): def slots_shuffle(self, slots): """ - Slots Shuffle - Slots Shuffle is a shuffle method in slots level, which is usually used + Slots Shuffle + Slots Shuffle is a shuffle method in slots level, which is usually used in sparse feature with large scale of instances. To compare the metric, i.e. - auc while doing slots shuffle on one or several slots with baseline to + auc while doing slots shuffle on one or several slots with baseline to evaluate the importance level of slots(features). - + Args: slots(list[string]): the set of slots(string) to do slots shuffle. @@ -1585,7 +1585,7 @@ class BoxPSDataset(InMemoryDataset): def preprocess_instance(self): """ - Merge pv instance and convey it from input_channel to input_pv_channel. + Merge pv instance and convey it from input_channel to input_pv_channel. It will be effective when enable_pv_merge_ is True. 
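Pulling the ``InMemoryDataset`` pieces above together, a compact static-graph sketch built from the calls shown in the docstrings (the filelist entry is hypothetical and must already exist in MultiSlot text format):

.. code-block:: python

    # Illustrative sketch only; "demo_data.txt" is a hypothetical data file.
    import paddle

    paddle.enable_static()

    slots = ["slot1", "slot2"]
    slots_vars = [
        paddle.static.data(name=s, shape=[None, 1], dtype="int64", lod_level=1)
        for s in slots
    ]

    dataset = paddle.distributed.InMemoryDataset()
    dataset.init(batch_size=1, thread_num=2, pipe_command="cat",
                 use_var=slots_vars)
    dataset.set_filelist(["demo_data.txt"])
    dataset.load_into_memory()
    # ... exe.train_from_dataset(paddle.static.default_main_program(), dataset)
    dataset.release_memory()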
Examples: diff --git a/python/paddle/distributed/fleet/elastic/manager.py b/python/paddle/distributed/fleet/elastic/manager.py index e0a6bd81c8e..473fcc131be 100644 --- a/python/paddle/distributed/fleet/elastic/manager.py +++ b/python/paddle/distributed/fleet/elastic/manager.py @@ -360,7 +360,7 @@ class ElasticManager(object): def _parse_np(self, np: str): """ - np format is "MIN" or "MIN:MAX" + np format is "MIN" or "MIN:MAX" """ np_str = np or os.getenv('PADDLE_ELASTIC_NP', "0") np_dict = np_str.split(":") diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index 089e4e51c92..93bfeaf5880 100644 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -174,14 +174,14 @@ class Fleet(object): Args: role_maker (RoleMakerBase, optional): A ``RoleMakerBase`` containing the configuration - of environment variables related to distributed training.If you did not initialize + of environment variables related to distributed training.If you did not initialize the rolemaker by yourself, it will be automatically initialized to PaddleRoleMaker. The default value is None. - is_collective (Boolean, optional): A ``Boolean`` variable determines whether the program + is_collective (Boolean, optional): A ``Boolean`` variable determines whether the program runs on Collective mode or ParameterServer mode. True means the program runs on - Collective mode, and False means running on ParameterServer mode. The default value + Collective mode, and False means running on ParameterServer mode. The default value is False. - strategy (DistributedStrategy): Extra properties for distributed training. + strategy (DistributedStrategy): Extra properties for distributed training. For details, please refer to paddle.distributed.fleet.DistributedStrategy. Default: None. @@ -991,10 +991,10 @@ class Fleet(object): Args: optimizer(Optimizer): The executor to run for init server. - strategy(DistributedStrategy): Extra properties for distributed optimizer. + strategy(DistributedStrategy): Extra properties for distributed optimizer. It is recommended to use DistributedStrategy in fleet.init(). The strategy - here is for compatibility. If the strategy in fleet.distributed_optimizer() - is not None, then it will overwrite the DistributedStrategy in fleet.init(), + here is for compatibility. If the strategy in fleet.distributed_optimizer() + is not None, then it will overwrite the DistributedStrategy in fleet.init(), which will take effect in distributed training. Returns: @@ -1057,14 +1057,14 @@ class Fleet(object): use_fp16_test=False): """ Init the amp training, such as cast fp32 parameters to fp16 type. - + Args: - place(CUDAPlace): place is used to initialize + place(CUDAPlace): place is used to initialize fp16 parameters with fp32 values. scope(Scope): The scope is used to find fp32 parameters. test_program(Program): The program is used for testing. use_fp16_test(bool): Whether to use fp16 testing. - + Examples: .. code-block:: python @@ -1086,7 +1086,7 @@ class Fleet(object): loss = paddle.mean(hidden) # 2) Create the optimizer and set `multi_precision` to True. # Setting `multi_precision` to True can avoid the poor accuracy - # or the slow convergence in a way. + # or the slow convergence in a way. optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True) # 3) These ops in `custom_black_list` will keep in the float32 computation type. 
amp_list = paddle.static.amp.CustomOpLists( @@ -1106,9 +1106,9 @@ class Fleet(object): # 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`). # If you want to perform the testing process, you should pass `test_program` into `amp_init`. optimizer.amp_init(place, scope=paddle.static.global_scope()) - + if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0: - run_example_code() + run_example_code() """ amp_optimizer = self._get_amp_optimizer() return amp_optimizer.amp_init(place, scope, test_program, use_fp16_test) diff --git a/python/paddle/distributed/fleet/fleet_executor_utils.py b/python/paddle/distributed/fleet/fleet_executor_utils.py index f5a1d8b1814..48ef34f4603 100644 --- a/python/paddle/distributed/fleet/fleet_executor_utils.py +++ b/python/paddle/distributed/fleet/fleet_executor_utils.py @@ -39,7 +39,7 @@ class TaskNode: :param role (int): The role of the task node. (Will be removed in the future) :param node_type (str): The type of the task node. :param task_id (int): The id of task node. - :param ops (list): A list of op.desc to init the task node. (Will be removed in the future) + :param ops (list): A list of op.desc to init the task node. (Will be removed in the future) :param program (Program): An instance of Program to init the task node. :param lazy_initialize (bool): In user-defined task, the program may change adding feed/fetch op. As efficient consideration, the task node will have the C++ object later. """ diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 158938b76d0..e4bcaa59de0 100755 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -543,7 +543,7 @@ def which_distributed_mode(args): def launch(): """ Paddle distribution training entry ``python -m paddle.distributed.launch``. - + Usage: .. code-block:: bash :name: code-block-bash1 @@ -553,7 +553,7 @@ def launch(): [--worker_num WORKER_NUM] [--server_num SERVER_NUM] [--heter_worker_num HETER_WORKER_NUM] [--http_port HTTP_PORT] [--elastic_server ELASTIC_SERVER] [--job_id JOB_ID] [--np NP] [--scale SCALE] [--host HOST] [--force FORCE] - training_script ... + training_script ... Base Parameters: @@ -566,9 +566,9 @@ def launch(): - ``--gpus``: It's for gpu training. e.g., ``--gpus=0,1,2,3`` will launch four training processes each bound to one gpu. - ``--selected_gpus``: gpus aliases, recommend to use ``--gpus``. - + - ``--xpus``: It's for xpu training if xpu is available. e.g., ``--xpus=0,1,2,3``. - + - ``--selected_xpus``: xpus aliases, recommend to use ``--xpus``. - ``--mlus``: It's for mlu training. e.g., ``--mlus=0,1,2,3`` will launch four training processes each bound to one mlu. @@ -594,7 +594,7 @@ def launch(): - ``--server_num``: Number of servers (It recommend to set when in the emulated distributed environment using single node) - ``--heter_worker_num``: Number of heter_workers in each stage (It recommend to set when in the emulated distributed environment using single node) - + - ``--heter_devices``: Type of heter_device in each stage - ``--http_port``: Gloo http Port @@ -615,18 +615,18 @@ def launch(): Examples 1 (collective, single node): .. code-block:: bash :name: code-block-example-bash1 - + # For training on single node using 4 gpus. python -m paddle.distributed.launch --gpus=0,1,2,3 train.py --lr=0.01 - + Examples 2 (collective, multi node): .. 
code-block:: bash :name: code-block-example-bash2 # The parameters of --gpus and --ips must be consistent in each node. - # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 + # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 # On 192.168.0.16: @@ -634,15 +634,15 @@ def launch(): # On 192.168.0.17: python -m paddle.distributed.launch --gpus=0,1,2,3 --ips=192.168.0.16,192.168.0.17 train.py --lr=0.01 - + Examples 3 (ps, cpu, single node): .. code-block:: bash :name: code-block-example-bash3 # To simulate distributed environment using single node, e.g., 2 servers and 4 workers. - + python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01 - + Examples 4 (ps, cpu, multi node): .. code-block:: bash :name: code-block-example-bash4 @@ -662,10 +662,10 @@ def launch(): :name: code-block-example-bash5 # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, each worker use single gpu. - + export CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01 - + Examples 6 (ps, gpu, multi node): .. code-block:: bash :name: code-block-example-bash6 @@ -687,10 +687,10 @@ def launch(): :name: code-block-example-bash7 # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, two workers use gpu, two workers use cpu. - + export CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01 - + Examples 8 (ps-heter, cpu + gpu, multi node): .. code-block:: bash :name: code-block-example-bash8 @@ -712,7 +712,7 @@ def launch(): :name: code-block-example-bash9 python -m paddle.distributed.launch --elastic_server=127.0.0.1:2379 --np=2 --job_id=job1 --gpus=0,1,2,3 train.py - + """ args = _parse_args() diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index 8a6ec33b39b..69090c31767 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -27,7 +27,7 @@ def _is_trainable(param): class DygraphShardingOptimizer(object): """ - A wrapper for Sharding Optimizer in Dygraph. + A wrapper for Sharding Optimizer in Dygraph. .. warning: DygraphShardingOptimizer is experimental and subject to change. @@ -88,7 +88,7 @@ class DygraphShardingOptimizer(object): Partitions parameters among sharding ranks. Return: - Dict[int, List] + Dict[int, List] """ # TODO(JZ-LIANG) support multiple partition methods # method1: greedy even but unorder @@ -113,7 +113,7 @@ class DygraphShardingOptimizer(object): mapping parameters to the shard which holds it. 
Return: - Dict[str, int] + Dict[str, int] """ mapping = {} for rank, params in self._rank2params.items(): diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index 3359e63b1de..6756892be18 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -49,7 +49,7 @@ align = { class ShardingOptimizerStage2(Optimizer): """ - A wrapper for Sharding Stage2 Optimizer in Dygraph. + A wrapper for Sharding Stage2 Optimizer in Dygraph. .. warning: ShardingOptimizer encapsulates the optimization strategy and integrates it into the optimizer. diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 39f71be0cde..2db046a0e26 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -816,7 +816,7 @@ def insert_scale_loss_grad_ops(block, scale=1.0): def comm_analyse(main_program): """ - Analyse the parameter size that need to be broadcast/allreduce during sharding training + Analyse the parameter size that need to be broadcast/allreduce during sharding training """ reduce_vars = {} broadcast_vars = {} @@ -858,7 +858,7 @@ def comm_analyse(main_program): def add_sync_comm(program, sharding_ring_id): """ - When clone a test prog by clone from the sharding main prog, + When clone a test prog by clone from the sharding main prog, part of the sync_comm op maybe be pruned by mistake, this function add the sync_comm op for the test prog. diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index fcecc3a9a67..68b2021b6ea 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -961,7 +961,7 @@ class ShardingOptimizer(MetaOptimizerBase): 2. prune cast_fp32_to_fp16; update amp_infine_checking 3. prune gradient_clip related; update global_norm_sum 4. 
prune optimizer op + param + gradient - + """ weightdecay_helper = WeightDecayHelper() weightdecay_helper.prune_weight_decay(block, shard) @@ -1066,7 +1066,7 @@ class ShardingOptimizer(MetaOptimizerBase): add broadcast allreduce op if enable gradient_merge, insert related ops - if combined with pipeline(grad accumulate), + if combined with pipeline(grad accumulate), the grad allreduce should be done in optimize role """ if len(self._segments) < 1: @@ -1302,7 +1302,7 @@ class ShardingOptimizer(MetaOptimizerBase): pp: 4 pp-pair: >= 20 if one parallelism is not enable: -1 - and only support parallelism hierarchy: mp --> sharding --> pp --> dp + and only support parallelism hierarchy: mp --> sharding --> pp --> dp """ # step 1: initialize nccl self.global_word_size = self.role_maker._worker_num() @@ -1688,7 +1688,7 @@ class ShardingOptimizer(MetaOptimizerBase): grad@gradientmerge / acc_step re-create all optimize ops of origin main block and rename them cast(backward) - amp + amp clip opt # fill constant grad@gradientmerge diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index 21770fff656..f6878ec1d86 100755 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -198,11 +198,11 @@ class PipelineLayer(Layer): """PipelineLayer Args: layers(Iterable): A sequence of layers description to define the structure for pipeline. - num_stages(int, optional): pp degree, if not specified, 'topology' parameter must be given. + num_stages(int, optional): pp degree, if not specified, 'topology' parameter must be given. topology(CommunicateTopology, optional): topo of hybrid parallel, if it is None, 'num_stages' parameters must be given. loss_fn(callable, optional): Loss function. seg_method(str, optional): the method of splitting pp layer, default 'uniform', or use specific layer to split, method's name must be start with 'layer:'. - recompute_interval(int, optional): the number of layers to be used recompute, the value of 0 represents no recompute. default 0. + recompute_interval(int, optional): the number of layers to be used recompute, the value of 0 represents no recompute. default 0. recompute_ctx(dict,optional): the context of recompute, when 'recompute_interval' > 0, the context must be given. num_virtual_pipeline_stages(int, optional): the num of virtual pipeline stages for interleave pp. 
Examples: @@ -212,7 +212,7 @@ class PipelineLayer(Layer): from paddle.fluid.dygraph.layers import Layer import paddle.nn.functional as F from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer - + pipeline_parallel_size = 2 strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { @@ -224,19 +224,19 @@ class PipelineLayer(Layer): "accumulate_steps": 4, "micro_batch_size": 2 } - + fleet.init(is_collective=True, strategy=strategy) - + hcg = fleet.get_hybrid_communicate_group() - + class ReshapeHelp(Layer): def __init__(self, shape): super(ReshapeHelp, self).__init__() self.shape = shape - + def forward(self, x): return x.reshape(shape=self.shape) - + class AlexNetPipeDesc(PipelineLayer): def __init__(self, num_classes=10, **kwargs): self.num_classes = num_classes @@ -268,7 +268,7 @@ class PipelineLayer(Layer): ] super(AlexNetPipeDesc, self).__init__( layers=decs, loss_fn=nn.CrossEntropyLoss(), **kwargs) - + model = AlexNetPipeDesc(num_stages=pipeline_parallel_size, topology=hcg._topo) """ diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index bb774b8a0e5..8ec7f0f037b 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -107,7 +107,7 @@ def _initialize_recompute_hcg(hcg): def _all_gather(tensor, group=None, use_calc_stream=True): """ - The main difference with paddle.distributed.all_gather: + The main difference with paddle.distributed.all_gather: no need to pass in tensor_list, the returned tensor is spliced """ if group is not None and not group.is_member(): diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py index 7bdbe2ce32e..2c8cd18e1bf 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py @@ -47,7 +47,7 @@ align = { class GroupShardedOptimizerStage2(Optimizer): """ - A wrapper for Sharding Stage2 Optimizer in Dygraph. + A wrapper for Sharding Stage2 Optimizer in Dygraph. .. warning: ShardingOptimizer encapsulates the optimization strategy and integrates it into the optimizer. diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py index 905af0487ba..6d9ce68e493 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py @@ -47,8 +47,8 @@ def _trainable(param): class GroupShardedStage2(nn.Layer): - """ - A wrapper for Sharding Stage2 Layer in Dygraph. + """ + A wrapper for Sharding Stage2 Layer in Dygraph. .. warning: GroupShardedStage2 encapsulates the layer strategy and integrates it into the nn.Layer. .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf. 
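The GroupSharded stage-2/3 wrappers whose docstrings are trimmed above are normally reached through the public ``group_sharded_parallel`` helper rather than instantiated directly; a minimal dygraph sketch, assuming a multi-GPU launch and a hypothetical script name:

.. code-block:: python

    # Illustrative sketch: run with e.g.
    # `python -m paddle.distributed.launch --gpus=0,1 sharding_demo.py`
    import paddle
    import paddle.distributed as dist
    from paddle.distributed.sharding import group_sharded_parallel

    dist.init_parallel_env()

    model = paddle.nn.Linear(1000, 1000)
    opt = paddle.optimizer.AdamW(learning_rate=0.001,
                                 parameters=model.parameters())

    # level="os_g" shards optimizer state and gradients (stage 2);
    # "p_g_os" additionally shards the parameters (stage 3).
    model, opt, scaler = group_sharded_parallel(model, opt, level="os_g")

    x = paddle.randn([4, 1000])
    loss = model(x).mean()
    loss.backward()
    opt.step()
    opt.clear_grad()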
""" diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index abc5e0549ae..5898c352a90 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -33,7 +33,7 @@ from .group_sharded_utils import Type, GroupShardedClipGrad, device_guard def _all_gather(tensor, buffer_size, group): """ - The main difference with paddle.distributed.all_gather: + The main difference with paddle.distributed.all_gather: no need to pass in tensor_list, the returned tensor is spliced """ @@ -58,8 +58,8 @@ CHECK_LAYER = dict() # Help to check layer's id -> layer's name class GroupShardedStage3(nn.Layer): - """ - A wrapper for Sharding Stage3 Layer in Dygraph. + """ + A wrapper for Sharding Stage3 Layer in Dygraph. .. warning: GroupShardedStage3 encapsulates the layer strategy and integrates it into the nn.Layer. diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index 7834e6d9398..4157edc57bc 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -48,8 +48,8 @@ def _trainable(param): class ShardingStage2(nn.Layer): - """ - A wrapper for Sharding Stage2 Layer in Dygraph. + """ + A wrapper for Sharding Stage2 Layer in Dygraph. .. warning: ShardingStage2 encapsulates the layer strategy and integrates it into the nn.Layer. .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf. """ diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index 67d48c8abba..69fe91677ee 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -50,8 +50,8 @@ CHECK_LAYER = dict() # Help to check layer's id -> layer's name class ShardingStage3(nn.Layer): - """ - A wrapper for Sharding Stage3 Layer in Dygraph. + """ + A wrapper for Sharding Stage3 Layer in Dygraph. .. warning: ShardingStage3 encapsulates the layer strategy and integrates it into the nn.Layer. diff --git a/python/paddle/distributed/fleet/metrics/metric.py b/python/paddle/distributed/fleet/metrics/metric.py index d2050585df7..6aa3793cd65 100644 --- a/python/paddle/distributed/fleet/metrics/metric.py +++ b/python/paddle/distributed/fleet/metrics/metric.py @@ -41,7 +41,7 @@ def sum(input, scope=None, util=None): global_cnt = fluid.layers.create_global_var(persistable=True, dtype='float32', shape=[1], value=0) tmp = fluid.layers.elementwise_add(cnt, global_cnt) fluid.layers.assign(tmp, global_cnt) - + # in train.py, after train or infer res = np.array(scope.find_var(global_cnt.name).get_tensor()) print("sum array: ", paddle.distributed.fleet.sum(res)) diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index 95635154c33..7ea639d70e6 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -131,14 +131,14 @@ class LocalFS(FS): """ def ls_dir(self, fs_path): - """ + """ List directorys and files under `fs_path` . Args: fs_path(str): The local file path. 
Returns: - Tuple: Return a 2-tuple, the first is a list of all its subdirectories, + Tuple: Return a 2-tuple, the first is a list of all its subdirectories, and the second is a list of all its subfiles, e.g. ([subdirname1, subdirname1, ...], [filename1, filename2, ...]). Examples: @@ -290,7 +290,7 @@ class LocalFS(FS): fs_path(str): The local file path. Returns: - Bool: Wheter it's a file or directory, return true if the path exists, + Bool: Wheter it's a file or directory, return true if the path exists, otherwise return false. Examples: @@ -359,7 +359,7 @@ class LocalFS(FS): return self.rename(src_path, dst_path) def list_dirs(self, fs_path): - """ + """ Only list directorys under `fs_path` . Args: @@ -430,7 +430,7 @@ class HDFSClient(FS): A tool of HDFS. Args: - hadoop_home(str): Hadoop home. + hadoop_home(str): Hadoop home. configs(dict): Hadoop config. It is a dictionary and needs to contain the keys: "fs.default.name" and "hadoop.job.ugi". @@ -491,7 +491,7 @@ class HDFSClient(FS): @_handle_errors() def list_dirs(self, fs_path): - """ + """ Only list directorys under `fs_path` . Args: @@ -523,14 +523,14 @@ class HDFSClient(FS): @_handle_errors() def ls_dir(self, fs_path): - """ + """ List directorys and files under `fs_path` . Args: fs_path(str): The HDFS file path. Returns: - Tuple: Return a 2-tuple, the first element is the list of all its subdirectories, + Tuple: Return a 2-tuple, the first element is the list of all its subdirectories, and the second one is the list of all its subfiles, e.g. ([subdirname1, subdirname1, ...], [filename1, filename2, ...]). Examples: @@ -923,7 +923,7 @@ class HDFSClient(FS): fs_src_path(str): Name of the file or directory, that's needed to be moved. fs_dst_path(str): Name of the file or directory to which to move to. overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False. - test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Excetption. + test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Excetption. Examples: @@ -1174,7 +1174,7 @@ class AFSClient(FS): self._fs.init(fs_name, fs_user, fs_passwd, fs_conf) def list_dirs(self, fs_path): - """ + """ Only list directorys under `fs_path` . Args: @@ -1200,14 +1200,14 @@ class AFSClient(FS): return dirs def ls_dir(self, fs_path): - """ + """ List directorys and files under `fs_path` . Args: fs_path(str): The HDFS file path. Returns: - Tuple: Return a 2-tuple, the first element is the list of all its subdirectories, + Tuple: Return a 2-tuple, the first element is the list of all its subdirectories, and the second one is the list of all its subfiles, e.g. ([subdirname1, subdirname1, ...], [filename1, filename2, ...]). Examples: @@ -1438,7 +1438,7 @@ class AFSClient(FS): fs_src_path(str): Name of the file or directory, that's needed to be moved. fs_dst_path(str): Name of the file or directory to which to move to. overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False. - test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Excetption. + test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . 
When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Excetption. Examples: diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py index e6b581464fa..7e9152f92a3 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py @@ -23,7 +23,7 @@ import numpy as np class HybridParallelInferenceHelper(object): """ A helper class to split program for inference with hybrid parallelism. - + Args: startup_program (Program): the startup program. main_program (Program): the main program. @@ -34,15 +34,15 @@ class HybridParallelInferenceHelper(object): init_comm (bool): wheter if initilize comminication group. Default ``True``. role_maker (RoleMakerBase or subclass): user custom define RoleMakerBase. If ``role_maker==None``, then use PaddleCloudRoleMaker. Default ``None``. - + Returns: None. - + Write Paradigm: - + .. code-block:: bash :name: bash-example1 - + # while op pattern with paddle.fluid.device_guard(f'{device}:all'): # init global cond @@ -51,10 +51,10 @@ class HybridParallelInferenceHelper(object): cond_int = layers.fill_constant(shape=[1], dtype="int64", value=0, force_cpu=False, name="cond_int") cond = layers.cast(step_idx < max_len, dtype="bool") while_op = layers.While(cond, is_test=True) - + # init global lod_tensor_array for generation task arr = layers.array_write(data, step_idx) - + with while_op.block(): with paddle.fluid.device_guard(f'{device}:all'): # read data from global lod_tensor_array @@ -63,36 +63,36 @@ class HybridParallelInferenceHelper(object): # it need for send_v2 of lod_tensor_array layers.increment(x=step_idx, value=1.0, in_place=True) layers.array_write(element_in_arr, i=step_idx, array=arr) - + with paddle.fluid.device_guard(f'{device}:0'): ... some code - + with paddle.fluid.device_guard(f'{device}:1'): ... some code - + with paddle.fluid.device_guard(f'{device}:{num_pp-1}'): # generate some data in while block and write to global lod_tensor_array # that they are read in next while step. # we will using send_v2 to send global lod_tensor_array to other pipeline and sync layers.array_write(other_var, i=step_idx, array=arr) - + # update cond and assign to cond_int, we will sync cond_int layers.assign(layers.cast(cond, dtype="int32"), cond_int) - + with paddle.fluid.device_guard(f'{model._device}:all'): # the code below must at end of while block and exists in device:all layers.assign(layers.cast(cond_int, dtype='bool'), cond) - + with paddle.fluid.device_guard(f'{model._device}:all'): # use a empty lod_tensor_array to clear lod_tensor_array layers.assign(layers.create_array(data.dtype), arr) - - + + Examples: - + .. 
code-block:: python :name: code-example1 - + # required: distributed import os import numpy as np @@ -172,7 +172,7 @@ class HybridParallelInferenceHelper(object): exe = paddle.static.Executor(paddle.CUDAPlace(dev_id)) exe.run(startup_program) - + np.random.seed(2333) for step in range(5): init_data = np.random.uniform(low=0.0, high=1.0, size=[2, 2]).astype('float32') @@ -358,7 +358,7 @@ class HybridParallelInferenceHelper(object): Args: stage (int): pipeline stage block_idx (int): block index - + Returns: used_var_names (set): used var names in block_idx block """ @@ -445,9 +445,9 @@ class HybridParallelInferenceHelper(object): def _add_op_device_attr(self, block): """ - Add op_device attrribute for ops in block that have + Add op_device attrribute for ops in block that have not that attribute set. - + Args: block (Block): the block to process. """ @@ -474,7 +474,7 @@ class HybridParallelInferenceHelper(object): def _check_validation(self, block): """ - Check whether ops in a block have both the op_device and the + Check whether ops in a block have both the op_device and the op_role attributes set. """ assert isinstance(block, Block) @@ -729,7 +729,7 @@ class HybridParallelInferenceHelper(object): """ Generate inference program. Params: - sync_in_while_lastpp2firstpp_var_names (list(str)): the vars in the last pipeline + sync_in_while_lastpp2firstpp_var_names (list(str)): the vars in the last pipeline that need to send var to first pipeline and exclude bool dtype var sync_in_while_var_names (list(str)): the vars sync among all pipeline in while block e.g cond. Note that cond cannot be bool dtype. diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index f0c74159488..2dddb1d9fb4 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -352,13 +352,13 @@ def recompute(function, *args, **kwargs): recompute intermediate activations to save then memory. Parameters: - function(paddle.nn.Sequential): layer of sequence of layers that describes part of forward pass of the model - whose intermediate activations will be released to save memory in forward stage and will be recomputed - in backward stage for gradient calculation. - *args(Tensor): inputs to the function. - **kwargs(Dict): Kwargs should only contain the key-value pair of preserve_rng_state, which is used to - indicate whether to save the forward rng. If it is True, then the last forward rng value will be - restored when the forward recalculation of backpropagation is performed. The default + function(paddle.nn.Sequential): layer of sequence of layers that describes part of forward pass of the model + whose intermediate activations will be released to save memory in forward stage and will be recomputed + in backward stage for gradient calculation. + *args(Tensor): inputs to the function. + **kwargs(Dict): Kwargs should only contain the key-value pair of preserve_rng_state, which is used to + indicate whether to save the forward rng. If it is True, then the last forward rng value will be + restored when the forward recalculation of backpropagation is performed. The default preserve_rng_state is True. 
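A minimal usage sketch of the ``recompute`` API documented in this hunk, assuming a toy ``paddle.nn.Sequential`` block; the call signature simply follows the parameter list above:

    import paddle
    from paddle.distributed.fleet.utils import recompute

    # A toy block whose intermediate activations are recomputed in the backward pass.
    block = paddle.nn.Sequential(paddle.nn.Linear(10, 10), paddle.nn.ReLU())
    x = paddle.randn([4, 10])
    x.stop_gradient = False
    # preserve_rng_state=True restores the forward RNG state during recomputation.
    y = recompute(block, x, preserve_rng_state=True)
    y.sum().backward()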
Returns: diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index fccb352c2a3..e37ae141f3f 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -18,7 +18,7 @@ from .context import Context def launch(): """ Paddle distribution training entry ``python -m paddle.distributed.launch``. - + Usage: .. code-block:: bash :name: code-block-bash1 @@ -77,7 +77,7 @@ def launch(): - ``--heter_workers``: User defined heter workers ip1:port1;ip2:port2, e.g., ``--heter_workers="192.168.0.16:6172;192.168.0.17:6172"`` - ``--heter_worker_num``: Number of heter_workers in each stage (It recommend to set when in the emulated distributed environment using single node) - + - ``--heter_devices``: Type of heter_device in each stage - ``--gloo_port``: Gloo http Port. Default ``--gloo_port=6767``. @@ -94,12 +94,12 @@ def launch(): IPU Parameters: IPU distributed launch only requires and allowes three arguments ``--devices``, ``training_script`` and ``training_script_args``. The ``--devices`` is the number of IPU devices. e.g., ``--devices=4`` will launch the training program with four IPU devices. - The ``training_script`` is only allowed to set as ``ipu``. + The ``training_script`` is only allowed to set as ``ipu``. The ``training_script_args`` includes arguments required by IPU distributed launch and illustrated as below. ``Examples 10`` has provided a example of paddle.distributed.launch with IPUs. - ``--hosts``: The hosts for IPU distributd training. Each host is able to include multiple processes. - + - ``--nproc_per_host``: The number of processes launched per host. Each process is able to include multiple replicas. - ``--ipus_per_replica``: The number of IPUs requested per replica. Each replica is able to include multiple IPUs. @@ -144,16 +144,16 @@ def launch(): Examples 1 (collective, single node): .. code-block:: bash :name: code-block-example-bash1 - + # For training on single node using 4 gpus. python -m paddle.distributed.launch --devices=0,1,2,3 train.py --lr=0.01 - + Examples 2 (collective, multi node): .. code-block:: bash :name: code-block-example-bash2 - # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 + # For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 # On 192.168.0.16: @@ -161,15 +161,15 @@ def launch(): # On 192.168.0.17: python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01 - + Examples 3 (ps, cpu, single node): .. code-block:: bash :name: code-block-example-bash3 # To simulate distributed environment using single node, e.g., 2 servers and 4 workers. - + python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01 - + Examples 4 (ps, cpu, multi node): .. code-block:: bash :name: code-block-example-bash4 @@ -194,10 +194,10 @@ def launch(): :name: code-block-example-bash5 # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, each worker use single gpu. - + export CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01 - + Examples 6 (ps, gpu, multi node): .. code-block:: bash :name: code-block-example-bash6 @@ -219,10 +219,10 @@ def launch(): :name: code-block-example-bash7 # To simulate distributed environment using single node, e.g., 2 servers and 4 workers, two workers use gpu, two workers use cpu. 
- + export CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01 - + Examples 8 (ps-heter, cpu + gpu, multi node): .. code-block:: bash :name: code-block-example-bash8 @@ -246,7 +246,7 @@ def launch(): # With the following command, the job will begin to run immediately if 4 nodes are ready, # or it will run after elastic_timeout if only 2 or 3 nodes ready python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:4 train.py - + # once the number of nodes changes between 2:4 during training, the strategy holds Examples 10 (ipu): diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py index cde6a8a97f0..7518eb8eaf6 100644 --- a/python/paddle/distributed/models/moe/utils.py +++ b/python/paddle/distributed/models/moe/utils.py @@ -60,18 +60,18 @@ def _number_count(numbers, upper_range): def _assign_pos(x, cum_count): """ - Assign pos decides which tokens should be fetched belong to + Assign pos decides which tokens should be fetched belong to specially expert orderingly. - + Args: x (Tensor): Tensor. Every element in the list must be a Tensor whose data type should be float16, float32, float64, int32 or int64. - cum_count (Tensor): The cumulative sum tokens of counters. Every element in the list must be a Tensor whose + cum_count (Tensor): The cumulative sum tokens of counters. Every element in the list must be a Tensor whose data type should be int64. - + Returns: - out (Tensor): Assemble numbers in the order of counters. - + out (Tensor): Assemble numbers in the order of counters. + Examples: .. code-block:: python @@ -185,10 +185,10 @@ def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker): gate_idx (Tensor): Represents the gate_id sequence corresponding to the input data with type int32, int64. expert_count (Tensor): The quantity value counted on the gate_id sequence of the input data with type int32, int64. n_worker(int,optional): The number of workers on the trainer with type int64. - + Returns: new_gate_idx (Tensor): The gate_id sequence corresponding to the new input data after passing through prune. - + Examples: .. code-block:: python diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 6cda451a266..6fdbf8627c9 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -105,7 +105,7 @@ def init_parallel_env(): Returns: None - + Examples: .. code-block:: python # required: gpu @@ -119,7 +119,7 @@ def init_parallel_env(): super(LinearNet, self).__init__() self._linear1 = nn.Linear(10, 10) self._linear2 = nn.Linear(10, 1) - + def forward(self, x): return self._linear2(self._linear1(x)) @@ -140,7 +140,7 @@ def init_parallel_env(): outputs = dp_layer(inputs) labels = paddle.randn([10, 1], 'float32') loss = loss_fn(outputs, labels) - + loss.backward() adam.step() diff --git a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py index ac923be9a1a..9495ffa22b0 100644 --- a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py +++ b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py @@ -41,7 +41,7 @@ def numel(var): class DataParallelOptimizationPass(PassBase): """ Apply Optimizations that specialized for data parallelism in Auto Parallel. - 1. prune grad scaling + 1. prune grad scaling 2. 
overlap comm and calc 3. fuse allreduce """ @@ -350,9 +350,9 @@ class DataParallelOptimizationPass(PassBase): """ conditions for gradients to be grouped: 1. group size < max_fuse_numel - 2. same dp group + 2. same dp group 3. same dtype - 4. dependency: grad would NOT be used by other ops within group segment + 4. dependency: grad would NOT be used by other ops within group segment gradients inside same group would be fuse into one coalesce tensor """ diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 07fd1d60043..89ff2019d73 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -126,7 +126,7 @@ class FP16State(object): def _build_state(self): """ - mark the execution mode (fp16 or fp32) for ops in all blocks + mark the execution mode (fp16 or fp32) for ops in all blocks include forward ops & backward ops """ # mark op dtype diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py index a9c83a98c19..0840c3c90fc 100644 --- a/python/paddle/distributed/passes/auto_parallel_recompute.py +++ b/python/paddle/distributed/passes/auto_parallel_recompute.py @@ -95,7 +95,7 @@ class RecomputeState(ProgramStats): def modify_forward_desc_for_recompute(self, dist_context): """ - If program's foward part has 'dropout' op, this function will insert + If program's foward part has 'dropout' op, this function will insert a seed op before it to guarantee that two dropout op have the same outputs. """ op_types = [op.desc.type() for op in self._ops] diff --git a/python/paddle/distributed/passes/pass_utils.py b/python/paddle/distributed/passes/pass_utils.py index 6e43930d2e1..e779048c56b 100644 --- a/python/paddle/distributed/passes/pass_utils.py +++ b/python/paddle/distributed/passes/pass_utils.py @@ -86,11 +86,11 @@ def prune_program(program, start_op_idx, end_op_idx): def split_program(program, op_indices): """ - Split the program by op_indices. + Split the program by op_indices. For examples, a program has 100 ops, and op_indices = [25, 60]. Then the program is splitted into 3 parts, containing 25, 35 and 40 - ops respectively. + ops respectively. The return values are a tuple with 3 elements: the splitted program list, the input var names of each splitted program, and the output diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index cdb377a72be..982d3f61e6e 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -1140,7 +1140,7 @@ class SplitTrainerOpsPass(PassBase): split cpu-trainer program from origin-program 1. find heter op (located on different device) 2. find input&output of every heter-block - 3. create cpu-trainer program, add send&recv op + 3. 
create cpu-trainer program, add send&recv op """ attrs = pass_ctx._attrs default_device_ = 'cpu' diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index 3b2310f1143..1ab43bd1edf 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -611,7 +611,7 @@ def find_heter_ops(program, default_device="cpu"): if no_grad_var in var2idx: """ insert sum op & remove sum op from var2idx and origin place - + """ op_list = list(block.ops) sum_op = op_list[var2idx[no_grad_var]] @@ -1335,7 +1335,7 @@ def build_var_distributed(context): context["param_name_to_grad_name"] = param_name_to_grad_name context["grad_name_to_param_name"] = grad_name_to_param_name - ''' + ''' print("public build_var_distributed origin_sparse_pairs:", context["origin_sparse_pairs"]) print("public build_var_distributed origin_for_dense:", diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 9ebe7fd6031..9cb17949acb 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -62,12 +62,12 @@ def group_sharded_parallel(model, buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. The larger the size, the more GPU memory will be used. Defaults to 2**23, which means that the dimension of the buffer is 2**23. segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20, indicating that the dimension of the minimum segmented parameter is 2**20. sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False, indicating that asynchronous communication is used. - + Returns: model: A wrapper for group sharded given model. optimizer: A wrapper for group sharded given optimizer. scaler: A wrapper for group sharded given scaler. - + Examples: .. code-block:: python @@ -184,7 +184,7 @@ def save_group_sharded_model(model, output, optimizer=None): model (Layer): A wrapper for group sharded given model. output (str): Save directory. optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None, indicating that the optimizer state is not saved. - + Examples: .. code-block:: python diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 6d8454a6e9e..8177c827c45 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -60,10 +60,10 @@ def global_scatter(x, group=None, use_calc_stream=True): """ - The global_scatter operator distributes the data of x to n_expert * world_size experts according to local_count, - and then receives data according to global_count. The expert refers to a user-defined expert network, + The global_scatter operator distributes the data of x to n_expert * world_size experts according to local_count, + and then receives data according to global_count. The expert refers to a user-defined expert network, n_expert refers to the number of expert networks owned by each card, and world_size refers to the number of graphics cards running the network. - + As shown below, the value of the world size is 2, n_expert 2, the batch size of the x 4 and local_count is [2, 0, 2, 0]. The global_count of the rank 0 is [2, 0, , ], rank 1 is [2, 0, ,](Due to the limited space, only the data calculated on rank 0 is shown here). 
In the global_scatter operator, local_count[i] represents sending local_count[i] data to the (i % n_expert)th expert of the (i // n_expert)th card, @@ -101,10 +101,10 @@ def global_scatter(x, how many data needed to be received. The tensor data type should be int64. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream. Default: True. - + Returns: - out (Tensor): The data received from all experts. - + out (Tensor): The data received from all experts. + Examples: .. code-block:: python @@ -120,7 +120,7 @@ def global_scatter(x, local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]], \ dtype=np.float32) if paddle.distributed.ParallelEnv().local_rank == 0: - local_count = np.array([2, 1, 1, 1]) + local_count = np.array([2, 1, 1, 1]) global_count = np.array([2, 1, 1, 1]) else: local_count = np.array([1, 1, 2, 1]) @@ -195,11 +195,11 @@ def global_gather(x, The process of global_gather sending data is as follows: The global_count[0] of the 0th card represents sending 2 data to the 0th expert of the 0th card; - + The global_count[1] of the 0th card represents sending 0 data to the 1th expert of the 0th card; - + The global_count[0] of the 1th card represents sending 2 data to the 0th expert of the 0th card; - + The global_count[1] of the 1th card represents sending 0 data to the 1th expert of the 0th card. .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/global_scatter_gather.png @@ -216,10 +216,10 @@ def global_gather(x, how many data needed to be sent. Tensor data type should be int64. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream. Default: True. - + Returns: - out (Tensor): The data received from all experts. - + out (Tensor): The data received from all experts. + Examples: .. code-block:: python diff --git a/python/paddle/distribution/beta.py b/python/paddle/distribution/beta.py index e371b56eb66..3474ee87b4c 100644 --- a/python/paddle/distribution/beta.py +++ b/python/paddle/distribution/beta.py @@ -21,11 +21,11 @@ class Beta(exponential_family.ExponentialFamily): r""" Beta distribution parameterized by alpha and beta. - In probability theory and statistics, the beta distribution is a family of - continuous probability distributions defined on the interval [0, 1] - parameterized by two positive shape parameters, denoted by alpha and beta, - that appear as exponents of the random variable and control the shape of - the distribution. The generalization to multiple variables is called a + In probability theory and statistics, the beta distribution is a family of + continuous probability distributions defined on the interval [0, 1] + parameterized by two positive shape parameters, denoted by alpha and beta, + that appear as exponents of the random variable and control the shape of + the distribution. The generalization to multiple variables is called a Dirichlet distribution. The probability density function (pdf) is @@ -38,18 +38,18 @@ class Beta(exponential_family.ExponentialFamily): .. math:: - B(\alpha, \beta) = \int_{0}^{1} t^{\alpha - 1} (1-t)^{\beta - 1}\mathrm{d}t + B(\alpha, \beta) = \int_{0}^{1} t^{\alpha - 1} (1-t)^{\beta - 1}\mathrm{d}t Args: - alpha (float|Tensor): Alpha parameter. 
It supports broadcast semantics. - The value of alpha must be positive. When the parameter is a tensor, - it represents multiple independent distribution with + alpha (float|Tensor): Alpha parameter. It supports broadcast semantics. + The value of alpha must be positive. When the parameter is a tensor, + it represents multiple independent distribution with + a batch_shape(refer to ``Distribution`` ). + beta (float|Tensor): Beta parameter. It supports broadcast semantics. + The value of beta must be positive(>0). When the parameter is tensor, + it represent multiple independent distribution with a batch_shape(refer to ``Distribution`` ). - beta (float|Tensor): Beta parameter. It supports broadcast semantics. - The value of beta must be positive(>0). When the parameter is tensor, - it represent multiple independent distribution with - a batch_shape(refer to ``Distribution`` ). Examples: @@ -114,7 +114,7 @@ class Beta(exponential_family.ExponentialFamily): Args: value (Tensor): Value to be evaluated. - + Returns: Tensor: Probability. """ @@ -125,7 +125,7 @@ class Beta(exponential_family.ExponentialFamily): Args: value (Tensor): Value to be evaluated - + Returns: Tensor: Log probability. """ diff --git a/python/paddle/distribution/categorical.py b/python/paddle/distribution/categorical.py index cd44277f3e8..7728f063c04 100644 --- a/python/paddle/distribution/categorical.py +++ b/python/paddle/distribution/categorical.py @@ -31,9 +31,9 @@ from paddle.tensor import arange, concat, gather_nd, multinomial class Categorical(distribution.Distribution): r""" - Categorical distribution is a discrete probability distribution that - describes the possible results of a random variable that can take on - one of K possible categories, with the probability of each category + Categorical distribution is a discrete probability distribution that + describes the possible results of a random variable that can take on + one of K possible categories, with the probability of each category separately specified. The probability mass function (pmf) is: @@ -267,9 +267,9 @@ class Categorical(distribution.Distribution): def probs(self, value): """Probabilities of the given category (``value``). - If ``logits`` is 2-D or higher dimension, the last dimension will be regarded as + If ``logits`` is 2-D or higher dimension, the last dimension will be regarded as category, and the others represents the different distributions. - At the same time, if ``vlaue`` is 1-D Tensor, ``value`` will be broadcast to the + At the same time, if ``vlaue`` is 1-D Tensor, ``value`` will be broadcast to the same number of distributions as ``logits``. If ``value`` is not 1-D Tensor, ``value`` should have the same number distributions with ``logits. That is, ``value[:-1] = logits[:-1]``. diff --git a/python/paddle/distribution/dirichlet.py b/python/paddle/distribution/dirichlet.py index 6862bf30e06..7a8e9180968 100644 --- a/python/paddle/distribution/dirichlet.py +++ b/python/paddle/distribution/dirichlet.py @@ -23,32 +23,32 @@ class Dirichlet(exponential_family.ExponentialFamily): r""" Dirichlet distribution with parameter "concentration". - The Dirichlet distribution is defined over the `(k-1)-simplex` using a + The Dirichlet distribution is defined over the `(k-1)-simplex` using a positive, lenght-k vector concentration(`k > 1`). The Dirichlet is identically the Beta distribution when `k = 2`. 
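A quick numeric sketch of the statement above that a two-component Dirichlet coincides with the Beta distribution, assuming ``Beta`` and ``Dirichlet`` are importable from ``paddle.distribution`` as documented in these files:

    import paddle
    from paddle.distribution import Beta, Dirichlet

    d = Dirichlet(paddle.to_tensor([2.0, 5.0]))
    b = Beta(paddle.to_tensor([2.0]), paddle.to_tensor([5.0]))
    # Densities should agree: Dirichlet([a, b]) at (x, 1 - x) equals Beta(a, b) at x.
    print(d.prob(paddle.to_tensor([0.3, 0.7])))
    print(b.prob(paddle.to_tensor([0.3])))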
- For independent and identically distributed continuous random variable - :math:`\boldsymbol X \in R_k` , and support - :math:`\boldsymbol X \in (0,1), ||\boldsymbol X|| = 1` , + For independent and identically distributed continuous random variable + :math:`\boldsymbol X \in R_k` , and support + :math:`\boldsymbol X \in (0,1), ||\boldsymbol X|| = 1` , The probability density function (pdf) is .. math:: - - f(\boldsymbol X; \boldsymbol \alpha) = \frac{1}{B(\boldsymbol \alpha)} \prod_{i=1}^{k}x_i^{\alpha_i-1} - where :math:`\boldsymbol \alpha = {\alpha_1,...,\alpha_k}, k \ge 2` is + f(\boldsymbol X; \boldsymbol \alpha) = \frac{1}{B(\boldsymbol \alpha)} \prod_{i=1}^{k}x_i^{\alpha_i-1} + + where :math:`\boldsymbol \alpha = {\alpha_1,...,\alpha_k}, k \ge 2` is parameter, the normalizing constant is the multivariate beta function. .. math:: B(\boldsymbol \alpha) = \frac{\prod_{i=1}^{k} \Gamma(\alpha_i)}{\Gamma(\alpha_0)} - :math:`\alpha_0=\sum_{i=1}^{k} \alpha_i` is the sum of parameters, + :math:`\alpha_0=\sum_{i=1}^{k} \alpha_i` is the sum of parameters, :math:`\Gamma(\alpha)` is gamma function. Args: - concentration (Tensor): "Concentration" parameter of dirichlet - distribution, also called :math:`\alpha`. When it's over one + concentration (Tensor): "Concentration" parameter of dirichlet + distribution, also called :math:`\alpha`. When it's over one dimension, the last axis denotes the parameter of distribution, ``event_shape=concentration.shape[-1:]`` , axes other than last are condsider batch dimensions with ``batch_shape=concentration.shape[:-1]`` . diff --git a/python/paddle/distribution/distribution.py b/python/paddle/distribution/distribution.py index 8c5843521b0..c7695059fc0 100644 --- a/python/paddle/distribution/distribution.py +++ b/python/paddle/distribution/distribution.py @@ -39,15 +39,15 @@ from paddle.tensor import arange, concat, gather_nd, multinomial class Distribution(object): """ - The abstract base class for probability distributions. Functions are + The abstract base class for probability distributions. Functions are implemented in specific distributions. Args: - batch_shape(Sequence[int], optional): independent, not identically + batch_shape(Sequence[int], optional): independent, not identically distributed draws, aka a "collection" or "bunch" of distributions. - event_shape(Sequence[int], optional): the shape of a single - draw from the distribution; it may be dependent across dimensions. - For scalar distributions, the event shape is []. For n-dimension + event_shape(Sequence[int], optional): the shape of a single + draw from the distribution; it may be dependent across dimensions. + For scalar distributions, the event shape is []. For n-dimension multivariate distribution, the event shape is [n]. """ @@ -118,16 +118,16 @@ class Distribution(object): def probs(self, value): """Probability density/mass function. - - .. note:: - - This method will be deprecated in the future, please use `prob` + + .. note:: + + This method will be deprecated in the future, please use `prob` instead. """ raise NotImplementedError def _extend_shape(self, sample_shape): - """compute shape of the sample + """compute shape of the sample Args: sample_shape (Tensor): sample shape @@ -239,9 +239,9 @@ class Distribution(object): def _probs_to_logits(self, probs, is_binary=False): r""" - Converts probabilities into logits. For the binary, probs denotes the - probability of occurrence of the event indexed by `1`. 
For the - multi-dimensional, values of last axis denote the probabilities of + Converts probabilities into logits. For the binary, probs denotes the + probability of occurrence of the event indexed by `1`. For the + multi-dimensional, values of last axis denote the probabilities of occurrence of each of the events. """ return (paddle.log(probs) - paddle.log1p(-probs)) \ @@ -249,8 +249,8 @@ class Distribution(object): def _logits_to_probs(self, logits, is_binary=False): r""" - Converts logits into probabilities. For the binary, each value denotes - log odds, whereas for the multi-dimensional case, the values along the + Converts logits into probabilities. For the binary, each value denotes + log odds, whereas for the multi-dimensional case, the values along the last dimension denote the log probabilities of the events. """ return paddle.nn.functional.sigmoid(logits) \ diff --git a/python/paddle/distribution/exponential_family.py b/python/paddle/distribution/exponential_family.py index b78e7749704..63019667fff 100644 --- a/python/paddle/distribution/exponential_family.py +++ b/python/paddle/distribution/exponential_family.py @@ -18,19 +18,19 @@ from paddle.fluid.framework import _non_static_mode, in_dygraph_mode class ExponentialFamily(distribution.Distribution): - r""" - ExponentialFamily is the base class for probability distributions belonging - to exponential family, whose probability mass/density function has the + r""" + ExponentialFamily is the base class for probability distributions belonging + to exponential family, whose probability mass/density function has the form is defined below ExponentialFamily is derived from `paddle.distribution.Distribution`. - + .. math:: f_{F}(x; \theta) = \exp(\langle t(x), \theta\rangle - F(\theta) + k(x)) - - where :math:`\theta` denotes the natural parameters, :math:`t(x)` denotes - the sufficient statistic, :math:`F(\theta)` is the log normalizer function + + where :math:`\theta` denotes the natural parameters, :math:`t(x)` denotes + the sufficient statistic, :math:`F(\theta)` is the log normalizer function for a given family and :math:`k(x)` is the carrier measure. Distribution belongs to exponential family referring to https://en.wikipedia.org/wiki/Exponential_family @@ -48,7 +48,7 @@ class ExponentialFamily(distribution.Distribution): raise NotImplementedError def entropy(self): - """caculate entropy use `bregman divergence` + """caculate entropy use `bregman divergence` https://www.lix.polytechnique.fr/~nielsen/EntropyEF-ICIP2010.pdf """ entropy_value = -self._mean_carrier_measure diff --git a/python/paddle/distribution/independent.py b/python/paddle/distribution/independent.py index 884c34b4b6a..9f02d802fc8 100644 --- a/python/paddle/distribution/independent.py +++ b/python/paddle/distribution/independent.py @@ -20,17 +20,17 @@ class Independent(distribution.Distribution): Reinterprets some of the batch dimensions of a distribution as event dimensions. This is mainly useful for changing the shape of the result of - :meth:`log_prob`. + :meth:`log_prob`. Args: base (Distribution): The base distribution. - reinterpreted_batch_rank (int): The number of batch dimensions to + reinterpreted_batch_rank (int): The number of batch dimensions to reinterpret as event dimensions. Examples: .. 
code-block:: python - + import paddle from paddle.distribution import independent diff --git a/python/paddle/distribution/kl.py b/python/paddle/distribution/kl.py index c5ad3f04358..6a0838e588c 100644 --- a/python/paddle/distribution/kl.py +++ b/python/paddle/distribution/kl.py @@ -35,7 +35,7 @@ def kl_divergence(p, q): .. math:: - KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x + KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x Args: p (Distribution): ``Distribution`` object. @@ -64,11 +64,11 @@ def kl_divergence(p, q): def register_kl(cls_p, cls_q): """Decorator for register a KL divergence implemention function. - The ``kl_divergence(p, q)`` function will search concrete implemention - functions registered by ``register_kl``, according to multi-dispatch pattern. - If an implemention function is found, it will return the result, otherwise, - it will raise ``NotImplementError`` exception. Users can register - implemention funciton by the decorator. + The ``kl_divergence(p, q)`` function will search concrete implemention + functions registered by ``register_kl``, according to multi-dispatch pattern. + If an implemention function is found, it will return the result, otherwise, + it will raise ``NotImplementError`` exception. Users can register + implemention funciton by the decorator. Args: cls_p(Distribution): Subclass derived from ``Distribution``. diff --git a/python/paddle/distribution/multinomial.py b/python/paddle/distribution/multinomial.py index 424ec4b120d..d960c6d8f6b 100644 --- a/python/paddle/distribution/multinomial.py +++ b/python/paddle/distribution/multinomial.py @@ -24,14 +24,14 @@ except: class Multinomial(distribution.Distribution): r""" - Multinomial distribution parameterized by :attr:`total_count` and + Multinomial distribution parameterized by :attr:`total_count` and :attr:`probs`. - In probability theory, the multinomial distribution is a generalization of + In probability theory, the multinomial distribution is a generalization of the binomial distribution, it models the probability of counts for each side - of a k-sided die rolled n times. When k is 2 and n is 1, the multinomial is - the bernoulli distribution, when k is 2 and n is grater than 1, it is the - binomial distribution, when k is grater than 2 and n is 1, it is the + of a k-sided die rolled n times. When k is 2 and n is 1, the multinomial is + the bernoulli distribution, when k is 2 and n is grater than 1, it is the + binomial distribution, when k is grater than 2 and n is 1, it is the categorical distribution. The probability mass function (PMF) for multinomial is @@ -40,18 +40,18 @@ class Multinomial(distribution.Distribution): f(x_1, ..., x_k; n, p_1,...,p_k) = \frac{n!}{x_1!...x_k!}p_1^{x_1}...p_k^{x_k} - where, :math:`n` is number of trials, k is the number of categories, - :math:`p_i` denote probability of a trial falling into each category, - :math:`{\textstyle \sum_{i=1}^{k}p_i=1}, p_i \ge 0`, and :math:`x_i` denote - count of each category. + where, :math:`n` is number of trials, k is the number of categories, + :math:`p_i` denote probability of a trial falling into each category, + :math:`{\textstyle \sum_{i=1}^{k}p_i=1}, p_i \ge 0`, and :math:`x_i` denote + count of each category. Args: total_count (int): Number of trials. - probs (Tensor): Probability of a trial falling into each category. Last + probs (Tensor): Probability of a trial falling into each category. Last axis of probs indexes over categories, other axes index over batches. 
- Probs value should between [0, 1], and sum to 1 along last axis. If - the value over 1, it will be normalized to sum to 1 along the last - axis. + Probs value should between [0, 1], and sum to 1 along last axis. If + the value over 1, it will be normalized to sum to 1 along the last + axis. Examples: diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py index f248e1a0927..8a9e5cd7372 100644 --- a/python/paddle/distribution/normal.py +++ b/python/paddle/distribution/normal.py @@ -55,7 +55,7 @@ class Normal(distribution.Distribution): Examples: .. code-block:: python - + import paddle from paddle.distribution import Normal @@ -248,7 +248,7 @@ class Normal(distribution.Distribution): .. math:: ratio = \\frac{\sigma_0}{\sigma_1} - + .. math:: diff = \mu_1 - \mu_0 diff --git a/python/paddle/distribution/transform.py b/python/paddle/distribution/transform.py index d7a512aade2..efa32489651 100644 --- a/python/paddle/distribution/transform.py +++ b/python/paddle/distribution/transform.py @@ -50,45 +50,45 @@ class Type(enum.Enum): class Transform(object): r"""Base class for the transformations of random variables. - ``Transform`` can be used to represent any differentiable and injective - function from the subset of :math:`R^n` to subset of :math:`R^m`, generally - used for transforming a random sample generated by ``Distribution`` - instance. - - Suppose :math:`X` is a K-dimensional random variable with probability - density function :math:`p_X(x)`. A new random variable :math:`Y = f(X)` may - be defined by transforming :math:`X` with a suitably well-behaved funciton - :math:`f`. It suffices for what follows to note that if f is one-to-one and - its inverse :math:`f^{-1}` have a well-defined Jacobian, then the density of + ``Transform`` can be used to represent any differentiable and injective + function from the subset of :math:`R^n` to subset of :math:`R^m`, generally + used for transforming a random sample generated by ``Distribution`` + instance. + + Suppose :math:`X` is a K-dimensional random variable with probability + density function :math:`p_X(x)`. A new random variable :math:`Y = f(X)` may + be defined by transforming :math:`X` with a suitably well-behaved funciton + :math:`f`. It suffices for what follows to note that if f is one-to-one and + its inverse :math:`f^{-1}` have a well-defined Jacobian, then the density of :math:`Y` is .. math:: p_Y(y) = p_X(f^{-1}(y)) |det J_{f^{-1}}(y)| - where det is the matrix determinant operation and :math:`J_{f^{-1}}(y)` is + where det is the matrix determinant operation and :math:`J_{f^{-1}}(y)` is the Jacobian matrix of :math:`f^{-1}` evaluated at :math:`y`. Taking :math:`x = f^{-1}(y)`, the Jacobian matrix is defined by .. math:: J(y) = \begin{bmatrix} - {\frac{\partial x_1}{\partial y_1}} &{\frac{\partial x_1}{\partial y_2}} + {\frac{\partial x_1}{\partial y_1}} &{\frac{\partial x_1}{\partial y_2}} &{\cdots} &{\frac{\partial x_1}{\partial y_K}} \\ {\frac{\partial x_2}{\partial y_1}} &{\frac{\partial x_2} {\partial y_2}}&{\cdots} &{\frac{\partial x_2}{\partial y_K}} \\ {\vdots} &{\vdots} &{\ddots} &{\vdots}\\ - {\frac{\partial x_K}{\partial y_1}} &{\frac{\partial x_K}{\partial y_2}} - &{\cdots} &{\frac{\partial x_K}{\partial y_K}} + {\frac{\partial x_K}{\partial y_1}} &{\frac{\partial x_K}{\partial y_2}} + &{\cdots} &{\frac{\partial x_K}{\partial y_K}} \end{bmatrix} A ``Transform`` can be characterized by three operations: #. 
forward - Forward implements :math:`x \rightarrow f(x)`, and is used to convert + Forward implements :math:`x \rightarrow f(x)`, and is used to convert one random outcome into another. #. inverse - Undoes the transformation :math:`y \rightarrow f^{-1}(y)`. + Undoes the transformation :math:`y \rightarrow f^{-1}(y)`. #. log_det_jacobian The log of the absolute value of the determinant of the matrix of all first-order partial derivatives of the inverse function. @@ -121,14 +121,14 @@ class Transform(object): return Type.is_injective(cls._type) def __call__(self, input): - """Make this instance as a callable object. The return value is - depening on the input type. + """Make this instance as a callable object. The return value is + depening on the input type. - * If the input is a ``Tensor`` instance, return + * If the input is a ``Tensor`` instance, return ``self.forward(input)`` . - * If the input is a ``Distribution`` instance, return + * If the input is a ``Distribution`` instance, return ``TransformedDistribution(base=input, transforms=[self])`` . - * If the input is a ``Transform`` instance, return + * If the input is a ``Transform`` instance, return ``ChainTransform([self, input])`` . Args: @@ -145,12 +145,12 @@ class Transform(object): return self.forward(x) def forward(self, x): - """Forward transformation with mapping :math:`y = f(x)`. + """Forward transformation with mapping :math:`y = f(x)`. Useful for turning one random outcome into another. Args: - x (Tensos): Input parameter, generally is a sample generated + x (Tensos): Input parameter, generally is a sample generated from ``Distribution``. Returns: @@ -166,7 +166,7 @@ class Transform(object): return self._forward(x) def inverse(self, y): - """Inverse transformation :math:`x = f^{-1}(y)`. It's useful for "reversing" + """Inverse transformation :math:`x = f^{-1}(y)`. It's useful for "reversing" a transformation to compute one probability in terms of another. Args: @@ -185,15 +185,15 @@ class Transform(object): return self._inverse(y) def forward_log_det_jacobian(self, x): - """The log of the absolute value of the determinant of the matrix of all + """The log of the absolute value of the determinant of the matrix of all first-order partial derivatives of the inverse function. Args: - x (Tensor): Input tensor, generally is a sample generated from + x (Tensor): Input tensor, generally is a sample generated from ``Distribution`` Returns: - Tensor: The log of the absolute value of Jacobian determinant. + Tensor: The log of the absolute value of Jacobian determinant. """ if not isinstance(x, paddle.fluid.framework.Variable): raise TypeError( @@ -212,11 +212,11 @@ class Transform(object): def inverse_log_det_jacobian(self, y): """Compute :math:`log|det J_{f^{-1}}(y)|`. - Note that ``forward_log_det_jacobian`` is the negative of this function, + Note that ``forward_log_det_jacobian`` is the negative of this function, evaluated at :math:`f^{-1}(y)`. Args: - y (Tensor): The input to the ``inverse`` Jacobian determinant + y (Tensor): The input to the ``inverse`` Jacobian determinant evaluation. Returns: @@ -269,13 +269,13 @@ class Transform(object): return variable.real def _forward(self, x): - """Inner method for publid API ``forward``, subclass should + """Inner method for publid API ``forward``, subclass should overwrite this method for supporting forward transformation. 
""" raise NotImplementedError('Forward not implemented') def _inverse(self, y): - """Inner method of public API ``inverse``, subclass should + """Inner method of public API ``inverse``, subclass should overwrite this method for supporting inverse transformation. """ raise NotImplementedError('Inverse not implemented') @@ -301,35 +301,35 @@ class Transform(object): 'is implemented. One of them is required') def _forward_shape(self, shape): - """Inner method called by ``forward_shape``, which is used to infer the - forward shape. Subclass should overwrite this method for supporting + """Inner method called by ``forward_shape``, which is used to infer the + forward shape. Subclass should overwrite this method for supporting ``forward_shape``. """ return shape def _inverse_shape(self, shape): - """Inner method called by ``inverse_shape``, whic is used to infer the - invese shape. Subclass should overwrite this method for supporting + """Inner method called by ``inverse_shape``, whic is used to infer the + invese shape. Subclass should overwrite this method for supporting ``inverse_shape``. """ return shape class AbsTransform(Transform): - r"""Absolute transformation with formula :math:`y = f(x) = abs(x)`, + r"""Absolute transformation with formula :math:`y = f(x) = abs(x)`, element-wise. - This non-injective transformation allows for transformations of scalar - distributions with the absolute value function, which maps ``(-inf, inf)`` + This non-injective transformation allows for transformations of scalar + distributions with the absolute value function, which maps ``(-inf, inf)`` to ``[0, inf)`` . - * For ``y`` in ``(0, inf)`` , ``AbsTransform.inverse(y)`` returns the set invese + * For ``y`` in ``(0, inf)`` , ``AbsTransform.inverse(y)`` returns the set invese ``{x in (-inf, inf) : |x| = y}`` as a tuple, ``-y, y`` . - * For ``y`` equal ``0`` , ``AbsTransform.inverse(0)`` returns ``0, 0``, which is not - the set inverse (the set inverse is the singleton {0}), but "works" in - conjunction with ``TransformedDistribution`` to produce a left + * For ``y`` equal ``0`` , ``AbsTransform.inverse(0)`` returns ``0, 0``, which is not + the set inverse (the set inverse is the singleton {0}), but "works" in + conjunction with ``TransformedDistribution`` to produce a left semi-continuous pdf. - * For ``y`` in ``(-inf, 0)`` , ``AbsTransform.inverse(y)`` returns the + * For ``y`` in ``(-inf, 0)`` , ``AbsTransform.inverse(y)`` returns the wrong thing ``-y, y``. This is done for efficiency. Examples: @@ -388,7 +388,7 @@ class AbsTransform(Transform): class AffineTransform(Transform): - r"""Affine transformation with mapping + r"""Affine transformation with mapping :math:`y = \text{loc} + \text{scale} \times x`. Args: @@ -638,26 +638,26 @@ class ExpTransform(Transform): class IndependentTransform(Transform): r""" - ``IndependentTransform`` wraps a base transformation, reinterprets + ``IndependentTransform`` wraps a base transformation, reinterprets some of the rightmost batch axes as event axes. Generally, it is used to expand the event axes. This has no effect on the - forward or inverse transformaion, but does sum out the - ``reinterpretd_bach_rank`` rightmost dimensions in computing the determinant + forward or inverse transformaion, but does sum out the + ``reinterpretd_bach_rank`` rightmost dimensions in computing the determinant of Jacobian matrix. - To see this, consider the ``ExpTransform`` applied to a Tensor which has - sample, batch, and event ``(S,B,E)`` shape semantics. 
Suppose the Tensor's + To see this, consider the ``ExpTransform`` applied to a Tensor which has + sample, batch, and event ``(S,B,E)`` shape semantics. Suppose the Tensor's paritioned-shape is ``(S=[4], B=[2, 2], E=[3])`` , reinterpreted_batch_rank is 1. Then the reinterpreted Tensor's shape is ``(S=[4], B=[2], E=[2, 3])`` . - The shape returned by ``forward`` and ``inverse`` is unchanged, ie, - ``[4,2,2,3]`` . However the shape returned by ``inverse_log_det_jacobian`` - is ``[4,2]``, because the Jacobian determinant is a reduction over the + The shape returned by ``forward`` and ``inverse`` is unchanged, ie, + ``[4,2,2,3]`` . However the shape returned by ``inverse_log_det_jacobian`` + is ``[4,2]``, because the Jacobian determinant is a reduction over the event dimensions. Args: base (Transform): The base transformation. - reinterpreted_batch_rank (int): The num of rightmost batch rank that + reinterpreted_batch_rank (int): The num of rightmost batch rank that will be reinterpreted as event rank. Examples: @@ -793,7 +793,7 @@ class PowerTransform(Transform): class ReshapeTransform(Transform): r"""Reshape the event shape of a tensor. - Note that ``in_event_shape`` and ``out_event_shape`` must have the same + Note that ``in_event_shape`` and ``out_event_shape`` must have the same number of elements. Args: @@ -943,8 +943,8 @@ class SigmoidTransform(Transform): class SoftmaxTransform(Transform): r"""Softmax transformation with mapping :math:`y=\exp(x)` then normalizing. - It's generally used to convert unconstrained space to simplex. This mapping - is not injective, so ``forward_log_det_jacobian`` and + It's generally used to convert unconstrained space to simplex. This mapping + is not injective, so ``forward_log_det_jacobian`` and ``inverse_log_det_jacobian`` are not implemented. Examples: @@ -997,11 +997,11 @@ class SoftmaxTransform(Transform): class StackTransform(Transform): - r""" ``StackTransform`` applies a sequence of transformations along the + r""" ``StackTransform`` applies a sequence of transformations along the specific axis. Args: - transforms(Sequence[Transform]): The sequence of transformations. + transforms(Sequence[Transform]): The sequence of transformations. axis(int): The axis along which will be transformed. Examples: @@ -1102,7 +1102,7 @@ class StackTransform(Transform): class StickBreakingTransform(Transform): - r"""Convert an unconstrained vector to the simplex with one additional + r"""Convert an unconstrained vector to the simplex with one additional dimension by the stick-breaking construction. Examples: @@ -1213,8 +1213,8 @@ class TanhTransform(Transform): return y.atanh() def _forward_log_det_jacobian(self, x): - """We implicitly rely on _forward_log_det_jacobian rather than - explicitly implement ``_inverse_log_det_jacobian`` since directly using + """We implicitly rely on _forward_log_det_jacobian rather than + explicitly implement ``_inverse_log_det_jacobian`` since directly using ``-tf.math.log1p(-tf.square(y))`` has lower numerical precision. 
See details: https://github.com/tensorflow/probability/blob/master/tensorflow_probability/python/bijectors/tanh.py#L69-L80 diff --git a/python/paddle/distribution/transformed_distribution.py b/python/paddle/distribution/transformed_distribution.py index ce386971e5f..bb2e181d7bb 100644 --- a/python/paddle/distribution/transformed_distribution.py +++ b/python/paddle/distribution/transformed_distribution.py @@ -20,8 +20,8 @@ from paddle.distribution import independent class TransformedDistribution(distribution.Distribution): - r""" - Applies a sequence of Transforms to a base distribution. + r""" + Applies a sequence of Transforms to a base distribution. Args: base (Distribution): The base distribution. @@ -30,12 +30,12 @@ class TransformedDistribution(distribution.Distribution): Examples: .. code-block:: python - - import paddle + + import paddle from paddle.distribution import transformed_distribution d = transformed_distribution.TransformedDistribution( - paddle.distribution.Normal(0., 1.), + paddle.distribution.Normal(0., 1.), [paddle.distribution.AffineTransform(paddle.to_tensor(1.), paddle.to_tensor(2.))] ) diff --git a/python/paddle/distribution/variable.py b/python/paddle/distribution/variable.py index b5c3d71d3fa..f6cced04164 100644 --- a/python/paddle/distribution/variable.py +++ b/python/paddle/distribution/variable.py @@ -37,7 +37,7 @@ class Variable(object): return self._event_rank def constraint(self, value): - """Check whether the 'value' meet the constraint conditions of this + """Check whether the 'value' meet the constraint conditions of this random variable.""" return self._constraint(value) @@ -59,8 +59,8 @@ class Independent(Variable): Args: base (Variable): Base variable. - reinterpreted_batch_rank (int): The rightmost batch rank to be - reinterpreted. + reinterpreted_batch_rank (int): The rightmost batch rank to be + reinterpreted. """ def __init__(self, base, reinterpreted_batch_rank): diff --git a/python/paddle/fft.py b/python/paddle/fft.py index 5cbc8f5e3be..161539aa9c4 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -155,30 +155,30 @@ def fft(x, n=None, axis=-1, norm="backward", name=None): """ Calculate one-dimensional discrete Fourier transform. - This function uses the efficient fast Fourier transform (FFT) algorithm [1] to + This function uses the efficient fast Fourier transform (FFT) algorithm [1] to calculate the 1-D * n * point discrete Fourier transform (DFT). Args: x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. If `n` is less than - the length input, the input will be cropped. If larger, the input is filled - with zeros. If `n` is not given, the input length along the axis specified + n (int, optional): The length of the output transform axis. If `n` is less than + the length input, the input will be cropped. If larger, the input is filled + with zeros. If `n` is not given, the input length along the axis specified by `axis` is used. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". 
Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - complex tensor. The truncated or zero-padded input, transformed along the axis indicated + complex tensor. The truncated or zero-padded input, transformed along the axis indicated by `axis`, or the last one if `axis` is not specified. - + Examples: .. code-block:: python @@ -212,7 +212,7 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): """ Compute the 1-D inverse discrete Fourier Transform. - This function computes the inverse of the 1-D *n*-point discrete Fourier transform + This function computes the inverse of the 1-D *n*-point discrete Fourier transform computed by `fft`. In other words, ``ifft(fft(x)) == x`` to within numerical accuracy. The input should be ordered in the same way as is returned by `fft`, @@ -225,27 +225,27 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): For an even number of input points, ``x[n//2]`` represents the sum of the values at the positive and negative Nyquist frequencies, as the two - are aliased together. + are aliased together. Args: x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. If `n` is less than - the length input, the input will be cropped. If larger, the input is filled - with zeros. If `n` is not given, the input length along the axis specified + n (int, optional): The length of the output transform axis. If `n` is less than + the length input, the input will be cropped. If larger, the input is filled + with zeros. If `n` is not given, the input length along the axis specified by `axis` is used. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: - complex tensor. 
The truncated or zero-padded input, transformed along the axis indicated + complex tensor. The truncated or zero-padded input, transformed along the axis indicated by `axis`, or the last one if `axis` is not specified. Examples: @@ -286,40 +286,40 @@ def rfft(x, n=None, axis=-1, norm="backward", name=None): called the Fast Fourier Transform (FFT). When the DFT is computed for purely real input, the output is - Hermitian-symmetric. This function does not compute the negative frequency - terms, and the length of the transformed axis of the output is therefore + Hermitian-symmetric. This function does not compute the negative frequency + terms, and the length of the transformed axis of the output is therefore ``n//2 + 1``. Args: - x(Tensor) : Real-valued input tensor - n(int, optional): Number of points along transformation axis in the - input to use. If `n` is smaller than the length of the input, the - input is cropped. If it is larger, the input is padded with zeros. - If `n` is not given, the length of the input along the axis + x(Tensor) : Real-valued input tensor + n(int, optional): Number of points along transformation axis in the + input to use. If `n` is smaller than the length of the input, the + input is cropped. If it is larger, the input is padded with zeros. + If `n` is not given, the length of the input along the axis specified by `axis` is used. - axis(int, optional): Axis over which to compute the FFT. Default value + axis(int, optional): Axis over which to compute the FFT. Default value is last axis. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, default value is "backward". - + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: out(Tensor) : complex tensor Examples: - + .. code-block:: python - + import paddle x = paddle.to_tensor([0.0, 1.0, 0.0, 0.0]) @@ -334,38 +334,38 @@ def irfft(x, n=None, axis=-1, norm="backward", name=None): """ Computes the inverse of `rfft`. - This function calculates the inverse of the one-dimensional *n* point discrete - Fourier transform of the actual input calculated by "rfft". In other words, + This function calculates the inverse of the one-dimensional *n* point discrete + Fourier transform of the actual input calculated by "rfft". In other words, ``irfft(rfft(a),len(a)) == a`` is within the numerical accuracy range. - The input shall be in the form of "rfft", i.e. the actual zero frequency term, - followed by the complex positive frequency term, in the order of increasing frequency. 
- Because the discrete Fourier transform of the actual input is Hermite symmetric, - the negative frequency term is regarded as the complex conjugate term of the corresponding + The input shall be in the form of "rfft", i.e. the actual zero frequency term, + followed by the complex positive frequency term, in the order of increasing frequency. + Because the discrete Fourier transform of the actual input is Hermite symmetric, + the negative frequency term is regarded as the complex conjugate term of the corresponding positive frequency term. Args: x (Tensor): The input data. It's a Tensor type. It's a complex. n (int, optional): The length of the output transform axis. For `n` output - points, ``n//2 + 1``input points are necessary. If the length of the input tensor is greater - than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, - it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified + points, ``n//2 + 1``input points are necessary. If the length of the input tensor is greater + than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, + it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified along the ` axis'. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . Returns: - Real tensor. Truncated or zero fill input for the transformation along the axis indicated by - `axis`, or the last input if `axis` is not specified. The length of the conversion axis - is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. + Real tensor. Truncated or zero fill input for the transformation along the axis indicated by + `axis`, or the last input if `axis` is not specified. The length of the conversion axis + is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in some cases. - + Examples: .. code-block:: python @@ -389,25 +389,25 @@ def hfft(x, n=None, axis=-1, norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. It's a complex. n (int, optional): The length of the output transform axis. For `n` output - points, ``n//2 + 1`` input points are necessary. If the length of the input tensor is greater - than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, - it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified + points, ``n//2 + 1`` input points are necessary. If the length of the input tensor is greater + than `n`, it will be cropped, if it is shorter than this, fill in zero. 
If `n` is not given, + it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified along the ` axis'. - axis (int,optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. + axis (int,optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . Returns: - Real tensor. Truncated or zero fill input for the transformation along the axis indicated by - `axis`, or the last input if `axis` is not specified. The length of the conversion axis - is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. - If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in + Real tensor. Truncated or zero fill input for the transformation along the axis indicated by + `axis`, or the last input if `axis` is not specified. The length of the conversion axis + is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. + If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in some cases. - + Examples: .. code-block:: python @@ -428,40 +428,40 @@ def ihfft(x, n=None, axis=-1, norm="backward", name=None): """ The inverse FFT of a signal that has Hermitian symmetry. - This function computes the one dimensional *n*-point inverse FFT of a signal - that has Hermitian symmetry by means of an efficient algorithm called + This function computes the one dimensional *n*-point inverse FFT of a signal + that has Hermitian symmetry by means of an efficient algorithm called the Fast Fourier Transform (FFT). When the DFT is computed for purely real input, the output is - Hermitian-symmetric. This function does not compute the negative frequency - terms, and the length of the transformed axis of the output is therefore + Hermitian-symmetric. This function does not compute the negative frequency + terms, and the length of the transformed axis of the output is therefore ``n//2 + 1``. Args: x(Tensor): Input tensor. - n(int, optional): The number of points along transformation axis in the - input to use. If `n` is smaller than the length of the input, the - input is cropped. If it is larger, the input is padded with zeros. - If `n` is not given, the length of the input along the axis + n(int, optional): The number of points along transformation axis in the + input to use. If `n` is smaller than the length of the input, the + input is cropped. If it is larger, the input is padded with zeros. + If `n` is not given, the length of the input along the axis specified by `axis` is used. axis(int, optional) : Axis over which to compute the inverse FFT. If not given, the last axis is used. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. 
Include {"backward", "ortho", "forward"}, + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: out(Tensor) : complex tensor. Examples: - + .. code-block:: python - - import paddle + + import paddle spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) print(paddle.fft.ifft(spectrum)) @@ -480,7 +480,7 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): """ Compute the N-D discrete Fourier Transform. - This function calculates the n-D discrete Fourier transform on any number of axes + This function calculates the n-D discrete Fourier transform on any number of axes in the M-D array by fast Fourier transform (FFT). Args: @@ -493,20 +493,20 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): if `s` is not given, the shape of the input along the axes specified by `axes` is used. axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` - axes are used, or all axes if `s` is also not specified. + axes are used, or all axes if `s` is also not specified. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - complex tensor. The truncated or zero-padded input, transformed along the axes indicated by + complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. - + Examples: .. code-block:: python @@ -573,20 +573,20 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None): if `s` is not given, the shape of the input along the axes specified by `axes` is used. axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` - axes are used, or all axes if `s` is also not specified. + axes are used, or all axes if `s` is also not specified. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". 
Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: - complex tensor. The truncated or zero-padded input, transformed along the axes indicated by + complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. - + Examples: .. code-block:: python @@ -637,40 +637,40 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): Args: x(Tensor) : Input tensor, taken to be real. - s(Sequence[int], optional) : Shape to use from the exec fft. The final element of - `s` corresponds to `n` for ``rfft(x, n)``, while for the remaining - axes, it corresponds to `n` for ``fft(x, n)``. Along any axis, if - the given shape is smaller than that of the input, the input is - cropped. If it is larger, the input is padded with zeros. if `s` is - not given, the shape of the input along the axes specified by `axes` + s(Sequence[int], optional) : Shape to use from the exec fft. The final element of + `s` corresponds to `n` for ``rfft(x, n)``, while for the remaining + axes, it corresponds to `n` for ``fft(x, n)``. Along any axis, if + the given shape is smaller than that of the input, the input is + cropped. If it is larger, the input is padded with zeros. if `s` is + not given, the shape of the input along the axes specified by `axes` is used. - axes(Sequence[int], optional) : Axes over which to compute the FFT. If not given, - the last ``len(s)`` axes are used, or all axes if `s` is also not + axes(Sequence[int], optional) : Axes over which to compute the FFT. If not given, + the last ``len(s)`` axes are used, or all axes if `s` is also not specified. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, - default value is "backward". The details of + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". The details of three operations are shown below: - - - "backward": The factor of forward direction and backward direction are ``1`` + + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - - "forward": The factor of forward direction and backward direction are ``1/n`` + - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name(str, optional): The default value is None. Normally there is no - need for user to set this property. 
For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: out(Tensor): complex tensor Examples: - + .. code-block:: python - + import paddle # default, all axis will be used to exec fft @@ -717,37 +717,37 @@ def irfftn(x, s=None, axes=None, norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): The length of the output transform axis. - (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). - - - `s` is also the number of input points used along this axis, except for the last axis, where ``s[-1]//2+1`` points of the input are used. - - Along any axis, if the shape indicated by `s` is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. - - If `s` is not given, the shape of the input along the axes specified by axes is used. Except for the last axis which is taken to be ``2*(k-1)`` - + s (sequence of ints, optional): The length of the output transform axis. + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + + - `s` is also the number of input points used along this axis, except for the last axis, where ``s[-1]//2+1`` points of the input are used. + - Along any axis, if the shape indicated by `s` is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. + - If `s` is not given, the shape of the input along the axes specified by axes is used. Except for the last axis which is taken to be ``2*(k-1)`` + where ``k`` is the length of the input along that axis. - + axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last - `len(s)` axes are used, or all axes if `s` is also not specified. + `len(s)` axes are used, or all axes if `s` is also not specified. norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". The details of + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". The details of three operations are shown below: - + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: - Real tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, - or by a combination of `s` or `x`, as explained in the parameters section above. The length of + Real tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + or by a combination of `s` or `x`, as explained in the parameters section above. 
The length of each transformed axis is as given by the corresponding element of `s`, or the length of the input in every axis except for the last one if `s` is not given. In the final transformed axis the length - of the output when `s` is not given is ``2*(m-1)``, where ``m`` is the length of the final - transformed axis of the input. To get an odd number of output points in the final axis, + of the output when `s` is not given is ``2*(m-1)``, where ``m`` is the length of the final + transformed axis of the input. To get an odd number of output points in the final axis, `s` must be specified. Examples: @@ -760,12 +760,12 @@ def irfftn(x, s=None, axes=None, norm="backward", name=None): print(x) irfftn_x = paddle.fft.irfftn(x) print(irfftn_x) - + # Tensor(shape=[3], dtype=complex128, place=Place(cpu), stop_gradient=True, # [(2+2j), (2+2j), (3+3j)]) # Tensor(shape=[4], dtype=float64, place=Place(cpu), stop_gradient=True, # [ 2.25000000, -1.25000000, 0.25000000, 0.75000000]) - + """ return fftn_c2r(x, s, axes, norm, forward=False, name=name) @@ -775,35 +775,35 @@ def hfftn(x, s=None, axes=None, norm="backward", name=None): Compute the N-D FFT of Hermitian symmetric complex input, i.e., a signal with a real spectrum. - This function calculates the n-D discrete Fourier transform of Hermite symmetric - complex input on any axis in M-D array by fast Fourier transform (FFT). - In other words, ``ihfftn(hfftn(x, s)) == x is within the numerical accuracy range. - (``s`` here are ``x.shape`` and ``s[-1] = x.shape[- 1] * 2 - 1``. This is necessary + This function calculates the n-D discrete Fourier transform of Hermite symmetric + complex input on any axis in M-D array by fast Fourier transform (FFT). + In other words, ``ihfftn(hfftn(x, s)) == x is within the numerical accuracy range. + (``s`` here are ``x.shape`` and ``s[-1] = x.shape[- 1] * 2 - 1``. This is necessary for the same reason that ``irfft` requires ``x.shape``.) Args: x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): The length of the output transform axis. + s (sequence of ints, optional): The length of the output transform axis. (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the number of input points used along this axis, except for the last axis, - where ``s[-1]//2+1`` points of the input are used. Along any axis, if - the shape indicated by `s` is smaller than that of the input, the input - is cropped. If it is larger, the input is padded with zeros. - If `s` is not given, the shape of the input along the axes specified by axes - is used. Except for the last axis which is taken to be ``2*(k-1)`` where + where ``s[-1]//2+1`` points of the input are used. Along any axis, if + the shape indicated by `s` is smaller than that of the input, the input + is cropped. If it is larger, the input is padded with zeros. + If `s` is not given, the shape of the input along the axes specified by axes + is used. Except for the last axis which is taken to be ``2*(k-1)`` where ``k`` is the length of the input along that axis. axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last - `len(s)` axes are used, or all axes if `s` is also not specified. + `len(s)` axes are used, or all axes if `s` is also not specified. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. 
The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: - Real tensor. Truncate or zero fill input, transforming along the axis indicated by axis or + Real tensor. Truncate or zero fill input, transforming along the axis indicated by axis or a combination of `s` or `X`. - + Examples: .. code-block:: python @@ -823,36 +823,36 @@ def ihfftn(x, s=None, axes=None, norm="backward", name=None): """ The n dimensional inverse FFT of a signal that has Hermitian symmetry. - This function computes the n dimensional inverse FFT over any number of axes - in an M-dimensional of a signal that has Hermitian symmetry by means of an + This function computes the n dimensional inverse FFT over any number of axes + in an M-dimensional of a signal that has Hermitian symmetry by means of an efficient algorithm called the Fast Fourier Transform (FFT). Args: x(Tensor): Input tensor. - s(Sequence[int], optional) : Shape (length along each transformed axis) - to use from the input. (``s[0]`` refers to axis 0, ``s[1]`` to axis - 1, etc.). Along any axis, if the given shape is smaller than that - of the input, the input is cropped. If it is larger, the input is - padded with zeros. if `s` is not given, the shape of the input + s(Sequence[int], optional) : Shape (length along each transformed axis) + to use from the input. (``s[0]`` refers to axis 0, ``s[1]`` to axis + 1, etc.). Along any axis, if the given shape is smaller than that + of the input, the input is cropped. If it is larger, the input is + padded with zeros. if `s` is not given, the shape of the input along the axes specified by `axes` is used. axes(Sequence[int], optional) : Axis over which to compute the inverse FFT. If not given, the last axis is used. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: out(Tensor) : complex tensor. Examples: - + .. code-block:: python - - import paddle + + import paddle spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) print(paddle.fft.ifft(spectrum)) @@ -877,22 +877,22 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output. - It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output. + It should be a sequence of 2 integers. 
This corresponds to ``n`` for ``fft(x, n)``. Along each axis, if the given shape is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. if `s` is not given, the shape of the input along the axes specified by `axes` is used. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a - sequence of 2 integers. If not specified, the last two axes are used by default. + axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a + sequence of 2 integers. If not specified, the last two axes are used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: - Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or the last two axes if `axes` is not given. Examples: @@ -943,22 +943,22 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output. - It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output. + It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. Along each axis, if the given shape is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. if `s` is not given, the shape of the input along the axes specified by `axes` is used. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a - sequence of 2 integers. If not specified, the last two axes are used by default. + axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a + sequence of 2 integers. If not specified, the last two axes are used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: - Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or the last two axes if `axes` is not given. 
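Before the formal examples below, a minimal round-trip sketch of the behaviour described above, assuming only the public ``paddle.fft.fft2`` and ``paddle.fft.ifft2`` APIs documented in this file: within numerical accuracy, ``ifft2(fft2(x))`` recovers ``x`` as a complex tensor whose imaginary part is approximately zero.

    .. code-block:: python

        import paddle

        x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])
        y = paddle.fft.fft2(x)       # forward 2-D transform over the last two axes
        x_rec = paddle.fft.ifft2(y)  # inverse transform; recovers x up to rounding error
        print(paddle.real(x_rec))    # ~[[1., 2.], [3., 4.]]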
Examples: @@ -1000,28 +1000,28 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): x(Tensor): Input tensor, taken to be real. s(Sequence[int], optional) : Shape of the FFT. axes(Sequence[int], optional): Axes over which to compute the FFT. - norm(str, optional) : {"backward", "ortho", "forward"}, - default is "backward". Indicates which direction of the - forward/backward pair of transforms is scaled and with what - normalization factor. The details of + norm(str, optional) : {"backward", "ortho", "forward"}, + default is "backward". Indicates which direction of the + forward/backward pair of transforms is scaled and with what + normalization factor. The details of three operations are shown below: - + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . - Returns: + Returns: out(Tensor): The result of the real 2-D FFT. Examples: .. code-block:: python - + import paddle import numpy as np @@ -1055,24 +1055,24 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. s (sequence of ints, optional): Shape of the real output to the inverse FFT. Default is None. - axes (sequence of ints, optional): The axes over which to compute the inverse FFT. Axes - must be two-dimensional. If not specified, the last two axes are used by default. + axes (sequence of ints, optional): The axes over which to compute the inverse FFT. Axes + must be two-dimensional. If not specified, the last two axes are used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". The details of + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". The details of three operations are shown below: - + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + Returns: Real tensor. The result of the inverse real 2-D FFT. - + Examples: .. code-block:: python @@ -1107,17 +1107,17 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x (Tensor): The input data. 
It's a Tensor type. s (sequence of ints, optional): Shape of the real output. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. Axes must be - two-dimensional. If not specified, the last two axes are used by default. + axes (sequence of ints, optional): Axes over which to compute the FFT. Axes must be + two-dimensional. If not specified, the last two axes are used by default. norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: Real tensor. The real result of the 2-D Hermitian complex real FFT. - + Examples: .. code-block:: python @@ -1155,13 +1155,13 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x(Tensor): Input tensor. s(Sequence[int], optional): Shape of the real input to the inverse FFT. - axes(Sequance[int], optional): The axes over which to compute the + axes(Sequance[int], optional): The axes over which to compute the inverse fft. Default is the last two axes. - norm(str, optional): {"backward", "ortho", "forward"}. Default is + norm(str, optional): {"backward", "ortho", "forward"}. Default is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: out(Tensor) : The result of the inverse hermitian 2-D FFT. @@ -1214,7 +1214,7 @@ def fftfreq(n, d=1.0, dtype=None, name=None): Args: n (int): Dimension inputed. d (scalar, optional): Sample spacing (inverse of the sampling rate). Defaults is 1. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1250,8 +1250,8 @@ def rfftfreq(n, d=1.0, dtype=None, name=None): """ Return the Discrete Fourier Transform sample frequencies. - The returned floating-point array "F" contains the center of the frequency unit, - and the unit is the number of cycles of the sampling interval (the starting point is zero). + The returned floating-point array "F" contains the center of the frequency unit, + and the unit is the number of cycles of the sampling interval (the starting point is zero). Given input length `n` and a sample spacing `d`:: @@ -1263,9 +1263,9 @@ def rfftfreq(n, d=1.0, dtype=None, name=None): Args: n (int): Dimension inputed. d (scalar, optional): Sample spacing (inverse of the sampling rate). Defaults is 1. - dtype (str, optional): The data type of returns. Defaults is the data type of returns + dtype (str, optional): The data type of returns. Defaults is the data type of returns of ``paddle.get_default_dtype()``. - name (str, optional): The default value is None. 
Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1307,12 +1307,12 @@ def fftshift(x, axes=None, name=None): n (int): Dimension inputed. axes (int|tuple, optional): The axis on which to move. The default is none, which moves all axes. Default is None. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor. The shifted tensor. - + Examples: .. code-block:: python @@ -1343,19 +1343,19 @@ def fftshift(x, axes=None, name=None): def ifftshift(x, axes=None, name=None): """ - The inverse of `fftshift`. Although the even length 'x' is the same, the function of the + The inverse of `fftshift`. Although the even length 'x' is the same, the function of the odd length 'x' is different. An example. Args: n (int): Dimension inputed. axes (int|tuple, optional): The axis on which to move. The default is none, which moves all axes. Default is None. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor. The shifted tensor. - + Examples: .. code-block:: python diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py index bb5f4cb84f6..f94cb7d355a 100644 --- a/python/paddle/fluid/average.py +++ b/python/paddle/fluid/average.py @@ -19,9 +19,9 @@ import warnings """ Class of all kinds of Average. - All Averages are accomplished via Python totally. + All Averages are accomplished via Python totally. They do not change Paddle's Program, nor do anything to - modify NN model's configuration. They are completely + modify NN model's configuration. They are completely wrappers of Python functions. """ @@ -41,9 +41,9 @@ class WeightedAverage(object): """ Calculate weighted average. - The average calculating is accomplished via Python totally. + The average calculating is accomplished via Python totally. They do not change Paddle's Program, nor do anything to - modify NN model's configuration. They are completely + modify NN model's configuration. They are completely wrappers of Python functions. Examples: diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 93d6d798dc4..09b847b0796 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1409,11 +1409,11 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): """ ops_to_remove = [] ''' - NOTE(paddle-dev): while_grad op may hold some inputs which are not found - in the parent/forward block, and they are also the outputs of while_grad - op. These kinds of inputs are the recursive outputs inside while_grad op. - They should be considered as "already created" when scanning the inner - ops of while_grad ops. + NOTE(paddle-dev): while_grad op may hold some inputs which are not found + in the parent/forward block, and they are also the outputs of while_grad + op. These kinds of inputs are the recursive outputs inside while_grad op. + They should be considered as "already created" when scanning the inner + ops of while_grad ops. 
''' parent_op = _find_parent_op_(block) parent_op_vars = [] @@ -1452,7 +1452,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): continue else: ''' - If the output is not empty and there is any grad input, find + If the output is not empty and there is any grad input, find whether there is any existing input. If not, just remove it. ''' if grad_var_ins: @@ -1464,11 +1464,11 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): if not existing_grad_var_ins: ''' FIXME(paddle-dev, zengjinle): rnn_memory_helper_grad is used - in recurrent op. The input of this op does not even exist in - the program! Therefore, any dependency analysis would not + in recurrent op. The input of this op does not even exist in + the program! Therefore, any dependency analysis would not work to this op! If I do not add the following code, this op - would be pruned, and the calculation result would be wrong. - Maybe we should re-design this op later... + would be pruned, and the calculation result would be wrong. + Maybe we should re-design this op later... ''' if op_desc.type() not in ['rnn_memory_helper_grad']: ops_to_remove.append(op_idx) @@ -2206,7 +2206,7 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): will be None. Examples: - + .. code-block:: python :name: code-example import paddle diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 24f69a86662..f83947bf6cd 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -209,34 +209,34 @@ class ClipGradBase(object): class ClipGradByValue(ClipGradBase): """ Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max]. - + - Any values less than min are set to ``min``. - + - Any values greater than max are set to ``max``. - The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. - - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` + + Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). Note: - ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0. + ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0. Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. - + Args: max (float): The maximum value to clip by. - min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` + min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` automatically. In this case, ``max`` must be greater than 0. Examples: .. 
code-block:: python - + import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) @@ -300,17 +300,17 @@ class ClipGradByValue(ClipGradBase): class ClipGradByNorm(ClipGradBase): r""" Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` . - + - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio. - + - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done. - + The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. - - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` + + Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). - + The clipping formula is: .. math:: @@ -329,7 +329,7 @@ class ClipGradByNorm(ClipGradBase): norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}} Note: - ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. + ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. Args: @@ -337,12 +337,12 @@ class ClipGradByNorm(ClipGradBase): Examples: .. code-block:: python - + import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) @@ -415,17 +415,17 @@ def _allow_pure_fp16_global_norm_clip(*args): class ClipGradByGlobalNorm(ClipGradBase): r""" - Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in + Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in :math:`t\_list` , and limit it to ``clip_norm`` . - + - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio. - + - If the global norm is less than or equal to ``clip_norm`` , nothing will be done. - + The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. - - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` + + Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). The clipping formula is: @@ -441,7 +441,7 @@ class ClipGradByGlobalNorm(ClipGradBase): global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} Note: - ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. + ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. 
Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. Args: @@ -450,12 +450,12 @@ class ClipGradByGlobalNorm(ClipGradBase): Examples: .. code-block:: python - + import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) @@ -719,23 +719,23 @@ class ClipGradByGlobalNorm(ClipGradBase): def set_gradient_clip(clip, param_list=None, program=None): """ :api_attr: Static Graph - + Warning: - - This API must be used after building network, and before ``minimize`` , - and it may be removed in future releases, so it is not recommended. + + This API must be used after building network, and before ``minimize`` , + and it may be removed in future releases, so it is not recommended. It is recommended to set ``grad_clip`` when initializing the ``optimizer`` , this is a better method to clip gradient. There are three clipping strategies: - :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` . - + To specify parameters that require gradient clip. Args: - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no gradient clipping. param_list (list(Variable), optional): Parameters that require gradient clip. It can be a list of parameter or a list of parameter's name. @@ -789,7 +789,7 @@ def set_gradient_clip(clip, param_list=None, program=None): param_list=[param_var1, param_var2]) sgd = fluid.optimizer.SGD(learning_rate=1e-3) sgd.minimize(loss) - + # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together with fluid.program_guard(fluid.Program(), fluid.Program()): loss = network() @@ -800,10 +800,10 @@ def set_gradient_clip(clip, param_list=None, program=None): # Set the gradient clipping strategy: clip2 sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2) sgd.minimize(loss) - # 'set_gradient_clip' will not take effect when setting has a conflict, + # 'set_gradient_clip' will not take effect when setting has a conflict, # and the gradient clipping strategy will be 'clip2' - - + + """ warnings.warn("Caution! 'set_gradient_clip' is not recommended " "and may be deprecated in future! 
" diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 1f81afbed64..db4b2c2df2e 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -102,7 +102,7 @@ def _should_broadcast_or_not_exists(program, var_name): class CompiledProgram(object): """ :api_attr: Static Graph - + The CompiledProgram is used to transform a program or graph for various optimizations according to the configuration of build_strategy, for example, the operators' fusion in the computation graph, memory @@ -187,12 +187,12 @@ class CompiledProgram(object): exec_strategy to set some optimizations that can be applied during the construction and computation of the Graph, such as reducing the number of AllReduce operations, specifying the size of the thread pool used in the computation Graph running the model, - and so on. - + and so on. + .. note:: - If build_strategy is specified when building CompiledProgram and calling - with_data_parallel, build_strategy in CompiledProgram will be overwritten, therefore, - if it is data parallel training, it is recommended to set build_strategy when calling + If build_strategy is specified when building CompiledProgram and calling + with_data_parallel, build_strategy in CompiledProgram will be overwritten, therefore, + if it is data parallel training, it is recommended to set build_strategy when calling with_data_parallel interface. Args: @@ -228,7 +228,7 @@ class CompiledProgram(object): export CPU_NUM=4, if the environment variable is not set, the executor will add the variable to the environment variable and set its value to 1. The default is None. If ``places`` is the list of string, the string in the list - can be ``cpu``, ``gpu:x``, where ``x`` is the index of the GPUs. + can be ``cpu``, ``gpu:x``, where ``x`` is the index of the GPUs. Returns: CompiledProgram @@ -270,7 +270,7 @@ class CompiledProgram(object): static.default_main_program()).with_data_parallel( loss_name=loss.name, places=parallel_places) # NOTE: if not set share_vars_from=compiled_train_prog, - # the parameters used in test process are different with + # the parameters used in test process are different with # the parameters used by train process compiled_test_prog = static.CompiledProgram( test_program).with_data_parallel( @@ -701,7 +701,7 @@ class IpuStrategy(object): Examples: .. code-block:: python - + # required: ipu import paddle @@ -744,7 +744,7 @@ class IpuStrategy(object): Examples: .. code-block:: python - + # required: ipu import paddle @@ -762,7 +762,7 @@ class IpuStrategy(object): Examples: .. code-block:: python - + # required: ipu import paddle @@ -780,13 +780,13 @@ class IpuStrategy(object): Args: optimizer (Optimizer): Optimizer to be used in training. - + Returns: None. Examples: .. code-block:: python - + # required: ipu import paddle @@ -812,13 +812,13 @@ class IpuStrategy(object): Args: optimizer (Optimizer): Optimizer to be parsed. - + Returns: Dict. Examples: .. code-block:: python - + # required: ipu import paddle @@ -857,15 +857,15 @@ class IpuStrategy(object): is_training (bool, optional): True is training graph, False is inference graph. Default True, which means is training mode. batch_size (int, optional): The batch-size in the graph. Used to make the graph batch-size fixed, if the batch-size in the graph is dynamic. Default 1, which means the batch-size would be set 1, if the batch-size is dynamice. - enable_manual_shard (bool, optional): Enable graph sharding or not. 
Only if num_ipus > 1, enable_manual_shard is able to be set True. - Default False, which means disabled. - + enable_manual_shard (bool, optional): Enable graph sharding or not. Only if num_ipus > 1, enable_manual_shard is able to be set True. + Default False, which means disabled. + Returns: None. Examples: .. code-block:: python - + # required: ipu import paddle @@ -900,15 +900,15 @@ class IpuStrategy(object): Set pipelining configuration to the IpuStrategy instance. Used to optimize the throughput performance. Args: - enable_pipelining (bool, optional): Enable data pipelining between subgraphs. Only if enable_manual_shard=True, enable_pipelining is able to be set True. + enable_pipelining (bool, optional): Enable data pipelining between subgraphs. Only if enable_manual_shard=True, enable_pipelining is able to be set True. Default False, which means disabled. batches_per_step (int, optional): Set the batches per run in data pipelining mode. Only if enable_pipelining=True, batches_per_step is able to be set > 1. Default 1, which means no data pipelining. enable_gradient_accumulation (bool, optional): Enable to accumulate gradients before updating the weights in training mode. Only if enable_pipelining=True, - enable_gradient_accumulation is able to be set True. Default False, which means no gradient accumulation. - accumulation_factor (int, optional): Specify the number of micro-batches to accumulate + enable_gradient_accumulation is able to be set True. Default False, which means no gradient accumulation. + accumulation_factor (int, optional): Specify the number of micro-batches to accumulate before applying the varUpdate. Default 1, which means disable the accumulation. - + Returns: None. @@ -947,7 +947,7 @@ class IpuStrategy(object): Args: enable_fp16 (bool, optional): Enable FLOAT16 mode and transform FLOAT32 to FLOAT16. Default False, which means disable FLOAT16 mode. - + Returns: None. @@ -985,7 +985,7 @@ class IpuStrategy(object): domain(str): domain name of custom op in popart. version(int): version of custom op in popart. - + Returns: None. @@ -1021,7 +1021,7 @@ class IpuStrategy(object): Args: options(dict): dict of options. - + Returns: None. @@ -1051,7 +1051,7 @@ class IpuStrategy(object): Args: option(str): name of option. - + Returns: option value. @@ -1076,7 +1076,7 @@ class IpuStrategy(object): Args: pattern(string): the name of the pattern. - + Returns: None. @@ -1101,7 +1101,7 @@ class IpuStrategy(object): Args: pattern(string): the name of the pattern. - + Returns: None. @@ -1156,21 +1156,21 @@ class IpuCompiledProgram(object): Args: program(Program, optional): This parameter represents the :code:`Program` - to be executed. Default is None, which means the program will be set to + to be executed. Default is None, which means the program will be set to the default program :code:`paddle.static.default_main_program()` . scope(Scope, optional): The scope used to run this program, you can switch - it to different scope. Default is None, which means use the global + it to different scope. Default is None, which means use the global scope :code:`paddle.static.global_scope()` . ipu_strategy(IpuStrategy, optional): This argument is used to build the program with the specified options, such as half computation, training or inference session, the number of IPUs, etc. - Default is None, which means build the program based on the default `ipu_strategy`. + Default is None, which means build the program based on the default `ipu_strategy`. Returns: IpuCompiledProgram Example: .. 
code-block:: python - + # required: ipu import paddle @@ -1181,12 +1181,12 @@ class IpuCompiledProgram(object): a = static.data(name='data', shape=[None, 1], dtype='int32') b = a + 1 main_prog = static.default_main_program() - + ipu_strategy = static.IpuStrategy() ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1) ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, enable_gradient_accumulation=False, accumulation_factor=1) ipu_strategy.set_precision_config(enable_fp16=False) - + ipu_compiled_program = static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy) @@ -1232,7 +1232,7 @@ class IpuCompiledProgram(object): """ This interface is used to compile the input Program to a program to run the model on the ipu. - + Args: feed_list(list): This parameter represents the input Tensors of the model. @@ -1244,14 +1244,14 @@ class IpuCompiledProgram(object): Example: .. code-block:: python - + # required: ipu - + import paddle import paddle.static as static - + paddle.enable_static() - + a = static.data(name='data', shape=[None, 1], dtype='int32') b = a + 1 main_prog = static.default_main_program() @@ -1260,7 +1260,7 @@ class IpuCompiledProgram(object): ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1) ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, enable_gradient_accumulation=False, accumulation_factor=1) ipu_strategy.set_precision_config(enable_fp16=False) - + program = static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile([a.name], [b.name]) diff --git a/python/paddle/fluid/contrib/layers/metric_op.py b/python/paddle/fluid/contrib/layers/metric_op.py index 6f72086410a..cf7fb354214 100755 --- a/python/paddle/fluid/contrib/layers/metric_op.py +++ b/python/paddle/fluid/contrib/layers/metric_op.py @@ -49,7 +49,7 @@ def ctr_metric_bundle(input, label, ins_tag_weight=None): label(Tensor): A 2D int Tensor indicating the label of the training data. The height is batch size and width is always 1. ins_tag_weight(Tensor): A 2D int Tensor indicating the ins_tag_weight of the training - data. 1 means real data, 0 means fake data. + data. 1 means real data, 0 means fake data. A LoDTensor or Tensor with type float32,float64. Returns: diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 90bf501ed5c..e241ea50640 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -1078,17 +1078,17 @@ def sparse_embedding(input, r""" :api_attr: Static Graph - The OP is used as the operator of the Embedding Lookup layer in the large-scale + The OP is used as the operator of the Embedding Lookup layer in the large-scale sparse training of the parameter server mode, instead of using the paddle.nn.functional.embedding. - The operator is used to lookup embeddings vector of ids provided by :attr:`input` . - It automatically constructs a 2D embedding matrix based on the input :attr:`size` + The operator is used to lookup embeddings vector of ids provided by :attr:`input` . + It automatically constructs a 2D embedding matrix based on the input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` . The shape of output Tensor is generated by appending an emb_size dimension to the last dimension of the input Tensor shape. 
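The output-shape rule in the paragraph above (an ``emb_size`` dimension appended to the shape of the id tensor) can be illustrated with the dense ``paddle.nn.Embedding`` layer; this is only a stand-in sketch, since ``sparse_embedding`` itself needs a parameter-server job, and the vocabulary and embedding sizes below are arbitrary.

    .. code-block:: python

        import paddle

        # dense stand-in: vocab_size=1024, emb_size=64
        embedding = paddle.nn.Embedding(1024, 64)
        ids = paddle.to_tensor([3, 7, 42], dtype='int64')  # shape [3]
        out = embedding(ids)                               # shape [3, 64]
        print(out.shape)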
- **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` , otherwise + **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` , otherwise the program will throw an exception and exit. .. code-block:: text @@ -1106,12 +1106,12 @@ def sparse_embedding(input, [[0.345249859, 0.124939536, ..., 0.194353745], [0.945345345, 0.435394634, ..., 0.435345365]], - + [[0.945345345, 0.435394634, ..., 0.435345365], [0.0, 0.0, ..., 0.0 ]]] # padding data The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 It will pad all-zero data when ids is 127. - + Case 2: input is a LoDTensor with 1-level LoD. padding_idx = 0 @@ -1130,41 +1130,41 @@ def sparse_embedding(input, It will pad all-zero data when ids is 0. Args: - input(Variable): A Tensor or LoDTensor with type int64, which contains the id + input(Variable): A Tensor or LoDTensor with type int64, which contains the id information. The value of the input id should satisfy :math:`0<= id < size[0]` . - size(tuple|list): The shape of lookup table parameter (vocab_size, emb_size). It - should have two elements which indicates the size of the dictionary of embeddings - and the size of each embedding vector respectively. The initial parameter size - is 0 in the large-scale sparse scenario, which will gradually expand with the + size(tuple|list): The shape of lookup table parameter (vocab_size, emb_size). It + should have two elements which indicates the size of the dictionary of embeddings + and the size of each embedding vector respectively. The initial parameter size + is 0 in the large-scale sparse scenario, which will gradually expand with the training. So if vocab_size is temporarily useless, its value can be any integer. The emb_size is the dimensional configuration of the word embedding weight parameter. - padding_idx(int|long|None, optional): padding_idx needs to be in the interval [-vocab_size, vocab_size). + padding_idx(int|long|None, optional): padding_idx needs to be in the interval [-vocab_size, vocab_size). If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted - to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever - lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated + to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever + lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. If set None, it makes no efe mfect to output. Default: None. - is_test(bool, optional): Training or prediction mode. In prediction mode (is_test=False), + is_test(bool, optional): Training or prediction mode. In prediction mode (is_test=False), the output is not initialized and created, and it is filled with 0 and returned. Default: False. - entry(str, optional): Entry config with parameter server whose value is ProbabilityEntry, + entry(str, optional): Entry config with parameter server whose value is ProbabilityEntry, CountFilterEntry or None. Default: None. - table_class(str, optional): The type of the sparse table. The value can be CommonSparseTable + table_class(str, optional): The type of the sparse table. The value can be CommonSparseTable or SSDSparseTable. The default is CommonSparseTable. param_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the - default weight parameter property is used. 
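A hedged NumPy sketch of the padding_idx conversion and zero padding spelled out in Case 1 and Case 2 above; the table contents and helper names below are illustrative only, not the sparse_embedding implementation.

.. code-block:: python

    # Sketch of the lookup + padding_idx behaviour described above
    # (illustrative only; the real op builds and updates the table lazily).
    import numpy as np

    vocab_size, emb_size = 128, 4
    table = np.random.rand(vocab_size, emb_size).astype('float32')

    padding_idx = -1
    if padding_idx is not None and padding_idx < 0:
        padding_idx += vocab_size          # -1 -> 127, as in Case 1 above

    ids = np.array([[3], [127]], dtype='int64')   # shape (2, 1)
    out = table[ids.reshape(-1)].reshape(ids.shape + (emb_size,))
    if padding_idx is not None:
        out[ids == padding_idx] = 0.0      # all-zero rows for padded ids

    print(out.shape)                       # (2, 1, 4): emb_size appended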
In addition, user-defined or pre-trained word - vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs - to be transformed into numpy format, and the shape of local word vector should be consistent + default weight parameter property is used. In addition, user-defined or pre-trained word + vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs + to be transformed into numpy format, and the shape of local word vector should be consistent with :attr:`size` . - dtype(str): It refers to the data type of output Tensor. It must be float32 or + dtype(str): It refers to the data type of output Tensor. It must be float32 or float64. Default: float32. - + Returns: Variable: Embedding Tensor or LoDTensor mapped by input. The data type is the same as :attr:`dtype` . - + Examples: .. code-block:: python import paddle - + paddle.enable_static() sparse_feature_dim = 1024 embedding_size = 64 @@ -1173,7 +1173,7 @@ def sparse_embedding(input, entry = paddle.distributed.CountFilterEntry(10) input = paddle.static.data(name='ins', shape=[1], dtype='int64') - + emb = paddle.static.nn.sparse_embedding( input=input, size=[sparse_feature_dim, embedding_size], @@ -1246,7 +1246,7 @@ def sparse_embedding(input, def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'): """ **Tdm Child** - According to the input node_id on the given tree, return the corresponding child node_id and + According to the input node_id on the given tree, return the corresponding child node_id and whether child is a leaf node by leaf_mask value. .. code-block:: text @@ -1267,17 +1267,17 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'): child_nums(int): Maximum number of child nodes per node. param_attr(ParamAttr): To specify the tdm-tree-info parameter property. Default: None, which means the default weight parameter property is used. See usage for details in: ref: `api_fluid_ParamAttr`, should - has shape(node_nums, 3 + child_nums), dtype support int32/int64. - The dimension[1] of tdm-tree-info contains the following: + has shape(node_nums, 3 + child_nums), dtype support int32/int64. + The dimension[1] of tdm-tree-info contains the following: 1. Item_id(int, shape(1)), if node is a leaf node, give its item_id corresponding to node_id, else give 0. 2. Layer_id(int, shape(1)), indicates which layer the node is on. 3. Parent_id(int, shape(1)), node's parent node. - 4. Child_id(int, shape(child_nums)), all child node's node_id of this node should be given. + 4. Child_id(int, shape(child_nums)), all child node's node_id of this node should be given. If the number of child nodes is insufficient, padding 0 until child nums equal to child_nums dtype(str): The data type of output child and leaf_mask, support int32/int64. Returns: - tuple: A tuple including input node's child(Variable) and leaf_mask(Variable). + tuple: A tuple including input node's child(Variable) and leaf_mask(Variable). If child is a leaf node, leaf_mask equal ot 1, otherwise equal to 0. Examples: @@ -1370,22 +1370,22 @@ def tdm_sampler(x, layer_node_num_list (list(int)): Number of nodes per layer, must has same shape with neg_samples_num_list. leaf_node_num (int): Number of leaf nodes. tree_travel_attr (ParamAttr): To specify the tdm-travel parameter property. Default: None, which means the - default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr`, should + default weight parameter property is used. 
See usage for details in :ref:`api_fluid_ParamAttr`, should has shape (leaf_node_num, len(layer_node_num_list)), dtype support int32/int64. tree_layer_attr (ParamAttr): To specify the tdm-layer parameter property. Default: None, which means the - default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr`, should + default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr`, should has shape (node_num, 1), dtype support int32/int64. output_positive (bool): Whether to output positive samples (includ label and mask )at the same time. output_list (bool): Whether to divide the output into layers and organize it into list format. seed (int): The number of random seed. tree_dtype(np.dtype|core.VarDesc.VarType|str): The dtype of tdm-travel and tdm-layer, support int32/int64 - dtype(np.dtype|core.VarDesc.VarType|str): The dtype of output(sampling results, labels and masks) + dtype(np.dtype|core.VarDesc.VarType|str): The dtype of output(sampling results, labels and masks) Returns: tuple: A tuple including sampling results, corresponding labels and masks. if output_positive = True, sampling - result will include both positive and negative samples. If sampling reseult is a positive sample, the label is 1, - and if it is a negative sample, it is 0. If the tree is unbalanced, in order to ensure the consistency of the - sampling result shape, the padding sample's mask = 0, the real sample's mask value = 1. + result will include both positive and negative samples. If sampling reseult is a positive sample, the label is 1, + and if it is a negative sample, it is 0. If the tree is unbalanced, in order to ensure the consistency of the + sampling result shape, the padding sample's mask = 0, the real sample's mask value = 1. If output_list = True, the result will organize into list format specified by layer information. Output variable have same type with tdm-travel and tdm-layer parameter(tree_dtype). @@ -1556,7 +1556,7 @@ def rank_attention(input, max_size=0): """ **Rank Attention layer** - This Op can calculate rank attention between input and rank_param, and + This Op can calculate rank attention between input and rank_param, and rank_param gives the organization of data. Notice: It currently supports GPU device. This Op exists in contrib, which means that it is not shown to the public. @@ -1623,8 +1623,8 @@ def rank_attention(input, def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None): """ **Batch FC layer** - This Op can calculate BatchFC. This is similar to matmul op, - except that the bias and relu activation layers are added. + This Op can calculate BatchFC. This is similar to matmul op, + except that the bias and relu activation layers are added. Notice: It currently supports GPU device. This Op exists in contrib, which means that it is not shown to the public. Args: @@ -1640,7 +1640,7 @@ def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None): Examples: .. code-block:: python import paddle.fluid as fluid - + input = fluid.data(name="input", shape=[16, 2, 3], dtype="float32") out = fluid.contrib.layers.batch_fc(input=input, param_size=[16, 3, 10], @@ -1699,7 +1699,7 @@ def _pull_box_extended_sparse(input, size, extend_size=64, dtype='float32'): contains the IDs information. size(int): The embedding size parameter, which indicates the size of each embedding vector respectively. 
- extend_size(int): The embedding size parameter in extended dim, + extend_size(int): The embedding size parameter in extended dim, which indicates the size of each embedding vector respectively. dtype(str): The dtype refers to the data type of output tensor. Only supports float32 now. @@ -1776,7 +1776,7 @@ def bilateral_slice(x, guide, grid, has_offset, name=None): # without offset output = fluid.contrib.bilateral_slice(x, guide, grid, has_offset=False) - + # has offset output = fluid.contrib.bilateral_slice(x, guide, grid, has_offset=True) @@ -1811,8 +1811,8 @@ def correlation(x, """ This operation compute correlation of two tensor. - For more information of correlation, please refer to PWC-Net: - CNNs for Optical Flow Using Pyramid, Warping, and Cost Volume + For more information of correlation, please refer to PWC-Net: + CNNs for Optical Flow Using Pyramid, Warping, and Cost Volume _ Args: @@ -1977,7 +1977,7 @@ def fused_bn_add_act(x, startup_program = fluid.Program() place = fluid.CUDAPlace(0) x, y, loss = build_program(main_program, startup_program) - + feeder = fluid.DataFeeder(feed_list=[x, y], place=place) train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=batch_size) diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py index 0b14948bff9..95ce0a6ba53 100644 --- a/python/paddle/fluid/contrib/layers/rnn_impl.py +++ b/python/paddle/fluid/contrib/layers/rnn_impl.py @@ -47,7 +47,7 @@ class BasicGRUUnit(Layer): is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|None): The parameter attribute for the bias of GRU unit. - If it is set to None or one attribute of ParamAttr, gru_unit will + If it is set to None or one attribute of ParamAttr, gru_unit will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. gate_activation (function|None): The activation function for gates (actGate). @@ -186,8 +186,8 @@ def basic_gru(input, h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t) Args: - input (Variable): GRU input tensor, - if batch_first = False, shape should be ( seq_len x batch_size x input_size ) + input (Variable): GRU input tensor, + if batch_first = False, shape should be ( seq_len x batch_size x input_size ) if batch_first = True, shape should be ( batch_size x seq_len x hidden_size ) init_hidden(Variable|None): The initial hidden state of the GRU This is a tensor with shape ( num_layers x batch_size x hidden_size) @@ -199,7 +199,7 @@ def basic_gru(input, sequence_length (Variabe|None): A Tensor (shape [batch_size]) stores each real length of each instance, This tensor will be convert to a mask to mask the padding ids If it's None means NO padding ids - dropout_prob(float|0.0): Dropout prob, dropout ONLY works after rnn output of each layers, + dropout_prob(float|0.0): Dropout prob, dropout ONLY works after rnn output of each layers, NOT between time steps bidirectional (bool|False): If it is bidirectional batch_first (bool|True): The shape format of the input and output tensors. If true, @@ -214,7 +214,7 @@ def basic_gru(input, is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|None): The parameter attribute for the bias of GRU unit. - If it is set to None or one attribute of ParamAttr, gru_unit will + If it is set to None or one attribute of ParamAttr, gru_unit will create ParamAttr as bias_attr. 
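One time step of the GRU recurrence that the basic_gru docstring summarises (h_t = u_t * h_{t-1} + (1 - u_t) * m_t), as a self-contained NumPy sketch; the weight names and shapes are illustrative assumptions, not the layer's parameters.

.. code-block:: python

    # One GRU step matching h_t = u_t * h_{t-1} + (1 - u_t) * m_t above;
    # a NumPy sketch with made-up weight names, not basic_gru itself.
    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def gru_step(x_t, h_prev, W_u, W_r, W_m, b_u, b_r, b_m):
        xh = np.concatenate([x_t, h_prev], axis=-1)
        u_t = sigmoid(xh @ W_u + b_u)                      # update gate
        r_t = sigmoid(xh @ W_r + b_r)                      # reset gate
        xrh = np.concatenate([x_t, r_t * h_prev], axis=-1)
        m_t = np.tanh(xrh @ W_m + b_m)                     # candidate state
        return u_t * h_prev + (1.0 - u_t) * m_t

    input_size, hidden_size, batch = 8, 16, 4
    rng = np.random.default_rng(0)
    W = lambda: rng.standard_normal((input_size + hidden_size, hidden_size)) * 0.1
    b = np.zeros(hidden_size)
    h = gru_step(rng.standard_normal((batch, input_size)),
                 np.zeros((batch, hidden_size)),
                 W(), W(), W(), b, b, b)
    print(h.shape)   # (4, 16)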
If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. gate_activation (function|None): The activation function for gates (actGate). @@ -235,7 +235,7 @@ def basic_gru(input, Examples: .. code-block:: python - + import paddle.fluid.layers as layers from paddle.fluid.contrib.layers import basic_gru @@ -438,8 +438,8 @@ def basic_lstm(input, h_t &= o_t \odot tanh(c_t) Args: - input (Variable): lstm input tensor, - if batch_first = False, shape should be ( seq_len x batch_size x input_size ) + input (Variable): lstm input tensor, + if batch_first = False, shape should be ( seq_len x batch_size x input_size ) if batch_first = True, shape should be ( batch_size x seq_len x hidden_size ) init_hidden(Variable|None): The initial hidden state of the LSTM This is a tensor with shape ( num_layers x batch_size x hidden_size) @@ -456,7 +456,7 @@ def basic_lstm(input, sequence_length (Variabe|None): A tensor (shape [batch_size]) stores each real length of each instance, This tensor will be convert to a mask to mask the padding ids If it's None means NO padding ids - dropout_prob(float|0.0): Dropout prob, dropout ONLY work after rnn output of each layers, + dropout_prob(float|0.0): Dropout prob, dropout ONLY work after rnn output of each layers, NOT between time steps bidirectional (bool|False): If it is bidirectional batch_first (bool|True): The shape format of the input and output tensors. If true, @@ -471,7 +471,7 @@ def basic_lstm(input, is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|None): The parameter attribute for the bias of LSTM unit. - If it is set to None or one attribute of ParamAttr, lstm_unit will + If it is set to None or one attribute of ParamAttr, lstm_unit will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. gate_activation (function|None): The activation function for gates (actGate). @@ -497,7 +497,7 @@ def basic_lstm(input, Examples: .. code-block:: python - + import paddle.fluid.layers as layers from paddle.fluid.contrib.layers import basic_lstm @@ -740,7 +740,7 @@ class BasicLSTMUnit(Layer): is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|None): The parameter attribute for the bias of LSTM unit. - If it is set to None or one attribute of ParamAttr, lstm_unit will + If it is set to None or one attribute of ParamAttr, lstm_unit will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized as zero. Default: None. gate_activation (function|None): The activation function for gates (actGate). diff --git a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py index 62b98e75ea1..5b6bb9aeffa 100644 --- a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py +++ b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py @@ -27,8 +27,8 @@ def check_finite_and_unscale(x, scale, name=None, float_status=None): $$Out = X / scale$$ If any tensor in X contains Inf or Nan, the Out will generate a indicator. - FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of - Out should not be used, and its data may not be deterministic. + FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of + Out should not be used, and its data may not be deterministic. Otherwise, FoundInfinite will be 0 (False). 
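The contract described above (Out = X / scale, with a found-infinite flag raised when any value is Inf or NaN) in a minimal NumPy sketch; this is illustrative, not the operator's code.

.. code-block:: python

    # Sketch of the documented contract of check_finite_and_unscale:
    # out = x / scale, and found_inf is set if any value is Inf or NaN.
    import numpy as np

    def check_finite_and_unscale(xs, scale):
        outs = [x / scale for x in xs]
        found_inf = any(not np.all(np.isfinite(o)) for o in outs)
        # when found_inf is True the outputs must not be trusted
        return outs, found_inf

    grads = [np.array([1.0, 2.0]), np.array([np.inf, 0.0])]
    _, found_inf = check_finite_and_unscale(grads, scale=1024.0)
    print(found_inf)   # True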
Args: @@ -70,29 +70,29 @@ def update_loss_scaling(x, stop_update=False, name=None): """ - Update loss scaling according to overall gradients. If all gradients is - finite after incr_every_n_steps, loss scaling will increase by incr_ratio. + Update loss scaling according to overall gradients. If all gradients is + finite after incr_every_n_steps, loss scaling will increase by incr_ratio. Otherwise, loss scaling will decrease by decr_ratio after decr_every_n_nan_or_inf steps and each step some gradients are infinite. Args: x(list|tuple): The input tensors of update_loss_scaling operator. - found_inf (Variable): A boolean variable indicates whether + found_inf (Variable): A boolean variable indicates whether there is any infinite gradient. prev_loss_scaling (Variable): Previous loss scaling. - num_good_steps (Variable): A variable accumulates good steps in which + num_good_steps (Variable): A variable accumulates good steps in which all gradients are finite. - num_bad_steps (Variable): A variable accumulates bad steps in which + num_bad_steps (Variable): A variable accumulates bad steps in which some gradients are infinite. - incr_every_n_steps (int): A variable represents increasing loss - scaling every n consecutive steps with + incr_every_n_steps (int): A variable represents increasing loss + scaling every n consecutive steps with finite gradients. - decr_every_n_nan_or_inf (int): A variable represents decreasing - loss scaling every n accumulated + decr_every_n_nan_or_inf (int): A variable represents decreasing + loss scaling every n accumulated steps with nan or inf gradients. - incr_ratio(float): The multiplier to use when increasing the loss + incr_ratio(float): The multiplier to use when increasing the loss scaling. - decr_ratio(float): The less-than-one-multiplier to use when decreasing + decr_ratio(float): The less-than-one-multiplier to use when decreasing loss scaling. """ diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py b/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py index 41fce89a9e9..0551cf1aace 100644 --- a/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py @@ -25,11 +25,11 @@ __all__ = ["decorate_bf16"] class OptimizerWithMixedPrecision(object): """ - Optimizer with mixed-precision (MP) training. This is a wrapper of a common + Optimizer with mixed-precision (MP) training. This is a wrapper of a common optimizer, plus the support of mixed-precision pre-training. The object - of this class almost has the same behavior as the common optimizer, with the - methods `minimize()`, `backward()`, `apply_gradients()` implemented. - Additionally, it enables the MP training automatically, i.e, the creation + of this class almost has the same behavior as the common optimizer, with the + methods `minimize()`, `backward()`, `apply_gradients()` implemented. + Additionally, it enables the MP training automatically, i.e, the creation and maintenance of master parameters, scaling of loss, etc. Args: @@ -77,7 +77,7 @@ class OptimizerWithMixedPrecision(object): Args: loss (Variable): The loss Variable to minimize. - startup_program (Program|None): The startup Program for initializing + startup_program (Program|None): The startup Program for initializing parameters in `parameter_list`. parameter_list (list|None): A list of Variables to update. no_grad_set (set|None): A set of Variables should be ignored. 
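The incr_every_n_steps / decr_every_n_nan_or_inf / incr_ratio / decr_ratio arguments documented above amount to a counter-based policy; the standalone sketch below shows that policy, with the exact reset details being an assumption of the sketch rather than the operator's code.

.. code-block:: python

    # Counter-based dynamic loss scaling, following the semantics of the
    # update_loss_scaling arguments documented above (illustrative only).
    def update_loss_scaling(loss_scaling, num_good_steps, num_bad_steps,
                            found_inf, incr_every_n_steps=1000,
                            decr_every_n_nan_or_inf=2, incr_ratio=2.0,
                            decr_ratio=0.5):
        if found_inf:
            num_good_steps, num_bad_steps = 0, num_bad_steps + 1
            if num_bad_steps == decr_every_n_nan_or_inf:
                loss_scaling = max(loss_scaling * decr_ratio, 1.0)
                num_bad_steps = 0
        else:
            num_good_steps, num_bad_steps = num_good_steps + 1, 0
            if num_good_steps == incr_every_n_steps:
                loss_scaling *= incr_ratio
                num_good_steps = 0
        return loss_scaling, num_good_steps, num_bad_steps

    state = (32768.0, 0, 0)
    state = update_loss_scaling(*state, found_inf=True,
                                decr_every_n_nan_or_inf=1)
    print(state)   # (16384.0, 0, 0): halved after one bad step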
@@ -85,7 +85,7 @@ class OptimizerWithMixedPrecision(object): backward operator for one parameter. Returns: - A list of (param, grad), which is a tuple of a parameter and its + A list of (param, grad), which is a tuple of a parameter and its gradient respectively, and the scaled loss. """ train_program = loss.block.program @@ -116,9 +116,9 @@ class OptimizerWithMixedPrecision(object): use_bf16_test=False): """ Init the amp training, such as cast fp32 parameters to bf16 type. - + Args: - place(CPUPlace): place is used to initialize + place(CPUPlace): place is used to initialize bf16 parameters with fp32 values. scope(Scope): The scope is used to find fp32 parameters. test_program(Program): The program is used for testing. @@ -145,7 +145,7 @@ class OptimizerWithMixedPrecision(object): loss = paddle.mean(hidden) # 2) Create the optimizer and set `multi_precision` to True. # Setting `multi_precision` to True can avoid the poor accuracy - # or the slow convergence in a way. + # or the slow convergence in a way. optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True) # 3) These ops in `custom_fp32_list` will keep in the float32 computation type. amp_list = paddle.static.amp.CustomOpLists( @@ -163,7 +163,7 @@ class OptimizerWithMixedPrecision(object): # 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`). # If you want to perform the testing process, you should pass `test_program` into `amp_init`. optimizer.amp_init(place, scope=paddle.static.global_scope()) - + """ assert self._train_program is not None, \ "Please call the minimize method first." @@ -181,10 +181,10 @@ class OptimizerWithMixedPrecision(object): def apply_gradients(self, params_grads): """ Apply gradients. - + Args: params_grads (list): A list of params. - + Returns: A list of optimize operators. """ @@ -237,7 +237,7 @@ def decorate_bf16(optimizer, amp_lists=None, use_pure_bf16=False, use_bf16_guard=None): - """ + """ Decorate the given optimizer to adapt to the mixed-precision training. Args: @@ -248,7 +248,7 @@ def decorate_bf16(optimizer, Default None, which means that its value equals to `use_pure_bf16`. Returns: - An optimizer acting like a normal one but with mixed-precision training + An optimizer acting like a normal one but with mixed-precision training enabled. Examples 1: @@ -290,7 +290,7 @@ def decorate_bf16(optimizer, loss = paddle.mean(hidden) # 2) Create the optimizer and set `multi_precision` to True. # Setting `multi_precision` to True can avoid the poor accuracy - # or the slow convergence in a way. + # or the slow convergence in a way. optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True) # 3) These ops in `custom_fp32_list` will keep in the float32 computation type. amp_list = paddle.static.amp.CustomOpLists( @@ -308,7 +308,7 @@ def decorate_bf16(optimizer, # 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`). # If you want to perform the testing process, you should pass `test_program` into `amp_init`. 
optimizer.amp_init(place, scope=paddle.static.global_scope()) - + """ if amp_lists is None: amp_lists = AutoMixedPrecisionListsBF16() diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index 787a4e90a0f..70aefac5a59 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -36,11 +36,11 @@ __all__ = ["decorate"] class OptimizerWithMixedPrecision(object): """ - Optimizer with mixed-precision (MP) training. This is a wrapper of a common + Optimizer with mixed-precision (MP) training. This is a wrapper of a common optimizer, plus the support of mixed-precision pre-training. The object - of this class almost has the same behavior as the common optimizer, with the - methods `minimize()`, `backward()`, `apply_gradients()` implemented. - Additionally, it enables the MP training automatically, i.e, the creation + of this class almost has the same behavior as the common optimizer, with the + methods `minimize()`, `backward()`, `apply_gradients()` implemented. + Additionally, it enables the MP training automatically, i.e, the creation and maintenance of master parameters, scaling of loss, etc. Args: @@ -48,14 +48,14 @@ class OptimizerWithMixedPrecision(object): amp_lists (CustomOpLists): An CustomOpLists object. init_loss_scaling (float): The initial loss scaling factor. use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling. - incr_every_n_steps(int): Increases loss scaling every n consecutive + incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients. - decr_every_n_nan_or_inf(int): Decreases loss scaling every n - accumulated steps with nan or + decr_every_n_nan_or_inf(int): Decreases loss scaling every n + accumulated steps with nan or inf gradients. - incr_ratio(float): The multiplier to use when increasing the loss + incr_ratio(float): The multiplier to use when increasing the loss scaling. - decr_ratio(float): The less-than-one-multiplier to use when decreasing + decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling. use_pure_fp16(bool): Whether to use the pure fp16 training. Default False. use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program. @@ -155,7 +155,7 @@ class OptimizerWithMixedPrecision(object): Args: loss (Variable): The loss Variable to minimize. - startup_program (Program|None): The startup Program for initializing + startup_program (Program|None): The startup Program for initializing parameters in `parameter_list`. parameter_list (list|None): A list of Variables to update. no_grad_set (set|None): A set of Variables should be ignored. @@ -163,7 +163,7 @@ class OptimizerWithMixedPrecision(object): backward operator for one parameter. Returns: - A list of (param, grad), which is a tuple of a parameter and its + A list of (param, grad), which is a tuple of a parameter and its gradient respectively, and the scaled loss. """ train_program = loss.block.program @@ -244,9 +244,9 @@ class OptimizerWithMixedPrecision(object): use_fp16_test=False): """ Init the amp training, such as cast fp32 parameters to fp16 type. - + Args: - place(CUDAPlace): place is used to initialize + place(CUDAPlace): place is used to initialize fp16 parameters with fp32 values. scope(Scope): The scope is used to find fp32 parameters. test_program(Program): The program is used for testing. 
@@ -273,7 +273,7 @@ class OptimizerWithMixedPrecision(object): loss = paddle.mean(hidden) # 2) Create the optimizer and set `multi_precision` to True. # Setting `multi_precision` to True can avoid the poor accuracy - # or the slow convergence in a way. + # or the slow convergence in a way. optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True) # 3) These ops in `custom_black_list` will keep in the float32 computation type. amp_list = paddle.static.amp.CustomOpLists( @@ -293,9 +293,9 @@ class OptimizerWithMixedPrecision(object): # 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`). # If you want to perform the testing process, you should pass `test_program` into `amp_init`. optimizer.amp_init(place, scope=paddle.static.global_scope()) - + if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0: - run_example_code() + run_example_code() """ assert self._train_program is not None, \ "Please call the minimize method first." @@ -311,12 +311,12 @@ class OptimizerWithMixedPrecision(object): def apply_gradients(self, params_grads): """ - Check scaled gradients to determine whether to update loss scaling and update + Check scaled gradients to determine whether to update loss scaling and update parameters by their scaled gradients. - + Args: params_grads (list): A list of params and scaled grads. - + Returns: A list of optimize operators. """ @@ -538,21 +538,21 @@ def decorate(optimizer, use_dynamic_loss_scaling=True, use_pure_fp16=False, use_fp16_guard=None): - """ + """ Decorate the given optimizer to adapt to the mixed-precision training. Args: optimizer(Optimizer): A common Optimizer. amp_lists (CustomOpLists): An CustomOpLists object. init_loss_scaling(float): The initial loss scaling factor. - incr_every_n_steps(int): Increases loss scaling every n consecutive + incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients. - decr_every_n_nan_or_inf(int): Decreases loss scaling every n - accumulated steps with nan or + decr_every_n_nan_or_inf(int): Decreases loss scaling every n + accumulated steps with nan or inf gradients. - incr_ratio(float): The multiplier to use when increasing the loss + incr_ratio(float): The multiplier to use when increasing the loss scaling. - decr_ratio(float): The less-than-one-multiplier to use when decreasing + decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling. use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. use_pure_fp16(bool): Whether to use the pure fp16 training. Default False. @@ -560,7 +560,7 @@ def decorate(optimizer, Default None, which means that its value equals to `use_pure_fp16`. Returns: - An optimizer acting like a normal one but with mixed-precision training + An optimizer acting like a normal one but with mixed-precision training enabled. Examples 1: @@ -604,7 +604,7 @@ def decorate(optimizer, loss = paddle.mean(hidden) # 2) Create the optimizer and set `multi_precision` to True. # Setting `multi_precision` to True can avoid the poor accuracy - # or the slow convergence in a way. + # or the slow convergence in a way. optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True) # 3) These ops in `custom_black_list` will keep in the float32 computation type. amp_list = paddle.static.amp.CustomOpLists( @@ -624,7 +624,7 @@ def decorate(optimizer, # 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`). 
# If you want to perform the testing process, you should pass `test_program` into `amp_init`. optimizer.amp_init(place, scope=paddle.static.global_scope()) - + if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0: run_example_code() """ diff --git a/python/paddle/fluid/contrib/model_stat.py b/python/paddle/fluid/contrib/model_stat.py index ed6d82671f2..3bcc3306895 100644 --- a/python/paddle/fluid/contrib/model_stat.py +++ b/python/paddle/fluid/contrib/model_stat.py @@ -39,9 +39,9 @@ from collections import OrderedDict def summary(main_prog): ''' It can summary model's PARAMS, FLOPs until now. - It support common operator like conv, fc, pool, relu, sigmoid, bn etc. + It support common operator like conv, fc, pool, relu, sigmoid, bn etc. Args: - main_prog: main program + main_prog: main program Returns: print summary on terminal ''' @@ -74,7 +74,7 @@ def _summary_model(block_vars, one_op): Returns: in_data_shape: one operator's input data shape out_data_shape: one operator's output data shape - params: one operator's PARAMs + params: one operator's PARAMs flops: : one operator's FLOPs ''' if one_op.type in ['conv2d', 'depthwise_conv2d']: diff --git a/python/paddle/fluid/contrib/optimizer.py b/python/paddle/fluid/contrib/optimizer.py index 67e972cf3e2..dc6dc213a8f 100644 --- a/python/paddle/fluid/contrib/optimizer.py +++ b/python/paddle/fluid/contrib/optimizer.py @@ -60,9 +60,9 @@ class Momentum(Optimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false. rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \ diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py index cccc5d90fba..23affb658cc 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py @@ -110,11 +110,11 @@ class ImperativePTQ(object): Args: model (Layer): The model to be saved. - path (str): The path prefix to save model. The format is + path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model's forward method, which can be described by - InputSpec or example Tensor. If None, all input variables of + InputSpec or example Tensor. If None, all input variables of the original Layer's forward method would be the inputs of the saved model. Default None. 
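Setting `multi_precision=True`, as in the examples above, keeps an FP32 master copy of each parameter while the forward/backward pass runs in low precision; the NumPy lines below sketch that idea and are not Paddle's optimizer implementation.

.. code-block:: python

    # Schematic master-weight update behind `multi_precision=True`;
    # illustrative only, not the Momentum optimizer's implementation.
    import numpy as np

    master_w = np.random.rand(4).astype('float32')    # FP32 master copy
    w16 = master_w.astype('float16')                   # low-precision compute copy

    grad16 = np.array([1e-4, -1e-4, 5e-5, 0.0], dtype='float16')
    lr = 0.1

    master_w -= lr * grad16.astype('float32')          # update applied in FP32
    w16 = master_w.astype('float16')                    # re-cast for the next step
    print(master_w.dtype, w16.dtype)                    # float32 float16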
**configs (dict, optional): Other save configuration options for @@ -125,9 +125,9 @@ class ImperativePTQ(object): (1) output_spec (list[Tensor]): Selects the output targets of the saved model. By default, all return variables of original Layer's forward method are kept as the output of the saved model. - If the provided ``output_spec`` list is not all output variables, + If the provided ``output_spec`` list is not all output variables, the saved model will be pruned according to the given - ``output_spec`` list. + ``output_spec`` list. Returns: None diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_config.py b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_config.py index 384d2c704fd..341e2f38025 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_config.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_config.py @@ -36,7 +36,7 @@ class PTQConfig(object): activation_quantizer(BaseQuantizer): The activation quantizer. It should be the instance of BaseQuantizer. weight_quantizer(BaseQuantizer): The weight quantizer. - It should be the instance of BaseQuantizer. + It should be the instance of BaseQuantizer. """ super(PTQConfig, self).__init__() assert isinstance(activation_quantizer, tuple(SUPPORT_ACT_QUANTIZERS)) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 84359f71153..8bc16526c3d 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -121,7 +121,7 @@ class ImperativeQuantAware(object): Using this can quickly test if user's quantization method works or not. In this layer, user should both define quantization method and dequantization method, that is, the function's input is non-quantized - activation and returns dequantized activation. + activation and returns dequantized activation. If None, will use quantization op defined by 'activation_quantize_type'. Default is None. @@ -139,13 +139,13 @@ class ImperativeQuantAware(object): import ImperativeQuantAware from paddle.vision.models \ import resnet - + model = resnet.resnet50(pretrained=True) imperative_qat = ImperativeQuantAware( weight_quantize_type='abs_max', activation_quantize_type='moving_average_abs_max') - + # Add the fake quant logical. # The original model will be rewrite. # The outscale of outputs in supportted layers would be calculated. @@ -153,7 +153,7 @@ class ImperativeQuantAware(object): # Fine-tune the quantized model # ... - + # Save quant model for the inference. imperative_qat.save_quantized_model( layer=model, @@ -306,7 +306,7 @@ class ImperativeQuantizeInputs(object): weight_quantize_layer=None, act_quantize_layer=None): """ - The constructor for ImperativeQuantizeInputs. + The constructor for ImperativeQuantizeInputs. Please refer to the args of ImperativeQuantAware. """ @@ -367,7 +367,7 @@ class ImperativeQuantizeInputs(object): def apply(self, model): """ - Quantize the weights and activations to calculate for specific + Quantize the weights and activations to calculate for specific layers. Args: @@ -469,14 +469,14 @@ class ImperativeQuantizeOutputs(object): Args: model (Layer): The model to be saved. - path (str): The path prefix to save model. The format is + path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. 
input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model's forward method, which can be described by - InputSpec or example Tensor. If None, all input variables of + InputSpec or example Tensor. If None, all input variables of the original Layer's forward method would be the inputs of the saved model. Default None. - onnx_format (bool, optional): Whether to export the quantized model + onnx_format (bool, optional): Whether to export the quantized model with format of ONNX. Default is False. **configs (dict, optional): Other save configuration options for compatibility. We do not recommend using these configurations, @@ -486,9 +486,9 @@ class ImperativeQuantizeOutputs(object): (1) output_spec (list[Tensor]): Selects the output targets of the saved model. By default, all return variables of original Layer's forward method are kept as the output of the saved model. - If the provided ``output_spec`` list is not all output variables, + If the provided ``output_spec`` list is not all output variables, the saved model will be pruned according to the given - ``output_spec`` list. + ``output_spec`` list. Returns: None diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 4e37ba05b68..d0c885eef47 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -110,7 +110,7 @@ def _apply_pass(scope, class PostTrainingQuantization(object): """ Utilizing post training quantization methon to quantize the FP32 model, - and it uses calibrate data to get the quantization information for all + and it uses calibrate data to get the quantization information for all quantized variables. """ @@ -151,18 +151,18 @@ class PostTrainingQuantization(object): Args: executor(fluid.Executor): The executor to load, run and save the quantized model. - scope(fluid.Scope, optional): The scope of the program, use it to load - and save variables. If scope=None, get scope by global_scope(). - model_dir(str): The path of the fp32 model that will be quantized, + scope(fluid.Scope, optional): The scope of the program, use it to load + and save variables. If scope=None, get scope by global_scope(). + model_dir(str): The path of the fp32 model that will be quantized, and the model and params files are under the path. - model_filename(str, optional): The name of file to load the inference - program. If it is None, the default filename '__model__' will + model_filename(str, optional): The name of file to load the inference + program. If it is None, the default filename '__model__' will be used. Default is 'None'. params_filename(str, optional): The name of file to load all parameters. - When all parameters were saved in a single binary file, set it - as the real filename. If parameters were saved in separate files, + When all parameters were saved in a single binary file, set it + as the real filename. If parameters were saved in separate files, set it as 'None'. Default is 'None'. - batch_generator(Python Generator): The batch generator provides + batch_generator(Python Generator): The batch generator provides calibrate data for DataLoader, and it returns a batch every time. Note that, sample_generator and batch_generator, only one should be set. Beisdes, batch_generator supports lod tensor. 
@@ -174,31 +174,31 @@ class PostTrainingQuantization(object): Generator or Dataloader provides calibrate data, and it could return a batch every time. batch_size(int, optional): The batch size of DataLoader. Default is 10. - batch_nums(int, optional): If batch_nums is not None, the number of - calibrate data is batch_size*batch_nums. If batch_nums is None, use + batch_nums(int, optional): If batch_nums is not None, the number of + calibrate data is batch_size*batch_nums. If batch_nums is None, use all data provided by sample_generator as calibrate data. algo(str, optional): If algo='KL', use KL-divergenc method to get the KL threshold for quantized activations and get the abs_max - value for quantized weights. If algo='abs_max', get the abs max - value for activations and weights. If algo= 'min_max', get the min + value for quantized weights. If algo='abs_max', get the abs max + value for activations and weights. If algo= 'min_max', get the min and max value for quantized activations and weights. If algo='avg', - get the average value among the max values for activations. If + get the average value among the max values for activations. If algo= 'hist', get the value of 'hist_percent' quantile as the threshold. - If algo='mse', get the value which makes the quantization mse loss + If algo='mse', get the value which makes the quantization mse loss minimal. Default is KL. hist_percent(float, optional): The threshold of algo 'hist' for activations. Default is 0.99999. - quantizable_op_type(list[str], optional): List the type of ops - that will be quantized. Default is ["conv2d", "depthwise_conv2d", + quantizable_op_type(list[str], optional): List the type of ops + that will be quantized. Default is ["conv2d", "depthwise_conv2d", "mul"]. round_type(str, optional): The method of converting the quantized weights value float->int. Currently supports ['round', 'adaround'] methods. Default is `round`, which is rounding nearest to the integer. 'adaround' is refer to https://arxiv.org/abs/2004.10568. learning_rate(float, optional): The learning rate of adaround method. - is_full_quantized(bool, optional): If set is_full_quantized as True, + is_full_quantized(bool, optional): If set is_full_quantized as True, apply quantization to all supported quantizable op type. If set - is_full_quantized as False, only apply quantization to the op type + is_full_quantized as False, only apply quantization to the op type according to the input quantizable_op_type. bias_correction(bool, optional): If set as True, use the bias correction method of https://arxiv.org/abs/1810.05723. Default is False. @@ -217,11 +217,11 @@ class PostTrainingQuantization(object): the model accuracy is usually higher when it is 'channel_wise_abs_max'. onnx_format(bool): Whether to export the quantized model with format of ONNX. Default is False. - freeze_model(bool): Whether to convert quantized and trained ``program`` to final + freeze_model(bool): Whether to convert quantized and trained ``program`` to final quantized ``program``. Default: True. skip_tensor_list(list): List of skip quant tensor name. Default: None. - same_scale_tensor_list(list(list)): The list of tensor keep same scale in the outermost - list, the final scale about every list is the max of the scale in the list + same_scale_tensor_list(list(list)): The list of tensor keep same scale in the outermost + list, the final scale about every list is the max of the scale in the list of tensor. Default: None. 
optimize_model(bool, optional): If set optimize_model as True, it applies some passes to the model before quantization, and it supports @@ -240,17 +240,17 @@ class PostTrainingQuantization(object): .. code-block:: python import paddle.fluid as fluid from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization - + exe = fluid.Executor(fluid.CPUPlace()) model_dir = path/to/fp32_model_params - # set model_filename as None when the filename is __model__, + # set model_filename as None when the filename is __model__, # otherwise set it as the real filename - model_filename = None - # set params_filename as None when all parameters were saved in + model_filename = None + # set params_filename as None when all parameters were saved in # separate files, otherwise set it as the real filename params_filename = None save_model_path = path/to/save_model_path - # prepare the sample generator according to the model, and the + # prepare the sample generator according to the model, and the # sample generator must return a sample every time. The reference # document: https://www.paddlepaddle.org.cn/documentation/docs/zh # /user_guides/howto/prepare_data/use_py_reader.html @@ -621,7 +621,7 @@ class PostTrainingQuantization(object): def _set_activation_persistable(self): ''' - Set activation variables to be persistable, so can obtain + Set activation variables to be persistable, so can obtain the tensor data in sample_data ''' for var in self._program.list_vars(): @@ -983,8 +983,8 @@ class PostTrainingQuantization(object): def _update_program(self): ''' - Use QuantizationTransformPass and AddQuantDequantPass to insert - fake_quantize, fake_dequantize and fake_quant_dequant op. + Use QuantizationTransformPass and AddQuantDequantPass to insert + fake_quantize, fake_dequantize and fake_quant_dequant op. Besides, save all threshold to the scale var node. ''' _logger.info("Update the program ...") @@ -1326,36 +1326,36 @@ class WeightQuantization(object): threshold_rate=0.0): ''' In order to reduce the size of model, this api quantizes the weight - of some ops from float32 to int8/16. In the inference stage, the + of some ops from float32 to int8/16. In the inference stage, the quantized weight will be dequantized to float32 again. - + Args: save_model_dir(str): The path to save the quantized model. - save_model_filename(str, optional): The name of file to - save the inference program. If it is None, the default + save_model_filename(str, optional): The name of file to + save the inference program. If it is None, the default filename '__model__' will be used. Default is 'None'. - save_params_filename(str, optional): The name of file to - save all parameters. If it is None, parameters were - saved in separate files. If it is not None, all + save_params_filename(str, optional): The name of file to + save all parameters. If it is None, parameters were + saved in separate files. If it is not None, all parameters were saved in a single binary file. - quantizable_op_type(list[str], optional): The list of ops + quantizable_op_type(list[str], optional): The list of ops that will be quantized, and the quantized ops should be - contained in ["conv2d", "depthwise_conv2d", "mul"]. + contained in ["conv2d", "depthwise_conv2d", "mul"]. Default is ["conv2d","mul"]. - weight_bits(int, optional): The bits for the quantized weight, + weight_bits(int, optional): The bits for the quantized weight, and it should be 8 or 16. Default is 8. 
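The abs_max weight quantization described above reduces to a single per-tensor scale; here is a NumPy sketch of the int8 quantize/dequantize round trip, illustrative rather than the WeightQuantization code.

.. code-block:: python

    # abs_max weight quantization round trip, as described above
    # (illustrative sketch; weight_bits=8 gives the int8 range [-127, 127]).
    import numpy as np

    def quant_dequant_abs_max(w, weight_bits=8):
        qmax = (1 << (weight_bits - 1)) - 1           # 127 for int8
        scale = np.abs(w).max() / qmax
        q = np.clip(np.round(w / scale), -qmax, qmax).astype('int8')
        return q, q.astype('float32') * scale         # stored ints, dequantized fp32

    w = np.random.randn(16).astype('float32')
    q, w_hat = quant_dequant_abs_max(w)
    print(np.abs(w - w_hat).max() <= np.abs(w).max() / 127 + 1e-6)   # True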
weight_quantize_type(str, optional): quantization type for weights, support 'channel_wise_abs_max' and 'abs_max'. Set it as 'channel_wise_abs_max', the accuracy performs better. - generate_test_model(bool, optional): If set generate_test_model - as True, it saves a fake quantized model, in which the weights - are quantized and dequantized. We can use PaddlePaddle to load + generate_test_model(bool, optional): If set generate_test_model + as True, it saves a fake quantized model, in which the weights + are quantized and dequantized. We can use PaddlePaddle to load the fake quantized model and test the accuracy on GPU or CPU. - threshold_rate(float, optional): This api uses abs_max methd to - quantize the weight from float32 to int8/16, and the abs max - value is important for quantization diff. When the abs_max - value is far away from the center of the numerical distribution, - we can set threshold_rate between 1e-6 and 1e-8, so the abs max + threshold_rate(float, optional): This api uses abs_max methd to + quantize the weight from float32 to int8/16, and the abs max + value is important for quantization diff. When the abs_max + value is far away from the center of the numerical distribution, + we can set threshold_rate between 1e-6 and 1e-8, so the abs max value will be optimized. Default is 0.0. ''' for op_type in quantizable_op_type: @@ -1386,7 +1386,7 @@ class WeightQuantization(object): """ Convert all presistable vars from fp32 to fp16. Note that, this api only changes the data type of variables in - __params__ file, and the __model__ file remains unchanged. + __params__ file, and the __model__ file remains unchanged. Args: save_model_dir(str): The path to save the fp16 model. @@ -1545,7 +1545,7 @@ class WeightQuantization(object): def _weight_channel_wise_abs_max_quantization(self, scope, place, weight_bits, op, var_name, for_test): - ''' + ''' Use channel_wise_abs_max method to quantize weight. ''' quantize_range = (1 << (weight_bits - 1)) - 1 diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index f8d950aa5e0..520c094798f 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -135,7 +135,7 @@ class QuantizationTransformPass(object): initialize these new parameters. place(fluid.CPUPlace|fluid.CUDAPlace|str): place is used to initialize new parameters described above. If it's string, It can be ``cpu``, and ``gpu:x``, - where ``x`` is the index of the GPUs. + where ``x`` is the index of the GPUs. weight_bits(int): quantization bit number for weights, the bias is not quantized. activation_bits(int): quantization bit number for activation. @@ -153,8 +153,8 @@ class QuantizationTransformPass(object): moving_rate(float): the param for 'moving_average_abs_max' quantization. skip_pattern(str or str list): The user-defined quantization skip pattern, which will be presented in the name scope of an op. When the skip pattern is - detected in an op's name scope, the corresponding op will not be quantized. - quantizable_op_type(list[str]): List the type of ops that will be quantized. + detected in an op's name scope, the corresponding op will not be quantized. + quantizable_op_type(list[str]): List the type of ops that will be quantized. Default is ["conv2d", "depthwise_conv2d", "mul"]. The quantizable_op_type in QuantizationFreezePass and ConvertToInt8Pass must be the same as this. 
weight_quantize_func(function): Function that defines how to quantize weight. @@ -762,7 +762,7 @@ class QuantizationTransformPass(object): def _copy_graph(self, graph, source_graph, op_node): """ - copy op_node in source_graph to graph. And will run recursively + copy op_node in source_graph to graph. And will run recursively for next ops that link to op_node's outputs. Args: graph(IrGraph): target graph to copy. @@ -977,8 +977,8 @@ class QuantizationFreezePass(object): value float->int. Currently supports ['round', 'adaround'] methods. Default is `round`, which is rounding nearest to the integer. 'adaround' is refer to https://arxiv.org/abs/2004.10568. - weight_quantize_type(str): quantization type for weights, support 'abs_max' and - 'channel_wise_abs_max'. The 'range_abs_max' usually is not used for weight, + weight_quantize_type(str): quantization type for weights, support 'abs_max' and + 'channel_wise_abs_max'. The 'range_abs_max' usually is not used for weight, since weights are fixed once the model is well trained. quantizable_op_type(list[str]): This input param will be removed latter. The pass will process all quantized op, so it is not necessary to set the input param. @@ -1610,7 +1610,7 @@ class OutScaleForInferencePass(object): class AddQuantDequantPass(object): """ - Quantize the ops that do not have weights, and add quant_dequant op for the + Quantize the ops that do not have weights, and add quant_dequant op for the quantized ops's inputs. """ @@ -1635,18 +1635,18 @@ class AddQuantDequantPass(object): place(fluid.CPUPlace|fluid.CUDAPlace|str): place is used to initialize new parameters described above. If ``place`` is string, it can be It can be ``cpu`` or ``gpu:x``, where ``x`` is the index of the GPUs. - moving_rate(float, optional): the param for 'quant_dequant_moving_average_abs_max' + moving_rate(float, optional): the param for 'quant_dequant_moving_average_abs_max' quantization. Default is 0.9. quant_bits(int, optional): quantization bit number for activation. Default is 8. skip_pattern(str, optional): The user-defined quantization skip pattern, which will be presented in the name scope of an op. When the skip pattern is detected in an op's name scope, the corresponding op will not be quantized. Default is 'skip_quant'. - quantizable_op_type(list[str], optional): List the type of ops that will be - quantized. Default is ["elementwise_add", "pool2d"]. - is_full_quantized(bool, optional): If set is_full_quantized as True, apply + quantizable_op_type(list[str], optional): List the type of ops that will be + quantized. Default is ["elementwise_add", "pool2d"]. + is_full_quantized(bool, optional): If set is_full_quantized as True, apply quantization to all supported quantizable op type. If set is_full_quantized - as False, only apply quantization to the op type according to the input + as False, only apply quantization to the op type according to the input quantizable_op_type. """ self._scope = scope @@ -2051,7 +2051,7 @@ class QuantizationTransformPassV2(QuantizationTransformPass): initialize these new parameters. place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to initialize new parameters described above. If it's string, It can be ``cpu``, and ``gpu:x``, - where ``x`` is the index of the GPUs. + where ``x`` is the index of the GPUs. weight_bits(int): quantization bit number for weights, the bias is not quantized. activation_bits(int): quantization bit number for activation. 
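The quant/dequant pairs that these passes insert only simulate low-precision execution during training; below is a simplified exponential-moving-average variant of the abs_max fake-quant idea, an illustrative sketch rather than the pass implementation.

.. code-block:: python

    # Fake quant-dequant with a running abs_max scale; a simplified sketch,
    # not the 'moving_average_abs_max' op itself.
    import numpy as np

    def fake_quant_dequant(x, state, quant_bits=8, moving_rate=0.9):
        qmax = (1 << (quant_bits - 1)) - 1
        state = moving_rate * state + (1 - moving_rate) * np.abs(x).max()
        scale = state / qmax
        x_hat = np.clip(np.round(x / scale), -qmax, qmax) * scale
        return x_hat, state            # quantization noise injected, scale tracked

    act = np.random.randn(8).astype('float32')
    x_hat, running_abs_max = fake_quant_dequant(act, state=1.0)
    print(x_hat.shape, round(float(running_abs_max), 3))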
@@ -2069,8 +2069,8 @@ class QuantizationTransformPassV2(QuantizationTransformPass): moving_rate(float): the param for 'moving_average_abs_max' quantization. skip_pattern(str or str list): The user-defined quantization skip pattern, which will be presented in the name scope of an op. When the skip pattern is - detected in an op's name scope, the corresponding op will not be quantized. - quantizable_op_type(list[str]): List the type of ops that will be quantized. + detected in an op's name scope, the corresponding op will not be quantized. + quantizable_op_type(list[str]): List the type of ops that will be quantized. Default is ["conv2d", "depthwise_conv2d", "mul"]. The quantizable_op_type in QuantizationFreezePass and ConvertToInt8Pass must be the same as this. weight_quantize_func(function): Function that defines how to quantize weight. @@ -2345,20 +2345,20 @@ class AddQuantDequantPassV2(object): place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to initialize new parameters described above. If ``place`` is string, it can be It can be ``cpu`` or ``gpu:x``, where ``x`` is the index of the GPUs. - moving_rate(float, optional): the param for 'quant_dequant_moving_average_abs_max' + moving_rate(float, optional): the param for 'quant_dequant_moving_average_abs_max' quantization. Default is 0.9. quant_bits(int, optional): quantization bit number for activation. Default is 8. skip_pattern(str, optional): The user-defined quantization skip pattern, which will be presented in the name scope of an op. When the skip pattern is detected in an op's name scope, the corresponding op will not be quantized. Default is 'skip_quant'. - quantizable_op_type(list[str], optional): List the type of ops that will be - quantized. Default is ["elementwise_add", "pool2d"]. - is_full_quantized(bool, optional): If set is_full_quantized as True, apply + quantizable_op_type(list[str], optional): List the type of ops that will be + quantized. Default is ["elementwise_add", "pool2d"]. + is_full_quantized(bool, optional): If set is_full_quantized as True, apply quantization to all supported quantizable op type. If set is_full_quantized - as False, only apply quantization to the op type according to the input + as False, only apply quantization to the op type according to the input quantizable_op_type. - + Examples: .. code-block:: python # The original graph will be rewrite. @@ -2490,7 +2490,7 @@ class ReplaceFakeQuantDequantPass(object): place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to initialize new parameters described above. If ``place`` is string, it can be It can be ``cpu`` or ``gpu:x``, where ``x`` is the index of the GPUs. - + Examples: .. code-block:: python # The original graph will be rewrite. @@ -2615,7 +2615,7 @@ class QuantWeightPass(object): https://arxiv.org/abs/1810.05723. quant_bits(int, optional): quantization bit number for weight. Default is 8. save_int_weight(bool, optional): Whether the type saving the weight is int. Default is True. - + Examples: .. code-block:: python # The original graph will be rewrite. 
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py b/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py index 892b027de53..ce20d2e695a 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantize_transpiler_v2.py @@ -64,7 +64,7 @@ class QuantizeTranspilerV2(object): ], skip_pattern=['skip_quant']): """ - Apply fake quant for the quantized ops. + Apply fake quant for the quantized ops. Args: weight_bits(int): the bit of quantized weight. @@ -141,13 +141,13 @@ class QuantizeTranspilerV2(object): def convert(self, test_program, scope=None): """ - Convert the test program. + Convert the test program. Get the out scale from the moving_average_abs_max_scale op and save the - out scale into the quantized op. + out scale into the quantized op. Args: test_program(Program): the test program to be converted. - scope(fluid.Scope, optional): The scope of the program, use it to load - and save variables. If scope=None, get scope by global_scope(). + scope(fluid.Scope, optional): The scope of the program, use it to load + and save variables. If scope=None, get scope by global_scope(). """ scope = global_scope() if scope == None else scope diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index 40c96e0ce34..bc335dfd4b9 100644 --- a/python/paddle/fluid/contrib/sparsity/asp.py +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -111,7 +111,7 @@ def set_excluded_layers(param_names, main_program=None): optimizer = paddle.optimizer.SGD(learning_rate=0.1) optimizer = paddle.static.amp.decorate(optimizer ) - # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which # will insert necessary masking operations for ASP workflow. optimizer = paddle.incubate.asp.decorate(optimizer) optimizer.minimize(loss, startup_program) @@ -124,7 +124,7 @@ def set_excluded_layers(param_names, main_program=None): def reset_excluded_layers(main_program=None): r""" - Reset exculded layers setting corresponding to :attr:`main_program`. If :attr:`main_program` + Reset exculded layers setting corresponding to :attr:`main_program`. If :attr:`main_program` is None, then all configurations of excluded_layers would be cleaned. Args: @@ -203,7 +203,7 @@ def reset_excluded_layers(main_program=None): optimizer = paddle.optimizer.SGD(learning_rate=0.1) optimizer = paddle.static.amp.decorate(optimizer ) - # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which # will insert necessary masking operations for ASP workflow. optimizer = paddle.incubate.asp.decorate(optimizer) optimizer.minimize(loss, startup_program) @@ -215,7 +215,7 @@ def decorate(optimizer): r""" Wrap the given optimizer as a OptimizerWithSparsityGuarantee, If runnig with dynamic graph mode. ASP would creates mask variables for supported parameters. 
- Else if in static graph mode, ASP would creates mask variables and inserts necessary ops + Else if in static graph mode, ASP would creates mask variables and inserts necessary ops when calling minimize() Args: @@ -250,7 +250,7 @@ def decorate(optimizer): optimizer = paddle.optimizer.SGD( learning_rate=0.01, parameters=my_layer.parameters()) - # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which + # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which # will apply necessary masking operations for ASP workflow. # In dynamic graph mode, ASP would create related mask variables during decoration. optimizer = paddle.incubate.asp.decorate(optimizer) @@ -287,9 +287,9 @@ def decorate(optimizer): loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) optimizer = paddle.optimizer.SGD(learning_rate=0.1) - # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which # will insert necessary masking operations for ASP workflow. - # In static graph mode, ASP creates related mask variables + # In static graph mode, ASP creates related mask variables # during minimize(). optimizer = paddle.incubate.asp.decorate(optimizer) optimizer.minimize(loss, startup_program) @@ -299,15 +299,15 @@ def decorate(optimizer): def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True): r""" - Pruning parameters of supported layers in :attr:`model` via - specified mask generation function given by :attr:`mask_algo`. This + Pruning parameters of supported layers in :attr:`model` via + specified mask generation function given by :attr:`mask_algo`. This function supports both training and inference controlled by :attr:`with_mask`. If :attr:`with_mask` is True, it would also prune parameter related ASP mask Variables, else only prunes parameters. - *Note*: (Static graph mode) If calling this function with :attr:`with_mask`, it should call `OptimizerWithSparsityGuarantee.minimize` - and initialization (`exe.run(startup_program`)) before (For successfully obtain mask Variable). - Typically set `with_mask` as true for training (have called `OptimizerWithSparsityGuarantee.minimize`) and false for + *Note*: (Static graph mode) If calling this function with :attr:`with_mask`, it should call `OptimizerWithSparsityGuarantee.minimize` + and initialization (`exe.run(startup_program`)) before (For successfully obtain mask Variable). + Typically set `with_mask` as true for training (have called `OptimizerWithSparsityGuarantee.minimize`) and false for inference only. To obtain OptimizerWithSparsityGuarantee, please see `paddle.incubate.asp.decoreate()`. Args: @@ -350,7 +350,7 @@ def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True): optimizer = paddle.optimizer.SGD( learning_rate=0.01, parameters=my_layer.parameters()) - # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which + # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which # will apply necessary masking operations for ASP workflow. # In dynamic graph mode, ASP would create related mask variables during decoration. 
optimizer = paddle.incubate.asp.decorate(optimizer) @@ -408,9 +408,9 @@ def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True): loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) optimizer = paddle.optimizer.SGD(learning_rate=0.1) - # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which # will insert necessary masking operations for ASP workflow. - # In static graph mode, ASP creates related mask variables + # In static graph mode, ASP creates related mask variables # during minimize(). optimizer = paddle.incubate.asp.decorate(optimizer) optimizer.minimize(loss, startup_program) @@ -423,7 +423,7 @@ def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True): # Must call exe.run(startup_program) first before calling paddle.asp.prune_model() paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') - # it also be accepted to call + # it also be accepted to call # paddle.incubate.asp.prune_model(main_program, mask_algo='mask_2d_best') for i in range(10): @@ -505,7 +505,7 @@ class ProgramASPInfo(object): class ASPHelper(object): r""" - ASPHelper is a collection of Auto SParsity (ASP) functions to enable + ASPHelper is a collection of Auto SParsity (ASP) functions to enable 1. training models with weights in 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 from scratch. 2. pruning well-trained models into 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 for fine-tuning. @@ -770,8 +770,8 @@ class ASPHelper(object): 2. Create sparse mask Tensors according to supported layers in :attr:`main_program`. 3. Insert masking ops in the end of parameters update. - *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. - (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph + *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. + (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph cannot be modified anymore.) Args: @@ -809,8 +809,8 @@ class ASPHelper(object): 1. Call :attr:`optimizer`.step() 2. Mask parameters with sparse masks. - *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. - (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph + *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. + (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph cannot be modified anymore.) Args: @@ -921,8 +921,8 @@ class OptimizerWithSparsityGuarantee(object): 1. Call :attr:`optimizer`.step() 2. Mask parameters with sparse masks. - *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. - (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph + *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. + (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph cannot be modified anymore.) Args: @@ -949,7 +949,7 @@ class OptimizerWithSparsityGuarantee(object): def set_state_dict(self, state_dict): r""" This function is a decorator of `set_state_dict` function in `Optimizer`. 
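Taken together, the `decorate` and `prune_model` docstrings above describe the dynamic-graph ASP workflow: wrap the optimizer first, then prune the trained weights into the n:m pattern. A condensed sketch of that flow, mirroring the docstring examples with a hypothetical `my_layer`:

.. code-block:: python

    # Condensed dynamic-graph ASP workflow from the docstrings above.
    # `my_layer` is a placeholder; any ASP-supported Layer works.
    import paddle

    my_layer = paddle.nn.Linear(32, 32)
    optimizer = paddle.optimizer.SGD(learning_rate=0.01,
                                     parameters=my_layer.parameters())
    # wrap step() so ASP mask variables are created and applied after updates
    optimizer = paddle.incubate.asp.decorate(optimizer)
    # prune weights (and their masks) into the 2:4 sparse pattern
    paddle.incubate.asp.prune_model(my_layer, n=2, m=4, mask_algo='mask_1d')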
- Args: + Args: state_dict(dict) : Dict contains all the Tensor needed by optimizer Return: None diff --git a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py index 8cd422d0d76..38dd428e0f0 100644 --- a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py +++ b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py @@ -81,8 +81,8 @@ def add_supported_layer(layer, pruning_func=None): Add supported layers and its corresponding pruning function. Args: - name (string|Layer): The name or type of layer, needed to support. If layer is `Layer` then - it would be turn to string internally. ASP would use this name to match parameter's name and call + name (string|Layer): The name or type of layer, needed to support. If layer is `Layer` then + it would be turn to string internally. ASP would use this name to match parameter's name and call its the corresponding pruning function. pruning_func (function, optional): a function type which receives five argument (weight_nparray, m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight, diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py index 1d0694c4dde..521449f0f20 100644 --- a/python/paddle/fluid/contrib/sparsity/utils.py +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -108,7 +108,7 @@ def calculate_density(x): def _reshape_1d(mat, m): r""" Reshape the input 2D matrix to shape (-1, m). - If the second dimension of :attr:`mat` is not a multiples of :attr:`m`, + If the second dimension of :attr:`mat` is not a multiples of :attr:`m`, then this function would pad the remainder with 0 before reshaping. .. math:: @@ -136,7 +136,7 @@ def _reshape_1d(mat, m): def check_mask_1d(mat, n, m): r""" Check if every row of the input matrix :attr:`mat` is in 1D `n:m` sparse pattern. - This function would pad the second dimension of :attr:`mat` by zero + This function would pad the second dimension of :attr:`mat` by zero to be a multiples of :attr:`m` if necessary. 1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block. @@ -179,8 +179,8 @@ def check_mask_1d(mat, n, m): def get_mask_1d(mat, n, m): r""" - Generate 1D `n:m` sparse pattern mask of the input matrix :attr:`mat` - in row-directory. This function would pad the second dimension of :attr:`mat` + Generate 1D `n:m` sparse pattern mask of the input matrix :attr:`mat` + in row-directory. This function would pad the second dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation. 1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block. @@ -220,7 +220,7 @@ def get_mask_1d(mat, n, m): def _reshape_2d(mat, m): r""" Reshape the input 2D matrix to shape (-1, :math:`m \times m`). - In each dimension of :attr:`mat`, if it is not a multiples of :attr:`m`, + In each dimension of :attr:`mat`, if it is not a multiples of :attr:`m`, then this function would pad the remainder with 0 before reshaping. .. math:: @@ -263,10 +263,10 @@ def _reshape_2d(mat, m): def check_mask_2d(mat, n, m): r""" Check if every :math:`m \times m` block of the input matrix :attr:`mat` is in 2D `n:m` sparse pattern. - This function would pad each dimension of :attr:`mat` by zero to be a multiples of + This function would pad each dimension of :attr:`mat` by zero to be a multiples of :attr:`m` if necessary. 
- 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. Args: @@ -312,10 +312,10 @@ def check_mask_2d(mat, n, m): def get_mask_2d_greedy(mat, n, m): r""" - Greedily generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`. + Greedily generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`. This function would pad each dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation. - 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. Greedily generating: For each :math:`m \times m` block, selecting values to keep in descent order. @@ -384,7 +384,7 @@ def _compute_valid_2d_patterns(n, m): r""" Compute all vaild 2D `n:m` sparse patterns. - 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. Args: @@ -420,11 +420,11 @@ def _compute_valid_2d_patterns(n, m): def get_mask_2d_best(mat, n, m): r""" - Generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat` - to form sparse matrix with maximun L1 norm .This function would pad each + Generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat` + to form sparse matrix with maximun L1 norm .This function would pad each dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation. - 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. *Note*: L1 norm of sparse matrix from `Best` API is greater than or equal to the one from `Greedy`. diff --git a/python/paddle/fluid/data.py b/python/paddle/fluid/data.py index 4a15b6a8ea2..34fc5eec4e6 100644 --- a/python/paddle/fluid/data.py +++ b/python/paddle/fluid/data.py @@ -35,10 +35,10 @@ def data(name, shape, dtype='float32', lod_level=0): is a placeholder that could be fed with input, such as Executor can feed input into the variable. - Note: + Note: `paddle.fluid.layers.data` is deprecated. It will be removed in a - future version. Please use this `paddle.fluid.data`. - + future version. Please use this `paddle.fluid.data`. + The `paddle.fluid.layers.data` set shape and dtype at compile time but does NOT check the shape or the dtype of fed data, this `paddle.fluid.data` checks the shape and the dtype of data fed by diff --git a/python/paddle/fluid/data_feed_desc.py b/python/paddle/fluid/data_feed_desc.py index fb4ce735fca..0aa72cadaf5 100644 --- a/python/paddle/fluid/data_feed_desc.py +++ b/python/paddle/fluid/data_feed_desc.py @@ -21,7 +21,7 @@ __all__ = ['DataFeedDesc'] class DataFeedDesc(object): """ :api_attr: Static Graph - + Datafeed descriptor, describing input training data format. 
This class is currently only used for AsyncExecutor (See comments for class AsyncExecutor for a brief introduction) @@ -133,7 +133,7 @@ class DataFeedDesc(object): def set_dense_slots(self, dense_slots_name): """ Set slots in :attr:`dense_slots_name` as dense slots. **Note: In default, all slots are sparse slots.** - + Features for a dense slot will be fed into a Tensor, while those for a sparse slot will be fed into a LoDTensor. diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 876d4772462..d5e086f7c93 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -269,21 +269,21 @@ class BatchedTensorProvider(object): class DataFeeder(object): """ :api_attr: Static Graph - + DataFeeder converts the data that returned by a reader into a data - structure that can feed into Executor. The reader is usually a - python generator that returns a list of mini-batch data entries. + structure that can feed into Executor. The reader is usually a + python generator that returns a list of mini-batch data entries. Parameters: feed_list (list): Variables or names of Variables that need to feed. - place (:ref:`api_fluid_CPUPlace` | :ref:`api_fluid_CUDAPlace` ): - place indicates the device (CPU | GPU) the data will be fed into, if - you want to feed data into GPU, please using :code:`fluid.CUDAPlace(i)` - (:code:`i` represents the GPU id), or if you want to feed data into CPU, + place (:ref:`api_fluid_CPUPlace` | :ref:`api_fluid_CUDAPlace` ): + place indicates the device (CPU | GPU) the data will be fed into, if + you want to feed data into GPU, please using :code:`fluid.CUDAPlace(i)` + (:code:`i` represents the GPU id), or if you want to feed data into CPU, please using :code:`fluid.CPUPlace()`. - program (:ref:`api_fluid_Program` , optional): The Program that will - feed data into, if program is None, it will use default_main_program(). + program (:ref:`api_fluid_Program` , optional): The Program that will + feed data into, if program is None, it will use default_main_program(). Default None. Raises: @@ -295,31 +295,31 @@ class DataFeeder(object): import numpy as np import paddle import paddle.fluid as fluid - + place = fluid.CPUPlace() def reader(): for _ in range(4): yield np.random.random([4]).astype('float32'), np.random.random([3]).astype('float32'), - + main_program = fluid.Program() startup_program = fluid.Program() - + with fluid.program_guard(main_program, startup_program): data_1 = fluid.data(name='data_1', shape=[None, 2, 2], dtype='float32') data_2 = fluid.data(name='data_2', shape=[None, 1, 3], dtype='float32') out = fluid.layers.fc(input=[data_1, data_2], size=2) # ... feeder = fluid.DataFeeder([data_1, data_2], place) - + exe = fluid.Executor(place) exe.run(startup_program) - + feed_data = feeder.feed(reader()) - + # print feed_data to view feed results # print(feed_data['data_1']) # print(feed_data['data_2']) - + outs = exe.run(program=main_program, feed=feed_data, fetch_list=[out]) @@ -348,13 +348,13 @@ class DataFeeder(object): def feed(self, iterable): """ - According to :code:`feed_list` of :code:`DataFeeder` and :code:`iterable` , converts + According to :code:`feed_list` of :code:`DataFeeder` and :code:`iterable` , converts the input into a data structure that can feed into Executor. 
Parameters: iterable (generator): user defined python generator to read the raw input data - Returns: + Returns: :code:`dict`: a :code:`dict` that contains (variable name - converted tensor) pairs Example: @@ -367,17 +367,17 @@ class DataFeeder(object): # result['data_2'], result['data_3'] are similar. import numpy as np import paddle.fluid as fluid - + def reader(limit=5): for i in range(1, limit + 1): yield np.ones([6]).astype('float32') * i , np.ones([1]).astype('int64') * i, np.random.random([9]).astype('float32') - + data_1 = fluid.data(name='data_1', shape=[None, 2, 1, 3]) data_2 = fluid.data(name='data_2', shape=[None, 1], dtype='int64') data_3 = fluid.data(name='data_3', shape=[None, 3, 3], dtype='float32') feeder = fluid.DataFeeder(['data_1','data_2', 'data_3'], fluid.CPUPlace()) - - + + result = feeder.feed(reader()) print(result['data_1']) print(result['data_2']) @@ -410,20 +410,20 @@ class DataFeeder(object): def feed_parallel(self, iterable, num_places=None): """ Similar with feed function, feed_parallel is used with multiple devices (CPU|GPU). - Here :code:`iterable` is a list of python generators. The data return by each - generator in the list will be fed into a separate device. + Here :code:`iterable` is a list of python generators. The data return by each + generator in the list will be fed into a separate device. Parameters: - iterable (list|tuple): list of user-defined python generators. The element + iterable (list|tuple): list of user-defined python generators. The element number should match the :code:`num_places`. - num_places (int, optional): the number of devices. If not provided (None), + num_places (int, optional): the number of devices. If not provided (None), all available devices on the machine will be used. Default None. - Returns: - :code:`generator`: a :code:`generator` that generate dict which contains (variable name - converted tensor) pairs, + Returns: + :code:`generator`: a :code:`generator` that generate dict which contains (variable name - converted tensor) pairs, the total number of dicts will be generated matches with the :code:`num_places` - .. note:: + .. note:: The number of devices - :code:`num_places` should equal to the generator (element of :code:`iterable` ) number Example: @@ -503,18 +503,18 @@ class DataFeeder(object): Parameters: reader(generator): a user defined python generator used to get :code:`mini-batch` of data. - A :code:`mini-batch` can be regarded as a python generator that returns batches of input - entities, just like the below :code:`_mini_batch` in the code example. + A :code:`mini-batch` can be regarded as a python generator that returns batches of input + entities, just like the below :code:`_mini_batch` in the code example. multi_devices(bool): indicate whether to use multiple devices or not. num_places(int, optional): if :code:`multi_devices` is True, you can specify the number of devices(CPU|GPU) to use, if multi_devices is None, the function will use all the devices of the current machine. Default None. - drop_last(bool, optional): whether to drop the last round of data if it is not enough to + drop_last(bool, optional): whether to drop the last round of data if it is not enough to feed all devices. Default True. - Returns: + Returns: :code:`generator`: a new :code:`generator` which return converted dicts that can be fed into Executor - + Raises: :code:`ValueError`: If drop_last is False and the data cannot fit devices perfectly. 
@@ -525,7 +525,7 @@ class DataFeeder(object): import paddle import paddle.fluid as fluid import paddle.fluid.compiler as compiler - + def reader(): def _mini_batch(batch_size): for i in range(batch_size): @@ -533,23 +533,23 @@ class DataFeeder(object): for _ in range(10): yield _mini_batch(np.random.randint(1, 10)) - + place_num = 3 places = [fluid.CPUPlace() for _ in range(place_num)] - + # a simple network sample data = fluid.data(name='data', shape=[None, 4, 4], dtype='float32') label = fluid.data(name='label', shape=[None, 1], dtype='int64') hidden = fluid.layers.fc(input=data, size=10) - + feeder = fluid.DataFeeder(place=places[0], feed_list=[data, label]) reader = feeder.decorate_reader(reader, multi_devices=True, num_places=3, drop_last=True) - + exe = fluid.Executor(places[0]) exe.run(fluid.default_startup_program()) compiled_prog = compiler.CompiledProgram( fluid.default_main_program()).with_data_parallel(places=places) - + for i,data in enumerate(reader()): # print data if you like # print(i, data) diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/fluid/dataloader/batch_sampler.py index 8187faef008..8a17ba2be34 100644 --- a/python/paddle/fluid/dataloader/batch_sampler.py +++ b/python/paddle/fluid/dataloader/batch_sampler.py @@ -40,7 +40,7 @@ class BatchSampler(Sampler): Args: - dataset(Dataset): this could be a :code:`paddle.io.Dataset` + dataset(Dataset): this could be a :code:`paddle.io.Dataset` implement or other python object which implemented :code:`__len__` for BatchSampler to get indices as the range of :attr:`dataset` length. Default None. @@ -59,24 +59,24 @@ class BatchSampler(Sampler): BatchSampler: an iterable object for indices iterating Examples: - + .. code-block:: python - + from paddle.io import RandomSampler, BatchSampler, Dataset # init with dataset class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples - + def __getitem__(self, idx): image = np.random.random([784]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') return image, label - + def __len__(self): return self.num_samples - + bs = BatchSampler(dataset=RandomDataset(100), shuffle=False, batch_size=16, @@ -164,13 +164,13 @@ class _InfiniteIterableSampler(object): class DistributedBatchSampler(BatchSampler): """Sampler that restricts data loading to a subset of the dataset. - In such case, each process can pass a DistributedBatchSampler instance - as a DataLoader sampler, and load a subset of the original dataset that + In such case, each process can pass a DistributedBatchSampler instance + as a DataLoader sampler, and load a subset of the original dataset that is exclusive to it. .. note:: Dataset is assumed to be of constant size. - + Args: dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement or other python object which implemented @@ -200,15 +200,15 @@ class DistributedBatchSampler(BatchSampler): class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples - + def __getitem__(self, idx): image = np.random.random([784]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') return image, label - + def __len__(self): return self.num_samples - + dataset = RandomDataset(100) sampler = DistributedBatchSampler(dataset, batch_size=64) @@ -317,27 +317,27 @@ class DistributedBatchSampler(BatchSampler): Examples: .. 
code-block:: python - + import numpy as np - + from paddle.io import Dataset, DistributedBatchSampler - + # init with dataset class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples - + def __getitem__(self, idx): image = np.random.random([784]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') return image, label - + def __len__(self): return self.num_samples - + dataset = RandomDataset(100) sampler = DistributedBatchSampler(dataset, batch_size=64) - + for epoch in range(10): sampler.set_epoch(epoch) """ diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/fluid/dataloader/collate.py index 0bf041007eb..3c46b54156d 100644 --- a/python/paddle/fluid/dataloader/collate.py +++ b/python/paddle/fluid/dataloader/collate.py @@ -38,17 +38,17 @@ def default_collate_fn(batch): {'image': np.array(shape=[3, 224, 224]), 'label': 3}, {'image': np.array(shape=[3, 224, 224]), 'label': 4}, {'image': np.array(shape=[3, 224, 224]), 'label': 5},] - - + + This default collate function zipped each number and numpy array field together and stack each field as the batch field as follows: {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])} - Args: + Args: batch(list of sample data): batch should be a list of sample data. - + Returns: Batched data: batched each number, numpy array and paddle.Tensor in input data. @@ -92,9 +92,9 @@ def default_convert_fn(batch): automatic batching** mode, for **Distable automatic batching** mode, please ses :attr:`paddle.io.DataLoader` - Args: + Args: batch(list of sample data): batch should be a list of sample data. - + Returns: Batched data: batched each number, numpy array and paddle.Tensor in input data. diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py index bd3bb87a79f..c6c31b22750 100755 --- a/python/paddle/fluid/dataloader/dataset.py +++ b/python/paddle/fluid/dataloader/dataset.py @@ -40,25 +40,25 @@ class Dataset(object): see :code:`paddle.io.DataLoader`. Examples: - + .. code-block:: python import numpy as np from paddle.io import Dataset - + # define a random dataset class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples - + def __getitem__(self, idx): image = np.random.random([784]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') return image, label - + def __len__(self): return self.num_samples - + dataset = RandomDataset(10) for i in range(len(dataset)): print(dataset[i]) @@ -93,23 +93,23 @@ class IterableDataset(Dataset): see :code:`paddle.io.DataLoader`. Examples: - + .. code-block:: python import numpy as np from paddle.io import IterableDataset - + # define a random dataset class RandomDataset(IterableDataset): def __init__(self, num_samples): self.num_samples = num_samples - + def __iter__(self): for i in range(self.num_samples): image = np.random.random([784]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') yield image, label - + dataset = RandomDataset(10) for img, lbl in dataset: print(img, lbl) @@ -203,7 +203,7 @@ class IterableDataset(Dataset): worker_init_fn=worker_init_fn) for data in dataloader: - print(data) + print(data) # outputs: [2, 5, 3, 6, 4, 7] """ @@ -241,7 +241,7 @@ class TensorDataset(Dataset): Examples: .. code-block:: python - + import numpy as np import paddle from paddle.io import TensorDataset @@ -299,7 +299,7 @@ class ComposeDataset(Dataset): Examples: .. 
code-block:: python - + import numpy as np import paddle from paddle.io import Dataset, ComposeDataset @@ -314,7 +314,7 @@ class ComposeDataset(Dataset): image = np.random.random([32]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') return image, label - + def __len__(self): return self.num_samples @@ -325,7 +325,7 @@ class ComposeDataset(Dataset): print(label1) print(image2) print(label2) - + """ def __init__(self, datasets): @@ -366,7 +366,7 @@ class ChainDataset(IterableDataset): Examples: .. code-block:: python - + import numpy as np import paddle from paddle.io import IterableDataset, ChainDataset @@ -382,11 +382,11 @@ class ChainDataset(IterableDataset): image = np.random.random([32]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') yield image, label - + dataset = ChainDataset([RandomDataset(10), RandomDataset(10)]) for image, label in iter(dataset): print(image, label) - + """ def __init__(self, datasets): @@ -405,14 +405,14 @@ class ChainDataset(IterableDataset): class Subset(Dataset): """ Subset of a dataset at specified indices. - + Args: dataset (Dataset): The whole Dataset. indices (sequence): Indices in the whole set selected for subset. Returns: List[Dataset]: A Dataset which is the subset of the original dataset. - + Examples: .. code-block:: python @@ -463,7 +463,7 @@ def random_split(dataset, lengths, generator=None): from paddle.io import random_split a_list = paddle.io.random_split(range(10), [3, 7]) - print(len(a_list)) + print(len(a_list)) # 2 for idx, v in enumerate(a_list[0]): @@ -502,7 +502,7 @@ def random_split(dataset, lengths, generator=None): def _accumulate(iterable, fn=lambda x, y: x + y): """ Return running totals - + Args: iterable: any iterable object for example dataset. y (x): one element in the iterable object. @@ -512,9 +512,9 @@ def _accumulate(iterable, fn=lambda x, y: x + y): yields total from beginning iterator to current iterator. Example code: - + .. code-block:: python - + _accumulate([1,2,3,4,5]) --> 1 3 6 10 15 _accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120 """ diff --git a/python/paddle/fluid/dataloader/sampler.py b/python/paddle/fluid/dataloader/sampler.py index 25a46f3b5df..42fd6724365 100644 --- a/python/paddle/fluid/dataloader/sampler.py +++ b/python/paddle/fluid/dataloader/sampler.py @@ -47,20 +47,20 @@ class Sampler(object): Sampler: an iterable object for sample indices iterating Examples: - + .. code-block:: python - + from paddle.io import Dataset, Sampler class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples - + def __getitem__(self, idx): image = np.random.random([784]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') return image, label - + def __len__(self): return self.num_samples @@ -73,7 +73,7 @@ class Sampler(object): def __len__(self): return len(self.data_source) - + sampler = MySampler(data_source=RandomDataset(100)) for index in sampler: @@ -110,18 +110,18 @@ class SequenceSampler(Sampler): Examples: .. code-block:: python - + from paddle.io import Dataset, SequenceSampler class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples - + def __getitem__(self, idx): image = np.random.random([784]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') return image, label - + def __len__(self): return self.num_samples @@ -158,25 +158,25 @@ class RandomSampler(Sampler): num_samples(int): set sample number to draw if :attr:`replacement` is True. 
Default None. generator(Generator): specify a generator to sample the data source. Default None - + Returns: Sampler: a Sampler yield sample index randomly Examples: .. code-block:: python - + from paddle.io import Dataset, RandomSampler class RandomDataset(Dataset): def __init__(self, num_samples): self.num_samples = num_samples - + def __getitem__(self, idx): image = np.random.random([784]).astype('float32') label = np.random.randint(0, 9, (1, )).astype('int64') return image, label - + def __len__(self): return self.num_samples @@ -286,14 +286,14 @@ class WeightedRandomSampler(Sampler): should be numpy array, paddle.Tensor, list or tuple num_samples(int): set sample number to draw from sampler. replacement(bool): Whether to draw sample with replacements, default True - + Returns: Sampler: a Sampler yield sample index randomly by given weights Examples: .. code-block:: python - + from paddle.io import WeightedRandomSampler sampler = WeightedRandomSampler(weights=[0.1, 0.3, 0.5, 0.7, 0.2], diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 9fba7bb70f1..4c7e8bb5378 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -134,13 +134,13 @@ class DatasetBase(object): """ set fea eval mode for slots shuffle to debug the importance level of slots(features), fea_eval need to be set True for slots shuffle. - + Args: - record_candidate_size(int): size of instances candidate to shuffle + record_candidate_size(int): size of instances candidate to shuffle one slot fea_eval(bool): whether enable fea eval mode to enable slots shuffle. default is True. - + Examples: .. code-block:: python @@ -155,12 +155,12 @@ class DatasetBase(object): def slots_shuffle(self, slots): """ - Slots Shuffle - Slots Shuffle is a shuffle method in slots level, which is usually used + Slots Shuffle + Slots Shuffle is a shuffle method in slots level, which is usually used in sparse feature with large scale of instances. To compare the metric, i.e. - auc while doing slots shuffle on one or several slots with baseline to + auc while doing slots shuffle on one or several slots with baseline to evaluate the importance level of slots(features). - + Args: slots(list[string]): the set of slots(string) to do slots shuffle. @@ -578,7 +578,7 @@ class InMemoryDataset(DatasetBase): def preprocess_instance(self): """ - Merge pv instance and convey it from input_channel to input_pv_channel. + Merge pv instance and convey it from input_channel to input_pv_channel. It will be effective when enable_pv_merge_ is True. Examples: @@ -898,7 +898,7 @@ class InMemoryDataset(DatasetBase): def release_memory(self): """ :api_attr: Static Graph - + Release InMemoryDataset memory data, when data will not be used again. Examples: @@ -1039,7 +1039,7 @@ class InMemoryDataset(DatasetBase): def set_graph_config(self, config): """ - Set graph config, user can set graph config in gpu graph mode. + Set graph config, user can set graph config in gpu graph mode. Args: config(dict): config dict. @@ -1238,7 +1238,7 @@ class BoxPSDataset(InMemoryDataset): def begin_pass(self): """ Begin Pass - Notify BoxPS to load sparse parameters of next pass to GPU Memory + Notify BoxPS to load sparse parameters of next pass to GPU Memory Examples: .. code-block:: python @@ -1252,7 +1252,7 @@ class BoxPSDataset(InMemoryDataset): def end_pass(self, need_save_delta): """ End Pass - Notify BoxPS that current pass ended + Notify BoxPS that current pass ended Examples: .. 
code-block:: python @@ -1318,12 +1318,12 @@ class BoxPSDataset(InMemoryDataset): def slots_shuffle(self, slots): """ - Slots Shuffle - Slots Shuffle is a shuffle method in slots level, which is usually used + Slots Shuffle + Slots Shuffle is a shuffle method in slots level, which is usually used in sparse feature with large scale of instances. To compare the metric, i.e. - auc while doing slots shuffle on one or several slots with baseline to + auc while doing slots shuffle on one or several slots with baseline to evaluate the importance level of slots(features). - + Args: slots(list[string]): the set of slots(string) to do slots shuffle. diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 4d6cc88ea7e..69d592d0d5f 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -32,7 +32,7 @@ class DownpourSGD(object): Can be a float value Examples: .. code-block:: python - + opt = fluid.DistributedOptimizer(sgd_opt) opt.minimize() diff --git a/python/paddle/fluid/distributed/fleet.py b/python/paddle/fluid/distributed/fleet.py index 6c2bcdc213b..9874492a413 100644 --- a/python/paddle/fluid/distributed/fleet.py +++ b/python/paddle/fluid/distributed/fleet.py @@ -20,7 +20,7 @@ __all__ = ['Fleet'] class Fleet(object): """ - + """ def __init__(self): diff --git a/python/paddle/fluid/distributed/helper.py b/python/paddle/fluid/distributed/helper.py index 20f45b4e796..08e6fca6165 100644 --- a/python/paddle/fluid/distributed/helper.py +++ b/python/paddle/fluid/distributed/helper.py @@ -15,7 +15,7 @@ class FileSystem(object): """ - A file system that support hadoop client desc. + A file system that support hadoop client desc. Args: fs_type (string): fs_type, for example is "afs" diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index 6fc1c51e06a..90ac44ada14 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -38,7 +38,7 @@ class DownpourServer(Server): """ DownpourServer class is used to generate server program_desc Args: - server: it is pslib.ServerParameter() + server: it is pslib.ServerParameter() Examples: server = DownpourServer() """ @@ -58,10 +58,10 @@ class DownpourServer(Server): table_id(int): id of sparse params table learning_rate(float): the learning rate used to update parameters. \ Can be a float value - slot_key_vars(string): slot key id + slot_key_vars(string): slot key id slot_value_var(string): slot key value after embedding Returns: - return None + return None """ table = self.server_.downpour_server_param.downpour_table_param.add() table.table_id = table_id @@ -93,7 +93,7 @@ class DownpourServer(Server): param_var(list): all dense param. it is a list. grad_var(list): all dense grad parm it is a list. Returns: - return None + return None """ table = self.server_.downpour_server_param.downpour_table_param.add() table.table_id = table_id @@ -125,7 +125,7 @@ class DownpourWorker(Worker): DownpourWorker class is used to generate worker program_desc Args: window (int): push params frequency - worker: it is pslib.DownpourTrainerParameter + worker: it is pslib.DownpourTrainerParameter Examples: worker = DownpourWorker(1) """ @@ -141,10 +141,10 @@ class DownpourWorker(Worker): table_id(int): id of sparse params table learning_rate(float): the learning rate used to update parameters. 
\ Can be a float value - slot_key_vars(string): slot key id + slot_key_vars(string): slot key id slot_value_var(string): slot key value after embedding Returns: - return None + return None """ table = self.worker_.sparse_table.add() table.table_id = table_id @@ -162,7 +162,7 @@ class DownpourWorker(Worker): param_var(list): all dense param. it is a list. grad_var(list): all dense grad parm it is a list. Returns: - return None + return None """ table = self.worker_.dense_table.add() table.table_id = table_id diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index 9254a4a136f..6b19d7ca62e 100644 --- a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -16,10 +16,10 @@ from .helper import MPIHelper class PaddlePSInstance(object): """ - PaddlePSInstance class is used to generate A instance of server or worker + PaddlePSInstance class is used to generate A instance of server or worker Args: server_worker_mode: is a value 0 or 1, default is 1 - proc_per_node: process per node, default is 2 + proc_per_node: process per node, default is 2 Examples: instance = PaddlePSInstance(1, 2) """ @@ -68,7 +68,7 @@ class PaddlePSInstance(object): def get_worker_id(self): """ - Return worker index + Return worker index """ if self._server_worker_mode == 0: return self._rankid == self.server_num @@ -77,7 +77,7 @@ class PaddlePSInstance(object): def get_server_id(self): """ - Return server index + Return server index """ if self._server_worker_mode == 0: return self.rank_id @@ -110,7 +110,7 @@ class PaddlePSInstance(object): def gather_ips(self): """ - Return all servers and workers ip through mpi allgather + Return all servers and workers ip through mpi allgather """ self._ips = self.dh.comm.allgather(self._ip) return self._ips diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index d1d53853740..d70c976858b 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -248,25 +248,25 @@ def amp_guard(enable=True, :api_attr: imperative Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode. - If enabled, the input data type (float32 or float16) of each operator is decided - by autocast algorithm for better performance. - - Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in + If enabled, the input data type (float32 or float16) of each operator is decided + by autocast algorithm for better performance. + + Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in imperative mode. It is used together with `decorator` to achieve Pure fp16 in imperative mode. Args: enable(bool, optional): Enable auto-mixed-precision or not. Default is True. custom_white_list(set|list|tuple, optional): The custom white_list. It's the set of ops that support - fp16 calculation and are considered numerically-safe and performance-critical. These ops + fp16 calculation and are considered numerically-safe and performance-critical. These ops will be converted to fp16. custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16 - calculation and are considered numerically-dangerous and whose effects may also be + calculation and are considered numerically-dangerous and whose effects may also be observed in downstream ops. These ops will not be converted to fp16. 
- level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the input data type of each operator will be casted by white_list and black_list; + level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the input data type of each operator will be casted by white_list and black_list; O2 represent Pure fp16, all operators parameters and input data will be casted to fp16, except operators in black_list, don't support fp16 kernel and batchnorm. Default is O1(amp) dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. - + Examples: .. code-block:: python @@ -445,15 +445,15 @@ def amp_decorate(models, master_weight=None, save_dtype=None): """ - Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing. + Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing. When level is O2(pure fp16), the decorate will cast all parameters of models to FP16, except BatchNorm and LayerNorm. - + Commonly, it is used together with `amp_guard` to achieve Pure fp16 in imperative mode. Args: models(Layer|list of Layer, optional): The defined models by user, models must be either a single model or a list of models. Default is None. optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None. - level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the decorator will do nothing; + level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the decorator will do nothing; O2 represent Pure fp16/bf16, the decorator will cast all parameters of models to FP16/BF16, except BatchNorm and LayerNorm. Default is O1(amp) dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. @@ -462,8 +462,8 @@ def amp_decorate(models, Examples: - .. code-block:: python - + .. code-block:: python + # required: gpu # Demo1: single model and optimizer: import paddle @@ -493,7 +493,7 @@ def amp_decorate(models, output2 = models[1](data) print(output.dtype) # FP16 print(output2.dtype) # FP16 - + # required: gpu # Demo3: optimizers is None: model3 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False) diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index f86bdf18506..db98af25ded 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -49,19 +49,19 @@ class AmpScaler(object): `unscale_()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio) `minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling. - Commonly, it is used together with `amp_guard` to achieve Auto-Mixed-Precision in + Commonly, it is used together with `amp_guard` to achieve Auto-Mixed-Precision in imperative mode. Args: enable(bool, optional): Enable loss scaling or not. Default is True. init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15. 
- incr_ratio(float, optional): The multiplier to use when increasing the loss + incr_ratio(float, optional): The multiplier to use when increasing the loss scaling. Default is 2.0. - decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing + decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing the loss scaling. Default is 0.5. - incr_every_n_steps(int, optional): Increases loss scaling every n consecutive + incr_every_n_steps(int, optional): Increases loss scaling every n consecutive steps with finite gradients. Default is 1000. - decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n + decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n accumulated steps with nan or inf gradients. Default is 2. use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. Returns: @@ -86,7 +86,7 @@ class AmpScaler(object): loss = fluid.layers.reduce_mean(conv) scaled = scaler.scale(loss) scaled.backward() - scaler.minimize(optimizer, scaled) + scaler.minimize(optimizer, scaled) """ @dygraph_only @@ -143,14 +143,14 @@ class AmpScaler(object): def scale(self, var): """ - Multiplies a variable(Tensor) by the scale factor and returns scaled outputs. + Multiplies a variable(Tensor) by the scale factor and returns scaled outputs. If this instance of :class:`AmpScaler` is not enabled, output are returned unmodified. Args: var (Variable): The variable to scale. Returns: The scaled variable or original variable. - + Examples: .. code-block:: python @@ -170,7 +170,7 @@ class AmpScaler(object): loss = fluid.layers.reduce_mean(conv) scaled = scaler.scale(loss) scaled.backward() - scaler.minimize(optimizer, scaled) + scaler.minimize(optimizer, scaled) """ check_type(var, "var", core.VarBase, 'AmpScaler.scale()') @@ -182,7 +182,7 @@ class AmpScaler(object): def minimize(self, optimizer, *args, **kwargs): """ This function is similar as `Optimizer.minimize()`, which performs parameters updating. - + If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters. @@ -212,7 +212,7 @@ class AmpScaler(object): loss = fluid.layers.reduce_mean(conv) scaled = scaler.scale(loss) scaled.backward() - scaler.minimize(optimizer, scaled) + scaler.minimize(optimizer, scaled) """ if not self._enable: return optimizer.minimize(*args, **kwargs) @@ -241,7 +241,7 @@ class AmpScaler(object): def _unscale(self, optimizer): """ - Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio). + Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio). If this instance of :class:`GradScaler` is not enabled, output are returned unmodified. Args: optimizer(Optimizer): The optimizer used to update parameters. @@ -496,7 +496,7 @@ class AmpScaler(object): def load_state_dict(self, state_dict): """ Loads the scaler state. - + Args: state_dict(dict): scaler state. Should be an object returned from a call to `AmpScaler.state_dict()`. 
""" diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index b30e3ff1d85..5101483858d 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -169,7 +169,7 @@ def enable_dygraph(place=None): This API turn OFF static graph mode. You can turn ON static graph mode by `enable_static <./disable_dygraph_en.html>`_ . Parameters: - place(paddle.CPUPlace|paddle.CUDAPlace|str, optional): Place to run dynamic graph. Default: None. Which means that the running place will be + place(paddle.CPUPlace|paddle.CUDAPlace|str, optional): Place to run dynamic graph. Default: None. Which means that the running place will be determined according to the way of paddle compilation. If ``place`` is string, It can be ``cpu``, and ``gpu:x``, where ``x`` is the index of the GPUs. @@ -387,7 +387,7 @@ def guard(place=None): This context will create a dygraph context for dygraph to run, using python ``with`` statement. Parameters: - place(fluid.CPUPlace| fluid.CUDAPlace|str, optional): Place to execute dygraph. + place(fluid.CPUPlace| fluid.CUDAPlace|str, optional): Place to execute dygraph. If None, the running place will be determined according to the way of paddle compilation. If ``place`` is string, It can be ``cpu``, ``gpu:x`` and ``xpu:x``, where ``x`` is the index of the GPUs or XPUs. Default: None @@ -437,52 +437,52 @@ def grad(outputs, only_inputs=True, allow_unused=False, no_grad_vars=None): - ''' + ''' .. note:: **This API is ONLY available in imperative mode.** This API computes the sum of gradients of `outputs` with respect to each `inputs` . Parameters: - outputs (Tensor|list(Tensor)|tuple(Tensor)): the output Tensor or + outputs (Tensor|list(Tensor)|tuple(Tensor)): the output Tensor or Tensor list/tuple of the graph to compute gradients. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or + inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or Tensor list/tuple of the graph to compute gradients. The returned - values of this API are the gradients of `inputs` . - grad_outputs (Tensor|list(Tensor|None)|tuple(Tensor|None), optional): - initial gradient values of `outputs` . If `grad_outputs` is None, - the initial gradient values of `outputs` would be Tensors filled with 1; - if `grad_outputs` is not None, it must have the same length as `outputs` , + values of this API are the gradients of `inputs` . + grad_outputs (Tensor|list(Tensor|None)|tuple(Tensor|None), optional): + initial gradient values of `outputs` . If `grad_outputs` is None, + the initial gradient values of `outputs` would be Tensors filled with 1; + if `grad_outputs` is not None, it must have the same length as `outputs` , and in this case, the initial gradient value of the i-th `outputs` would - be: (1) a Tensor filled with 1 when the i-th element of `grad_outputs` + be: (1) a Tensor filled with 1 when the i-th element of `grad_outputs` is None; (2) the i-th element of `grad_outputs` when the i-th element of `grad_outputs` is a Tensor. Default None. - retain_graph (bool, optional): whether to retain the forward graph which - is used to calculate the gradient. When it is True, the graph would - be retained, in which way users can calculate backward twice for the + retain_graph (bool, optional): whether to retain the forward graph which + is used to calculate the gradient. When it is True, the graph would + be retained, in which way users can calculate backward twice for the same graph. When it is False, the graph would be freed. 
Default None, - which means it is equal to `create_graph` . + which means it is equal to `create_graph` . create_graph (bool, optional): whether to create the gradient graphs of the computing process. When it is True, higher order derivatives are supported to compute; when it is False, the gradient graphs of the computing process would be discarded. Default False. only_inputs (bool, optional): whether to only compute the gradients of - `inputs` . If it is False, the gradients of all remaining leaf - Tensors in the graph would be also computed and accumulated. + `inputs` . If it is False, the gradients of all remaining leaf + Tensors in the graph would be also computed and accumulated. If it is True, only the gradients of `inputs` would be computed. Default True. only_inputs=False is under development, and it is - not supported yet. - allow_unused (bool, optional): whether to raise error or return None if some - Tensors of `inputs` are unreachable in the graph. If some Tensors of - `inputs` are unreachable in the graph (i.e., their gradients are None), + not supported yet. + allow_unused (bool, optional): whether to raise error or return None if some + Tensors of `inputs` are unreachable in the graph. If some Tensors of + `inputs` are unreachable in the graph (i.e., their gradients are None), error would be raised if allow_unused=False, or None would be returned as their gradients if allow_unused=True. Default False. - no_grad_vars (Tensor|list(Tensor)|tuple(Tensor)|set(Tensor), optional): + no_grad_vars (Tensor|list(Tensor)|tuple(Tensor)|set(Tensor), optional): the Tensors whose gradients are not needed to compute. Default None. Returns: - list: a list of Tensors, whose length is the same as the Tensor number - inside `inputs`, and the i-th returned Tensor is the sum of gradients of + list: a list of Tensors, whose length is the same as the Tensor number + inside `inputs`, and the i-th returned Tensor is the sum of gradients of `outputs` with respect to the i-th `inputs`. Examples: @@ -530,7 +530,7 @@ def grad(outputs, x.stop_gradient = False y1 = x * x - y2 = x * 3 + y2 = x * 3 # If grad_outputs=None, dy1 = [1], dy2 = [1]. # If grad_outputs=[g1, g2], then: @@ -543,7 +543,7 @@ def grad(outputs, # dx = 2 * x * dy1 + 3 * dy2 = 4 * dy1 + 3 * dy2. dx = paddle.grad( - outputs=[y1, y2], + outputs=[y1, y2], inputs=[x], grad_outputs=grad_outputs)[0] @@ -674,29 +674,29 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): r""" :api_attr: imperative - The API will create a ``Variable`` object from + The API will create a ``Variable`` object from tuple, list, numpy\.ndarray or Variable object. Parameters: - value(tuple|list|ndarray|Variable|Tensor): Initial data. + value(tuple|list|ndarray|Variable|Tensor): Initial data. Can be a list, tuple, NumPy ndarray, Variable, Tensor. - The shape can be multi-dimensional. The data type is one of - numpy\.{float16, float32, float64, int16, int32, int64, + The shape can be multi-dimensional. The data type is one of + numpy\.{float16, float32, float64, int16, int32, int64, uint8, uint16, complex64, complex128}. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - zero_copy(bool, optional): Whether to share memory with the input numpy - array. This parameter only works with CPUPlace and will be set to + name(str, optional): The default value is None. Normally there is no + need for user to set this property. 
For more information, please + refer to :ref:`api_guide_Name` . + zero_copy(bool, optional): Whether to share memory with the input numpy + array. This parameter only works with CPUPlace and will be set to True when it is None. Default: None. (Note: zero_copy is discarded temporally for some reason.) dtype(str, optional): The desired data type of returned ``Variable`` . - Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , + Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8' . Default: None. Returns: - Variable : If ``value`` is a tuple/list/numpy\.ndarray object, - return ``Tensor`` created from the corresponding numpy\.ndarray object, which has - same data type and shape with ``value``. + Variable : If ``value`` is a tuple/list/numpy\.ndarray object, + return ``Tensor`` created from the corresponding numpy\.ndarray object, which has + same data type and shape with ``value``. Examples: diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index 0fe5d236a58..e62e9bc9752 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -57,9 +57,9 @@ def save_dygraph(state_dict, model_path): :api_attr: imperative Save Layer's state_dict to disk. This will generate a file with suffix ".pdparams" - + The state_dict is get from Layers.state_dict function - + Args: state_dict(dict) : The state dict to be saved. model_path(str) : the file prefix to save the state_dict. The format is "dirname/file_prefix". If file_prefix is empty str. A exception will be raised @@ -127,24 +127,24 @@ def save_dygraph(state_dict, model_path): def load_dygraph(model_path, **configs): ''' :api_attr: imperative - + Load parameter state dict from disk. .. note:: - Due to some historical reasons, if you load ``state_dict`` from the saved - result of `paddle.static.save_inference_model`, the structured variable name - will cannot be restored. You need to set the argument `use_structured_name=False` + Due to some historical reasons, if you load ``state_dict`` from the saved + result of `paddle.static.save_inference_model`, the structured variable name + will cannot be restored. You need to set the argument `use_structured_name=False` when using `Layer.set_state_dict` later. Args: - model_path(str) : The file prefix store the state_dict. - (The path should Not contain suffix '.pdparams') - **configs (dict, optional): Other load configuration options for compatibility. We do not + model_path(str) : The file prefix store the state_dict. + (The path should Not contain suffix '.pdparams') + **configs (dict, optional): Other load configuration options for compatibility. We do not recommend using these configurations, if not necessary, DO NOT use them. Default None. The following options are currently supported: - (1) model_filename (str): The inference model file name of the paddle 1.x ``save_inference_model`` - save format. Default file name is :code:`__model__` . - (2) params_filename (str): The persistable variables file name of the paddle 1.x ``save_inference_model`` + (1) model_filename (str): The inference model file name of the paddle 1.x ``save_inference_model`` + save format. Default file name is :code:`__model__` . + (2) params_filename (str): The persistable variables file name of the paddle 1.x ``save_inference_model`` save format. No default file name, save variables separately by default. 
Returns: @@ -163,7 +163,7 @@ def load_dygraph(model_path, **configs): state_dict = emb.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") - scheduler = paddle.optimizer.lr.NoamDecay( + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py index ff5b5220952..b77c5b3c701 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py @@ -122,10 +122,10 @@ class NameNodeReplaceTransformer(BaseTransformer): class ForLoopTuplePreTransformer(BaseTransformer): """ pre-process of for loop. - >>> for A in B: + >>> for A in B: >>> C - will be changed into : + will be changed into : >>> UUID_iterator = _jst.Indexable(B) # make iterator-only to indexable list. >>> for UUID_target in UUID_iterator: @@ -166,9 +166,9 @@ class ForLoopTuplePreTransformer(BaseTransformer): def tuple_node_to_unpack_structure(self, node): """ Create a sequence to represents the structure of nest. - For example: `a, (b,c), [d,e,f]` is represented by + For example: `a, (b,c), [d,e,f]` is represented by `[1, [1,1], [1,1,1]]`. the `1` is just a notation. - + Specially, `a` is represented by `1`. """ ret = [] @@ -395,9 +395,9 @@ class ForNodeVisitor(object): """ Process special cases for iter_node inclue: - Case 1 (for zip): - + - for i, val in enumerate(zip(x, y)) # original code: - + - __for_loop_iter_zip_0 = list(zip(x, y)) - for i, val in enumerate(__for_loop_iter_zip_0) """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py index b63fe6eea5a..b0717f78e81 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py @@ -91,7 +91,7 @@ class BreakContinueTransformer(BaseNodeVisitor): """ Rewrite 'break' and 'continue' key words in a if-else python way to make it equivalent to original control flow - + The main idea of this class is: 1. Map the 'break/continue' stmt with an unique boolean variable V. @@ -278,9 +278,9 @@ def _find_ancestor_loop_index(node, ancestor_nodes): class BreakTransformOptimizer(BaseNodeVisitor): """ - In specific pattern, the transformed code could be optimized by joining the - If.test with while.test. - + In specific pattern, the transformed code could be optimized by joining the + If.test with while.test. + Currently supported pattern is: ``` while cond1: while cond1 and not cond2: @@ -288,7 +288,7 @@ class BreakTransformOptimizer(BaseNodeVisitor): break do_something() ``` - + See following example: >>> def foo(x): @@ -309,7 +309,7 @@ class BreakTransformOptimizer(BaseNodeVisitor): i += 1 return x ``` - It can avoid wrapping all ops after `break` statement into `cond_op` that + It can avoid wrapping all ops after `break` statement into `cond_op` that usually brings very heavy overhead. 
""" diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index e22d83d56f3..d5fb80ce575 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -137,7 +137,7 @@ def _run_paddle_while(cond, body, getter, setter, return_name_ids, return helper.get(return_name_ids) def new_cond_fn(*args): - """ cond is a zero-args function, which is not + """ cond is a zero-args function, which is not compatible with `while_loop`. """ return cond() @@ -495,7 +495,7 @@ def convert_zip(*args): # TODO(xiongkun): delete when list is ready. class VariableTuple: - """ + """ this class will cause enumerate can't be wrapped by other iterator change function. this will be fixed when list is producted. VariableTuple can only deal with variables which is fixed. @@ -577,7 +577,7 @@ def convert_shape_compare(left, *args): Python like "a op1 b and b op2 c and ... ". If the variables to compare are Paddle Variables, we will do elementwise comparsion first and then reduce to a boolean whose numel is 1. - + """ args_len = len(args) assert args_len >= 2, "convert_shape_compare needs at least one right compare variable" diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py index e8afef09468..ebc0aacf664 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py @@ -273,12 +273,12 @@ def get_buffers(layer_instance, include_sublayer=True): def convert_to_input_spec(inputs, input_spec): """ Replaces tensor in structured `inputs` by InputSpec in `input_spec`. - + Args: inputs(list|dict): nested structure list or dict. - input_spec(list|dict): same nested structure list or dict as inputs. + input_spec(list|dict): same nested structure list or dict as inputs. + - Return: Same structure with inputs by replacing the element with specified InputSpec. """ @@ -341,7 +341,7 @@ def replace_spec_empty_name(args_name, input_with_spec): 4. If the arguments `input_dic` corresponds to a dict(InputSpec), using key as name. For example: - + # case 1: foo(x, y) foo = to_static(foo, input_spec=[InputSpec([None, 10]), InputSpec([None])]) print([in_var.name for in_var in foo.inputs]) # [x, y] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index 1c64daa2fdc..708383925fd 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -280,7 +280,7 @@ def _valid_nonlocal_names(return_name_ids, nonlocal_names): """ All var in return_name_ids should be in nonlocal_names. Moreover, we will always put return_name_ids in front of nonlocal_names. - + For Example: return_name_ids: [x, y] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index c9e659cb68b..d348df203d7 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -420,7 +420,7 @@ class PartialProgramLayer: x = 2 * in # <---- x is a non-leaf node in program. 
y = x + 3 return x, y - + loss = forward(in)[0].sum() loss.backward() # <----- x@grad will be overwrited by elementwise_add_grad Op """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 2a098947413..b1f99cf491e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -308,12 +308,12 @@ class StaticFunction(object): Overrides this method to parse the class instance and call bound method correctly. For example: - + ''' class Net(Layer): def __init__(self): pass - + @paddle.jit.to_static def forward(self, x, y): return x + y @@ -321,7 +321,7 @@ class StaticFunction(object): net = Net() out = net(x, y) ''' - + In above case, `net(x, y)` will call `net.forward(x, y)` firstly that is a bound method of `Net` instance. After decorated by `@paddle.jit.to_static`, it will firstly to call `__get__` to parse the class instance correctly instead of the `StaticFunction` instance. @@ -347,7 +347,7 @@ class StaticFunction(object): Args: *args(tuple): tuple of all input arguments from original decorated function. - **kwargs(dict): dict of all input keyward arguments from original decorated function. + **kwargs(dict): dict of all input keyward arguments from original decorated function. Return: Outputs of decorated function. @@ -423,7 +423,7 @@ class StaticFunction(object): Args: *args(tuple): tuple of all input arguments from original decorated function. - **kwargs(dict): dict of all input keyward arguments from original decorated function. + **kwargs(dict): dict of all input keyward arguments from original decorated function. Return: Outputs of dygraph function. @@ -521,7 +521,7 @@ class StaticFunction(object): def foo(x, y): z = x + y return z - + # usage 1: decorated_foo = to_static(foo, input_spec=[InputSpec([10], name='x'), InputSpec([10], name='y')]) print(decorated_foo.concrete_program) @@ -599,7 +599,7 @@ class StaticFunction(object): def rollback(self): """ Rollback into original dygraph functions for current class instance. - + Returns: Function or Method @@ -622,7 +622,7 @@ class StaticFunction(object): x = paddle.randn([10, 1], 'float32') net = paddle.jit.to_static(Net()) # convert into static mode out = net(x) - + net.forward.rollback() # rollback into dygraph mode out = net(x) """ @@ -679,7 +679,7 @@ class StaticFunction(object): net = paddle.jit.to_static(Net()) # convert into static mode copy_net = copy.deepcopy(net) # deepcopy a new net without @to_static - + Please attention that original 'net' will unwrap @to_static and rollback into simple Layer. """ if self._class_instance is not None: @@ -835,7 +835,7 @@ class ConcreteProgram(object): Args: func_spec(FunctionSpec): A FunctionSpec instance for decorated function. - input_spec(list[InputSpec]): + input_spec(list[InputSpec]): """ # verify the instance is initialized in imperative mode. 
_verify_init_in_dynamic_mode(class_instance) @@ -1250,7 +1250,7 @@ class ProgramTranslator(object): print([i.name for i in inputs]) # [u'generated_tensor_0'] the feed input Tensor name representing x print([o.name for o in outputs]) - # [u'_generated_var_4'] the fetch output Tensor name representing x_v + # [u'_generated_var_4'] the fetch output Tensor name representing x_v """ assert callable( diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py index 82177b343aa..1f8b19ad15c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py @@ -408,11 +408,11 @@ class StaticAnalysisVisitor(object): def _get_func_argument_type(self, parent_node_wrapper, node): """ Returns type information by parsing annotation or default values. - + For example: 1. parse by default values. foo(x, y=1, z='s') -> x: UNKNOWN, y: INT, z: STR - + 2. parse by Py3 type annotation. foo(x: Tensor, y: int, z: str) -> x: Tensor, y: INT, z: STR diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index a8b372c28ce..4719e4675b0 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -51,7 +51,7 @@ ORIGI_INFO = "Original information of source code for ast node." class BaseNodeVisitor(gast.NodeVisitor): """ - Implement customized NodeVisitor inherited from gast.NodeVisitor. + Implement customized NodeVisitor inherited from gast.NodeVisitor. Ancestor nodes are traced to easily support more operations of currently visited node. """ @@ -107,7 +107,7 @@ def data_layer_not_check(name, shape, dtype='float32', lod_level=0): data can be various-length. This API is used in translating dygraph into static graph. - Note: + Note: The default :code:`stop_gradient` attribute of the Tensor created by this API is true, which means the gradient won't be passed backward through the data Tensor. Set :code:`var.stop_gradient = False` If @@ -118,7 +118,7 @@ def data_layer_not_check(name, shape, dtype='float32', lod_level=0): for more details. shape (list|tuple): List|Tuple of integers declaring the shape. You can set "None" at a dimension to indicate the dimension can be of any - size. For example, it is useful to set changeable batch size as "None" + size. For example, it is useful to set changeable batch size as "None" dtype (np.dtype|VarType|str, optional): The type of the data. Supported dtype: bool, float16, float32, float64, int8, int16, int32, int64, uint8. Default: float32 @@ -996,8 +996,8 @@ def slice_is_num(slice_node): class NameScope: def __init__(self): - """ - A NameScope is a object which manager all the variable names. + """ + A NameScope is a object which manager all the variable names. only FunctionDef and Controlflow node will have a namescope property. type can be "function" and "controlflow" @@ -1018,7 +1018,7 @@ class NameScope: self.father = father def existed_vars(self): - """ vars existing in current scope. + """ vars existing in current scope. they must not contain qualified names. 
""" local_vars = self.w_vars - self.globals - self.nonlocals - self.args @@ -1032,9 +1032,9 @@ class NameScope: return self.w_vars def variadic_length_vars(self): - """ + """ At present, we do not support global append, such as - + import numpy as np a = [] def func(): @@ -1063,9 +1063,9 @@ class NameScope: return True def is_global_var(self, name): - """ + """ Return whether the name is a var created in global scope. - Search from bottom to top. If it is not created or modified, + Search from bottom to top. If it is not created or modified, it means global vars; otherwise, it means local vars. Only valid after FunctionNameLivenessAnalysis visitor. """ @@ -1093,16 +1093,16 @@ class FunctionNameLivenessAnalysis(gast.NodeVisitor): """ analyze the liveness of a function. every variables stored in this scope will be collected, - in addition with global/nonlocal information and + in addition with global/nonlocal information and push_pop information. 1. global variable is stored in node.var_globals. 2. nonlocal variable is stored in node.var_nonlocals. 3. arguments is stored in node.var_args. - 4. if a variable's push and pop attribute is called, + 4. if a variable's push and pop attribute is called, it will be collected in push_pop_vars. They are used for transformation to tensor_array. - NOTE: push_pop_vars **may not** in w_vars. + NOTE: push_pop_vars **may not** in w_vars. a.push(0) don't modify the variable a, but the content of a. @@ -1120,13 +1120,13 @@ class FunctionNameLivenessAnalysis(gast.NodeVisitor): q = 12 b.push(1) c.pop() - - After this visitor we have: + + After this visitor we have: # node is the FunctionDef node with name: "func" node.pd_scope = NameScope( globals = ['i', 'j'], nonlocals = ['x', 'y'], - args = ['args', 'kargs'], + args = ['args', 'kargs'], wr_vars = ['a', 'i', 'q', 'm', 'c', 'b'] push_pop_vars = ['b', 'c'] ) @@ -1160,7 +1160,7 @@ class FunctionNameLivenessAnalysis(gast.NodeVisitor): def visit_ListComp(self, node): """ [ i for i in range(10) ] - In this case, `i` will not created in FunctionScope. + In this case, `i` will not created in FunctionScope. We don't collect `i` by not calling generic_visit. """ pass @@ -1183,7 +1183,7 @@ class FunctionNameLivenessAnalysis(gast.NodeVisitor): self._get_argument_names(node)) def post_func(): - """ NOTE: why we need merge w_vars and push_pop_vars here ? + """ NOTE: why we need merge w_vars and push_pop_vars here ? because we do ifelse_transformer after loop_transformer. Loops will changed into functioons. but we know this function will be called in if. so we add w_vars to father function scope. """ from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import WHILE_CONDITION_PREFIX, WHILE_BODY_PREFIX, FOR_CONDITION_PREFIX, FOR_BODY_PREFIX @@ -1271,7 +1271,7 @@ class FunctionNameLivenessAnalysis(gast.NodeVisitor): def _get_argument_names(self, node): """ get all arguments name in the functiondef node. - this node is local to the function and shouldn't + this node is local to the function and shouldn't be created. """ assert isinstance( @@ -1372,7 +1372,7 @@ def create_nonlocal_stmt_nodes(names): class GetterSetterHelper: - """ we have two classes of names in setter and getter function: + """ we have two classes of names in setter and getter function: w_vars(loop_vars) + push_pop_vars To simplify the setter logic in convert_while and convert_cond, we extract the helper class here. 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index fec231d5485..fd8f631c06b 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -64,7 +64,7 @@ def to_static_variable(x): if isinstance(x, six.integer_types): return paddle.full(shape=[1], dtype='int64', fill_value=x) if isinstance(x, UndefinedVar) or x is None: - """ + """ for early return case, we need a variable to represent None, current we use data_layer_not_check. """ return create_undefined_variable() diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index eb4fdc682a7..539fd93b0d1 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -308,8 +308,8 @@ class _ProgramHolder(object): """ Holds the execution information of a Program. - _ProgramHolder is the execution unit of TranslatedLayer, - if TranslatedLayer contains multiple _ProgramHolder, + _ProgramHolder is the execution unit of TranslatedLayer, + if TranslatedLayer contains multiple _ProgramHolder, it can execute multiple methods _ProgramHolder is an internal concept. @@ -984,7 +984,7 @@ def _run_static_graph(input, program_holder, trace_program): def _collect_current_and_parent_var(program, block_idx): ''' Get variables in current block and its parent block. - + Args: program(Program): The program containing the current block. block_idx(int): index of current block. @@ -1010,13 +1010,13 @@ def _append_block(dest_program, dict_rename_var_old_new=None): ''' Append Variables and Operators in 'src_program_desc' to dest_program. - + Args: dest_program(Program): Variables and Operators are appended to it. src_program_desc(ProgramDesc): Variables in it will be appended to 'dest_program'. program_holder(_ProgramHolder): program_holder of TranslatedLayer input_variables(list): list of input variables - dict_rename_var_old_new(None|dict): When using '_rename_var_program_desc', + dict_rename_var_old_new(None|dict): When using '_rename_var_program_desc', use it to map the name of the variable before it was modified and the new name. ''' @@ -1199,10 +1199,10 @@ def append_var_from_block_desc_static(block, class TranslatedLayer(layers.Layer): """ - TranslatedLayer is a ``paddle.nn.Layer`` for holding the model - loaded by :ref:`api_paddle_jit_load` . It can be used like a + TranslatedLayer is a ``paddle.nn.Layer`` for holding the model + loaded by :ref:`api_paddle_jit_load` . It can be used like a general Layer object in eval or train mode. - + .. note: The TranslatedLayer objects should not be created by constructor, it only can be loaded and constructed by :ref:`api_paddle_jit_load` . @@ -1410,13 +1410,13 @@ class TranslatedLayer(layers.Layer): Args: - method_name (string): mehtod name corresponding to the program to be obtained. Default: 'forward'. - + Returns: Program Examples: .. code-block:: python - + import numpy as np import paddle import paddle.nn as nn diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 856a21881c2..a0275ac57ce 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -701,7 +701,7 @@ def save(layer, path, input_spec=None, **configs): - Other C++ inference APIs .. note:: - When using ``paddle.jit.save`` to save a function, parameters will not be saved. 
If you have to + When using ``paddle.jit.save`` to save a function, parameters will not be saved. If you have to save the parameter, please pass the Layer containing function and parameter to ``paddle.jit.save``. Args: @@ -813,7 +813,7 @@ def save(layer, path, input_spec=None, **configs): load_result = load_func(inps) print((load_result - origin).abs().max() < 1e-10) - + save_function() """ diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 879900085d5..9f36f1cd37e 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1321,7 +1321,7 @@ class Layer(object): include_sublayers=True, structured_name_prefix=""): """ - The difference from state_dict() is that state_dict_hook will not be called, + The difference from state_dict() is that state_dict_hook will not be called, but the original types of parameters and buffers will be maintained. """ if destination is None: @@ -1559,7 +1559,7 @@ class Layer(object): blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None. - + Returns: self @@ -1689,7 +1689,7 @@ class Layer(object): blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None. - + include_sublayers(bool|True, optional): If True, deal with self and all sublayers parameters and buffers, if not only deal with self parameters and buffers. Default: True. floating_only(bool|False, optional): If True, only cast all floating point parameters and buffers of Layer by the give device, dtype and blocking. diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 18950144bc4..83452739fd4 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -31,7 +31,7 @@ __all__ = [ class LearningRateDecay(object): """ Base class of learning rate decay - + Define the common interface of an LearningRateDecay. User should not use this class directly, but need to use one of it's implementation. @@ -53,7 +53,7 @@ class LearningRateDecay(object): """ convert lr from float to variable - Args: + Args: lr: learning rate Returns: learning rate variable @@ -124,7 +124,7 @@ class LearningRateDecay(object): class PiecewiseDecay(LearningRateDecay): """ :api_attr: imperative - + Piecewise decay scheduler. The algorithm can be described as the code below. @@ -141,7 +141,7 @@ class PiecewiseDecay(LearningRateDecay): learning_rate = 0.1 Parameters: - boundaries(list): A list of steps numbers. The type of element in the list is python int. + boundaries(list): A list of steps numbers. The type of element in the list is python int. values(list): A list of learning rate values that will be picked during different step boundaries. The type of element in the list is python float. begin(int): The begin step to initialize the global_step in the description above. @@ -187,12 +187,12 @@ class NaturalExpDecay(LearningRateDecay): :api_attr: imperative Applies natural exponential decay to the initial learning rate. - + The algorithm can be described as following. .. 
math:: - decayed\_learning\_rate = learning\_rate * e^{y} + decayed\_learning\_rate = learning\_rate * e^{y} If staircase is set to False, then: @@ -204,15 +204,15 @@ class NaturalExpDecay(LearningRateDecay): .. math:: - y = - decay\_rate * math.floor(\\frac{global\_step}{decay\_steps}) + y = - decay\_rate * math.floor(\\frac{global\_step}{decay\_steps}) Parameters: - learning_rate(Variable|float): The initial learning rate. If the type - is Variable, it's a tensor with shape [1], the data type can be + learning_rate(Variable|float): The initial learning rate. If the type + is Variable, it's a tensor with shape [1], the data type can be float32 or float64. It also can be set to python int number. decay_steps(int): The decay step size. It determines the decay cycle. decay_rate(int): The decay rate. - staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The + staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The default value is False. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. @@ -272,16 +272,16 @@ class ExponentialDecay(LearningRateDecay): Applies exponential decay to the learning rate. The algorithm can be described as following. - + .. math:: - decayed\_learning\_rate = learning\_rate * decay\_rate ^ y + decayed\_learning\_rate = learning\_rate * decay\_rate ^ y If staircase is set to False, then: .. math:: - y = \\frac{global\_step}{decay\_steps} + y = \\frac{global\_step}{decay\_steps} If staircase is set to True, then: @@ -291,12 +291,12 @@ class ExponentialDecay(LearningRateDecay): Parameters: - learning_rate(Variable|float): The initial learning rate. If the type - is Variable, it's a tensor with shape [1], the data type can be + learning_rate(Variable|float): The initial learning rate. If the type + is Variable, it's a tensor with shape [1], the data type can be float32 or float64. It also can be set to python int number. decay_steps(int): The decay step size. It determines the decay cycle. decay_rate(float): The decay rate. - staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The + staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The default value is False. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. @@ -358,7 +358,7 @@ class InverseTimeDecay(LearningRateDecay): .. math:: - decayed\_learning\_rate = \\frac{learning\_rate}{1 + decay\_rate * \\frac{global\_step}{decay\_step}} + decayed\_learning\_rate = \\frac{learning\_rate}{1 + decay\_rate * \\frac{global\_step}{decay\_step}} If staircase is set to True, then: @@ -367,17 +367,17 @@ class InverseTimeDecay(LearningRateDecay): decayed\_learning\_rate = \\frac{learning\_rate}{1 + decay\_rate * math.floor(\\frac{global\_step}{decay\_step})} Parameters: - learning_rate(Variable|float): The initial learning rate. If the type - is Variable, it's a tensor with shape [1], the data type can be + learning_rate(Variable|float): The initial learning rate. If the type + is Variable, it's a tensor with shape [1], the data type can be float32 or float64. It also can be set to python int number. decay_steps(int): The decay step size. It determines the decay cycle. 
decay_rate(float): The decay rate. - staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The + staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The default value is False. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. The default value is 1. - dtype(str, optional): The data type used to create the learning rate variable. The data type can be + dtype(str, optional): The data type used to create the learning rate variable. The data type can be 'float32', 'float64'. The default value is 'float32'. Returns: @@ -437,7 +437,7 @@ class PolynomialDecay(LearningRateDecay): .. math:: - decay\_steps & = decay\_steps * math.ceil(\\frac{global\_step}{decay\_steps}) + decay\_steps & = decay\_steps * math.ceil(\\frac{global\_step}{decay\_steps}) decayed\_learning\_rate & = (learning\_rate-end\_learning\_rate)*(1-\\frac{global\_step}{decay\_steps})^{power}+end\_learning\_rate @@ -445,13 +445,13 @@ class PolynomialDecay(LearningRateDecay): .. math:: - global\_step & = min(global\_step, decay\_steps) + global\_step & = min(global\_step, decay\_steps) decayed\_learning\_rate & = (learning\_rate-end\_learning\_rate)*(1-\\frac{global\_step}{decay\_steps})^{power}+end\_learning\_rate Parameters: - learning_rate(Variable|float): The initial learning rate. If the type - is Variable, it's a tensor with shape [1], the data type can be + learning_rate(Variable|float): The initial learning rate. If the type + is Variable, it's a tensor with shape [1], the data type can be float32 or float64. It also can be set to python int number. decay_steps(int): The decay step size. It determines the decay cycle. end_learning_rate(float, optional): The minimum final learning rate. The default value is 0.0001. @@ -530,10 +530,10 @@ class CosineDecay(LearningRateDecay): .. math:: decayed\_learning\_rate = learning\_rate * 0.5 * (math.cos(global\_step * \\frac{math.pi}{step\_each\_epoch} ) + 1) - + Parameters: - learning_rate(Variable|float): The initial learning rate. If the type - is Variable, it's a tensor with shape [1], the data type can be + learning_rate(Variable|float): The initial learning rate. If the type + is Variable, it's a tensor with shape [1], the data type can be float32 or float64. It also can be set to python int number. step_each_epoch(int): The number of steps in an epoch. epochs(int): The number of epochs. @@ -581,7 +581,7 @@ class NoamDecay(LearningRateDecay): r""" :api_attr: imperative - Applies Noam decay to the initial learning rate. + Applies Noam decay to the initial learning rate. The algorithm can be described as following. @@ -589,12 +589,12 @@ class NoamDecay(LearningRateDecay): decayed\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(global\_step^{-0.5}, global\_step * warmup\_steps^{-1.5}) - Please reference `attention is all you need `_ + Please reference `attention is all you need `_ Parameters: - d$_{model}$(Variable|int): The dimensionality of input and output feature vector of model. If type is Variable, + d$_{model}$(Variable|int): The dimensionality of input and output feature vector of model. If type is Variable, it's a tensor with shape [1] and the data type can be int32 or int64. The type can also be python int. - warmup_steps(Variable|int): The number of warmup steps. A super parameter. 
If type is Variable, + warmup_steps(Variable|int): The number of warmup steps. A super parameter. If type is Variable, it's a tensor with shape [1] and the data type can be int32 or int64. The type can also be python int. begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. step(int, optional): The step size used to calculate the new global_step in the description above. @@ -650,24 +650,24 @@ class LinearLrWarmup(LearningRateDecay): This operator use the linear learning rate warm up strategy to adjust the learning rate preliminarily before the normal learning rate scheduling. For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks `_ - + When global_step < warmup_steps, learning rate is updated as: - + .. code-block:: text - + linear_step = end_lr - start_lr lr = start_lr + linear_step * (global_step / warmup_steps) - + where start_lr is the initial learning rate, and end_lr is the final learning rate; - + When global_step >= warmup_steps, learning rate is updated as: - + .. code-block:: text - + lr = learning_rate - + where lr is the learning_rate after warm-up. - + Args: learning_rate (Variable|float): Learning_rate after warm-up, it could be 1D-Tensor or single value with the data type of float32. warmup_steps (int): Steps for warm up. @@ -678,26 +678,26 @@ class LinearLrWarmup(LearningRateDecay): The default value is 1. dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as 'float32', 'float64'. The default value is 'float32'. - + Returns: Variable: Warm-up learning rate with the same data type as learning_rate. - - + + Examples: - + .. code-block:: python - + import paddle.fluid as fluid - - learning_rate = 0.1 + + learning_rate = 0.1 warmup_steps = 50 start_lr = 0 end_lr = 0.1 - with fluid.dygraph.guard(): + with fluid.dygraph.guard(): lr_decay = fluid.dygraph.LinearLrWarmup( learning_rate, warmup_steps, start_lr, end_lr) - - + + """ def __init__(self, @@ -739,12 +739,12 @@ class ReduceLROnPlateau(LearningRateDecay): """ :api_attr: imperative - Reduce learning rate when ``loss`` has stopped descending. Models often benefit from reducing the learning rate + Reduce learning rate when ``loss`` has stopped descending. Models often benefit from reducing the learning rate by 2 to 10 times once model performance has no longer improvement. - The ``loss`` is the one which has been pass into ``step`` , it must be 1-D Tensor with shape [1]. When ``loss`` - stop descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * decay_rate`` . - (Specially, ``mode`` can also be set to ``'max`` , in this case, when ``loss`` stop ascending for a ``patience`` number + The ``loss`` is the one which has been pass into ``step`` , it must be 1-D Tensor with shape [1]. When ``loss`` + stop descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * decay_rate`` . + (Specially, ``mode`` can also be set to ``'max`` , in this case, when ``loss`` stop ascending for a ``patience`` number of epochs, the learning rate will be reduced.) In addition, After each reduction, it will wait a ``cooldown`` number of epochs before resuming normal operation. @@ -752,31 +752,31 @@ class ReduceLROnPlateau(LearningRateDecay): Args: learning_rate (Variable|float|int): The initial learning rate. It can be set to python float or int number. 
If the type is Variable, it should be 1-D Tensor with shape [1], the data type can be 'float32' or 'float64'. - mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the - learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` , the learning + mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the + learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` , the learning rate will reduce when ``loss`` stops ascending. Default: ``'min'`` . - decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` . + decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` . It should be less than 1.0. Default: 0.1. - patience (int, optional): When ``loss`` doesn't improve for this number of epochs, learing rate will be reduced. + patience (int, optional): When ``loss`` doesn't improve for this number of epochs, learing rate will be reduced. Default: 10. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``. - threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` . + threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` . This make tiny changes of ``loss`` will be ignored. Default: 1e-4. threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss`` - is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum + is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum change of ``loss`` is ``threshold`` . Default: ``'rel'`` . cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0. min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0. eps (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than eps, the update is ignored. Default: 1e-8. dtype (str, optional): The data type used to create the learning rate variable. The data type can be set as - 'float32', 'float64'. Default: 'float32'. - + 'float32', 'float64'. Default: 'float32'. + Returns: Reduced learning rate. Examples: - + .. code-block:: python import paddle.fluid as fluid @@ -791,7 +791,7 @@ class ReduceLROnPlateau(LearningRateDecay): learning_rate = 1.0, decay_rate = 0.5, patience = 5, - verbose = True, + verbose = True, cooldown = 3) adam = fluid.optimizer.Adam( learning_rate = reduce_lr, @@ -804,7 +804,7 @@ class ReduceLROnPlateau(LearningRateDecay): loss = fluid.layers.reduce_mean(out) total_loss += loss adam.minimize(loss) - + avg_loss = total_loss/5 # adjust learning rate according to avg_loss @@ -878,17 +878,17 @@ class ReduceLROnPlateau(LearningRateDecay): def step(self, loss): """ - It should be invoked on each epoch. Update the learning rate in optimizer according to ``loss`` . + It should be invoked on each epoch. Update the learning rate in optimizer according to ``loss`` . The new learning rate will take effect on next call to ``optimizer.minimize`` . Args: - loss (Variable): A ``Variable`` that will be monitored to determine whether the learning rate will reduce. 
- If it stop descending for a ``patience`` number of epochs, the learning rate will reduce. It should - be 1-D Tensor with shape [1]. + loss (Variable): A ``Variable`` that will be monitored to determine whether the learning rate will reduce. + If it stop descending for a ``patience`` number of epochs, the learning rate will reduce. It should + be 1-D Tensor with shape [1]. Specially, if ``mode`` has been set to ``'max'`` , the learning rate will reduce when it stops ascending. Returns: None - + Examples: Please refer to the example of current LearningRateDecay. """ @@ -944,7 +944,7 @@ class _LearningRateEpochDecay(LearningRateDecay): :api_attr: imperative Base class of learning rate decay, which is updated each epoch. - + Define the common interface of an _LearningRateEpochDecay. User should not use this class directly, but need to use one of it's implementation. And invoke method: `epoch()` each epoch. @@ -974,7 +974,7 @@ class _LearningRateEpochDecay(LearningRateDecay): self.keys = ['epoch_num', 'learning_rate'] def __call__(self): - """ + """ Return last computed learning rate on current epoch. """ if not isinstance(self.learning_rate, Variable): @@ -1002,7 +1002,7 @@ class StepDecay(_LearningRateEpochDecay): Decays the learning rate of ``optimizer`` by ``decay_rate`` every ``step_size`` number of epoch. - The algorithm can be described as the code below. + The algorithm can be described as the code below. .. code-block:: text @@ -1018,7 +1018,7 @@ class StepDecay(_LearningRateEpochDecay): Parameters: learning_rate (float|int): The initial learning rate. It can be set to python float or int number. step_size (int): Period of learning rate decay. - decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` . + decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` . It should be less than 1.0. Default: 0.1. Returns: @@ -1026,7 +1026,7 @@ class StepDecay(_LearningRateEpochDecay): Examples: .. code-block:: python - + import paddle.fluid as fluid import numpy as np with fluid.dygraph.guard(): @@ -1040,7 +1040,7 @@ class StepDecay(_LearningRateEpochDecay): for batch_id in range(5): out = linear(input) loss = fluid.layers.reduce_mean(out) - adam.minimize(loss) + adam.minimize(loss) scheduler.epoch() print("epoch:{}, current lr is {}" .format(epoch, adam.current_step_lr())) @@ -1080,7 +1080,7 @@ class MultiStepDecay(_LearningRateEpochDecay): Decays the learning rate of ``optimizer`` by ``decay_rate`` once ``epoch`` reaches one of the milestones. - The algorithm can be described as the code below. + The algorithm can be described as the code below. .. code-block:: text @@ -1097,7 +1097,7 @@ class MultiStepDecay(_LearningRateEpochDecay): Parameters: learning_rate (float|int): The initial learning rate. It can be set to python float or int number. milestones (tuple|list): List or tuple of each boundaries. Must be increasing. - decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` . + decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` . It should be less than 1.0. Default: 0.1. Returns: @@ -1105,7 +1105,7 @@ class MultiStepDecay(_LearningRateEpochDecay): Examples: .. 
code-block:: python - + import paddle.fluid as fluid import numpy as np with fluid.dygraph.guard(): @@ -1166,7 +1166,7 @@ class LambdaDecay(_LearningRateEpochDecay): Sets the learning rate of ``optimizer`` to the initial lr times a multiplicative factor, and this multiplicative factor is computed by function ``lr_lambda`` . ``lr_lambda`` is funciton which receives ``epoch`` . - The algorithm can be described as the code below. + The algorithm can be described as the code below. .. code-block:: text @@ -1179,15 +1179,15 @@ class LambdaDecay(_LearningRateEpochDecay): Parameters: learning_rate (float|int): The initial learning rate. It can be set to python float or int number. - lr_lambda (function): A function which computes a multiplicative factor given an integer parameter ``epoch`` , and + lr_lambda (function): A function which computes a multiplicative factor given an integer parameter ``epoch`` , and then multiply the initial learning rate by this multiplicative factor. - + Returns: None. Examples: .. code-block:: python - + import paddle.fluid as fluid import numpy as np with fluid.dygraph.guard(): diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index e0262fb113e..5bd638e60c9 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -140,7 +140,7 @@ class Conv2D(layers.Layer): Returns: None - + Raises: ValueError: if ``use_cudnn`` is not a bool value. @@ -316,7 +316,7 @@ class Conv3D(layers.Layer): The convolution3D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are multidimensional tensors with a shape of + Output(Output) are multidimensional tensors with a shape of :math:`[N, C, D, H, W]` . Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, and W is the width of the feature. Convlution3D is similar with Convlution2D @@ -567,15 +567,15 @@ class Conv3DTranspose(layers.Layer): **Note**: - The conv3d_transpose can be seen as the backward of the conv3d. For conv3d, - when stride > 1, conv3d maps multiple input shape to the same output shape, + The conv3d_transpose can be seen as the backward of the conv3d. For conv3d, + when stride > 1, conv3d maps multiple input shape to the same output shape, so for conv3d_transpose, when stride > 1, input shape maps multiple output shape. If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \ - H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output - size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, - the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` - and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must - between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, + H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output + size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, + the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` + and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must + between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, conv3d_transpose can compute the kernel size automatically. 
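For the output-size constraint quoted in the Conv3DTranspose hunk above, the pre-output size :math:`D^\prime_{out}` follows the usual transposed-convolution relation; the numbers below are an editorial illustration with assumed values, not text from the patched file:

.. math::

    D^\prime_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (filter\_size[0] - 1) + 1

For example, with :math:`D_{in} = 4`, ``stride = 2``, ``padding = 1``, ``dilation = 1`` and ``filter_size = 3``, this gives :math:`D^\prime_{out} = 3 * 2 - 2 + 2 + 1 = 7`; an explicit ``output_size`` for that dimension must then lie between 7 and :math:`7 + strides[0] = 9` (the range in which, per the note above, conv3d_transpose can work out a consistent kernel size automatically).
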
@@ -596,9 +596,9 @@ class Conv3DTranspose(layers.Layer): when `data_format` is `'NDHWC'`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. The default value is 0. - stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. - If stride is a tuple, it must contain three integers, (stride_depth, stride_height, - stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. + stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. + If stride is a tuple, it must contain three integers, (stride_depth, stride_height, + stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. The default value is 1. dilation(int|tuple, optional): The dilation size. If dilation is a tuple, it must contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the @@ -622,7 +622,7 @@ class Conv3DTranspose(layers.Layer): library is installed. The default value is True. act (str, optional): Activation type, if it is set to None, activation is not appended. The default value is None. - name(str, optional): The default value is None. Normally there is no need for user + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Attribute: @@ -791,12 +791,12 @@ class Pool2D(layers.Layer): pool_size (int or list or tuple, optional): The pool kernel size. If pool kernel size is a tuple or list, it must contain two integers, (pool_size_Height, pool_size_Width). Otherwise, the pool kernel size will be a square of an int. Default: -1. - pool_type(str, optional) : The pooling type, can be "max" for max-pooling and "avg" for average-pooling. + pool_type(str, optional) : The pooling type, can be "max" for max-pooling and "avg" for average-pooling. Default: max. pool_stride (int or list or tuple, optional): The pool stride size. If pool stride size is a tuple or list, it must contain two integers, (pool_stride_Height, pool_stride_Width). Otherwise, the pool stride size will be a square of an int. Default: 1. - pool_padding (int or list or tuple, optional): The padding size for pooling operation. + pool_padding (int or list or tuple, optional): The padding size for pooling operation. If ``pool_padding`` is a tuple, it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width). Otherwise, the padding size for pooling operation will be a square of an int. Default: 0. @@ -808,7 +808,7 @@ class Pool2D(layers.Layer): exclusive (bool, optional): Whether to exclude padding points in average pooling mode. Default: True. data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - ``[batch_size, input_channels, input_height, input_width]``. When it is `"NHWC"`, the data is + ``[batch_size, input_channels, input_height, input_width]``. When it is `"NHWC"`, the data is stored in the order of: ``[batch_size, input_height, input_width, input_channels]`` Returns: @@ -930,7 +930,7 @@ class Pool2D(layers.Layer): class Linear(layers.Layer): """ - + Fully-connected linear transformation layer: .. math:: @@ -1066,7 +1066,7 @@ class InstanceNorm(layers.Layer): :math:`input` is the input features over a mini-batch. .. 
math:: - + \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ @@ -1085,12 +1085,12 @@ class InstanceNorm(layers.Layer): param_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. - If the Initializer of the param_attr is not set, the parameter is initialized + If the Initializer of the param_attr is not set, the parameter is initialized one. If it is set to False, will not create param_attr. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm - will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. - If the Initializer of the bias_attr is not set, the bias is initialized zero. + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. If it is set to False, will not create bias_attr. Default: None. dtype(str, optional): Indicate the data type of the input ``Tensor``, which can be float32 or float64. Default: float32. @@ -1107,13 +1107,13 @@ class InstanceNorm(layers.Layer): import numpy as np import paddle - # x's shape is [1, 3, 1, 2] + # x's shape is [1, 3, 1, 2] x = np.array([[[[1.0, 8.0]], [[10.0, 5.0]], [[4.0, 6.0]]]]).astype('float32') with fluid.dygraph.guard(): x = to_variable(x) instanceNorm = paddle.nn.InstanceNorm(3) ret = instanceNorm(x) - # ret's shape is [1, 3, 1, 2]; value is [-1 1 0.999999 -0.999999 -0.999995 0.999995] + # ret's shape is [1, 3, 1, 2]; value is [-1 1 0.999999 -0.999999 -0.999995 0.999995] print(ret) """ @@ -1195,14 +1195,14 @@ class BatchNorm(layers.Layer): This interface is used to construct a callable object of the ``BatchNorm`` class. For more details, refer to code examples. - It implements the function of the Batch Normalization Layer and can be used + It implements the function of the Batch Normalization Layer and can be used as a normalizer function for conv2d and fully connected operations. The data is normalized by the mean and variance of the channel based on the current batch data. Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift `_ for more details. - When use_global_stats = False, the :math:`\mu_{\beta}` + When use_global_stats = False, the :math:`\mu_{\beta}` and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: @@ -1226,7 +1226,7 @@ class BatchNorm(layers.Layer): moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ The normalization function formula is as follows: - + .. math:: \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ @@ -1589,7 +1589,7 @@ class Embedding(layers.Layer): [[0.345249859, 0.124939536, ..., 0.194353745], [0.945345345, 0.435394634, ..., 0.435345365]], - + [[0.945345345, 0.435394634, ..., 0.435345365], [0.0, 0.0, ..., 0.0 ]]] # padding data The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 @@ -1599,22 +1599,22 @@ class Embedding(layers.Layer): size(tuple|list): The shape of the look up table parameter. 
It should have two elements which indicate the size of the dictionary of embeddings and the size of each embedding vector respectively. is_sparse(bool): The flag indicating whether to use sparse update. This parameter only - affects the performance of the backwards gradient update. It is recommended to set + affects the performance of the backwards gradient update. It is recommended to set True because sparse update is faster. But some optimizer does not support sparse update, - such as :ref:`api_fluid_optimizer_AdadeltaOptimizer` , :ref:`api_fluid_optimizer_AdamaxOptimizer` , + such as :ref:`api_fluid_optimizer_AdadeltaOptimizer` , :ref:`api_fluid_optimizer_AdamaxOptimizer` , :ref:`api_fluid_optimizer_DecayedAdagradOptimizer` , :ref:`api_fluid_optimizer_FtrlOptimizer` , :ref:`api_fluid_optimizer_LambOptimizer` and :ref:`api_fluid_optimizer_LarsMomentumOptimizer` . In these case, is_sparse must be False. Default: False. is_distributed(bool): Whether to store the embedding matrix in a distributed manner. Only used in multi-machine distributed CPU training. Default: False. - padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). + padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. If set None, it makes no effect to output. Default: None. param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition, - user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. + user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs to be transformed into numpy format, and the shape of local word vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` is used to load custom or pre-trained word vectors. See code example 2 for details. @@ -1659,7 +1659,7 @@ class Embedding(layers.Layer): size=[128, 100], param_attr= w_param_attrs, is_sparse=False) - static_rlt3 = emb(base.to_variable(inp_word)) + static_rlt3 = emb(base.to_variable(inp_word)) """ def __init__(self, @@ -1899,10 +1899,10 @@ class LayerNorm(layers.Layer): class GRUUnit(layers.Layer): """ **GRU unit layer** - + It creates a callable object from GRUUnit class. If origin_mode is True, then the equation of a gru step is from paper - `Learning Phrase Representations using RNN Encoder-Decoder for Statistical + `Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation `_ .. math:: @@ -1943,19 +1943,19 @@ class GRUUnit(layers.Layer): Parameters: size (int): The input dimension value. param_attr(ParamAttr, optional): The parameter attribute for the learnable - hidden-hidden weight matrix. - + hidden-hidden weight matrix. + **Note**: - + 1. The shape of the weight matrix is :math:`[T, 3*D]`, where D is the hidden size. - 2. All elements in the weight matrix can be divided into two parts. The first - part are weights of the update gate and reset gate with shape :math:`[D, 2*D]`, + 2. All elements in the weight matrix can be divided into two parts. 
The first + part are weights of the update gate and reset gate with shape :math:`[D, 2*D]`, and the second part are weights for candidate hidden state with shape :math:`[D, D]`. If it is set to None or one attribute of ParamAttr, gru_unit will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. The default + is not set, the parameter is initialized with Xavier. The default value is None. bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of GRU.Note that the bias with :math:`[1, 3*D]` concatenates @@ -1980,7 +1980,7 @@ class GRUUnit(layers.Layer): Returns: tuple: The hidden value, reset-hidden value and gate values. The hidden value is a 2-D tensor with shape :math:`[T, D]` . The reset-hidden value is a - 2-D tensor with shape :math:`[T, D]` . The gate value is a 2-D tensor with + 2-D tensor with shape :math:`[T, D]` . The gate value is a 2-D tensor with shape :math:`[T, 3*D]`. Examples: @@ -2114,7 +2114,7 @@ class NCE(layers.Layer): **weight** (Parameter): the learnable weights of this layer. **bias** (Parameter or None): the learnable bias of this layer. - + Returns: None @@ -2356,7 +2356,7 @@ class PRelu(layers.Layer): Attribute: **weight** (Parameter): the learnable weights of this layer. - + Returns: None @@ -2465,7 +2465,7 @@ class BilinearTensorProduct(layers.Layer): name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None. act (str, optional): Activation to be applied to the output of this layer. The default value is None. - param_attr (ParamAttr, optional): The parameter attribute for the learnable w, parameters/weights of + param_attr (ParamAttr, optional): The parameter attribute for the learnable w, parameters/weights of this layer. The default value is None. bias_attr (ParamAttr, optional): The parameter attribute for the bias of this layer. If it is set to False, no bias will be added to the output units. @@ -3222,7 +3222,7 @@ class TreeConv(layers.Layer): Tree-Based Convolution proposed a kind of data structure called continuous binary tree, which regards multiway tree as binary tree. The paper of Tree-Based Convolution Operator is here: `tree-based convolution `_ . - + Parameters: feature_size(int): last dimension of nodes_vector. output_size(int): output feature width. @@ -3332,7 +3332,7 @@ class Flatten(layers.Layer): Parameters: start_axis(int): first dim to flatten (default = 1) stop_axis(int): last dim to flatten (default = -1). - + Returns: None diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 91f22842a45..8ccb1d1411c 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -76,11 +76,11 @@ def prepare_context(strategy=None): class ParallelEnv(object): """ .. note:: - This API is not recommended, if you need to get rank and world_size, - it is recommended to use ``paddle.distributed.get_rank()`` and + This API is not recommended, if you need to get rank and world_size, + it is recommended to use ``paddle.distributed.get_rank()`` and ``paddle.distributed.get_world_size()`` . - This class is used to obtain the environment variables required for + This class is used to obtain the environment variables required for the parallel execution of ``paddle.nn.Layer`` in dynamic mode. 
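
As the note above recommends, rank and world size are usually read through the module-level helpers rather than ``ParallelEnv``; a minimal sketch, assuming the script is started with ``python -m paddle.distributed.launch``:

.. code-block:: python

    # launched e.g. as: python -m paddle.distributed.launch --gpus=0,1 demo.py
    import paddle.distributed as dist

    rank = dist.get_rank()              # reads PADDLE_TRAINER_ID
    world_size = dist.get_world_size()  # reads PADDLE_TRAINERS_NUM
    print("process %d of %d" % (rank, world_size))
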
The parallel execution in dynamic mode needs to be started using ``paddle.distributed.launch`` @@ -162,7 +162,7 @@ class ParallelEnv(object): # execute this command in terminal: export PADDLE_TRAINER_ID=0 import paddle.distributed as dist - + env = dist.ParallelEnv() print("The rank is %d" % env.rank) # The rank is 0 @@ -181,7 +181,7 @@ class ParallelEnv(object): # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 import paddle.distributed as dist - + env = dist.ParallelEnv() print("The world_size is %d" % env.world_size) # The world_size is 4 @@ -200,7 +200,7 @@ class ParallelEnv(object): # execute this command in terminal: export FLAGS_selected_gpus=1 import paddle.distributed as dist - + env = dist.ParallelEnv() print("The device id are %d" % env.device_id) # The device id are 1 @@ -226,10 +226,10 @@ class ParallelEnv(object): Examples: .. code-block:: python - + # execute this command in terminal: export PADDLE_CURRENT_ENDPOINT=127.0.0.1:6170 import paddle.distributed as dist - + env = dist.ParallelEnv() print("The current endpoint are %s" % env.current_endpoint) # The current endpoint are 127.0.0.1:6170 @@ -239,7 +239,7 @@ class ParallelEnv(object): @property def trainer_endpoints(self): """ - The endpoints of all trainer nodes in the task, + The endpoints of all trainer nodes in the task, which are used to broadcast the NCCL ID when NCCL2 is initialized. Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ENDPOINTS`` . The default value is "". @@ -249,7 +249,7 @@ class ParallelEnv(object): # execute this command in terminal: export PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171 import paddle.distributed as dist - + env = dist.ParallelEnv() print("The trainer endpoints are %s" % env.trainer_endpoints) # The trainer endpoints are ['127.0.0.1:6170', '127.0.0.1:6171'] @@ -268,7 +268,7 @@ class ParallelEnv(object): # execute this command in terminal: export FLAGS_nccl_nrings=1 import paddle.distributed as dist - + env = dist.ParallelEnv() print("The nrings is %d" % env.nrings) # the number of ring is 1 @@ -439,43 +439,43 @@ class DataParallel(layers.Layer): Run the dygraph module with data parallelism. Currently, DataParallel class only supports to run the dynamic graph - with multi-process. - + with multi-process. + Now supports two ways to start training: 1. start by ``paddle.distributed.spawn`` method, for example: ``python demo.py`` (spawn need to be called in ``__main__`` method) - + 2. start by ``paddle.distributed.launch`` module, for example: - + ``python -m paddle.distributed.launch --gpus=0,1 demo.py`` . And the content of `demo.py` is the code of examples. Args: layers(Layer): The module that should be executed by data parallel. - strategy(ParallelStrategy, optional): (deprecated) The strategy of data parallelism, + strategy(ParallelStrategy, optional): (deprecated) The strategy of data parallelism, contains environment configuration related to parallel execution. Default: None. - comm_buffer_size(int, optional): It limits the memory size(MB) of one buffer - parameters' gradient which is the input of communication + comm_buffer_size(int, optional): It limits the memory size(MB) of one buffer + parameters' gradient which is the input of communication calling(e.g NCCLAllReduce). Default: 25. last_comm_buffer_size(float, optional): It limits memory size(MB) of last buffer in communication - calling. Making the last communication buffer size small is useful to + calling. 
Making the last communication buffer size small is useful to improve performance. Default: 1. find_unused_parameters(bool, optional): Whether to traverse the entire backward graph from the - all tensors in the return value of the wrapped model's - forward function. For parameters not involved in loss - calculation, their gradients will be marked as ready in - advance to prepare reduce. Please note that all forward - outputs derived from the wrapped model parameters must - participate in the calculation of loss and subsequent + all tensors in the return value of the wrapped model's + forward function. For parameters not involved in loss + calculation, their gradients will be marked as ready in + advance to prepare reduce. Please note that all forward + outputs derived from the wrapped model parameters must + participate in the calculation of loss and subsequent gradient calculations. If not, serious error will occur. - Note that setting the find_unused_parameters to True + Note that setting the find_unused_parameters to True will affect computing performance. Therefore, if all parameters - are sure to participate in the loss calculation and the + are sure to participate in the loss calculation and the autograd graph construction, please set it False. Default: False. - + Returns: Layer: The data paralleled module. @@ -495,7 +495,7 @@ class DataParallel(layers.Layer): super(LinearNet, self).__init__() self._linear1 = nn.Linear(10, 10) self._linear2 = nn.Linear(10, 1) - + def forward(self, x): return self._linear2(self._linear1(x)) @@ -516,7 +516,7 @@ class DataParallel(layers.Layer): outputs = dp_layer(inputs) labels = paddle.randn([10, 1], 'float32') loss = loss_fn(outputs, labels) - + loss.backward() adam.step() @@ -530,9 +530,9 @@ class DataParallel(layers.Layer): .. note:: - ``PyLayer`` is not supported in DataParallel. To solve problems of this kind, - it's recommended to skip gradient synchronization among multiple cards by 'no_sync', - and manually implement 'all_reduce' before model optimization. There is an example + ``PyLayer`` is not supported in DataParallel. To solve problems of this kind, + it's recommended to skip gradient synchronization among multiple cards by 'no_sync', + and manually implement 'all_reduce' before model optimization. There is an example showing specific implemetation processing. Examples: @@ -728,8 +728,8 @@ class DataParallel(layers.Layer): @contextmanager def no_sync(self): """ - A context manager to stop gradient synchronization. Within no_sync(), - gradients of parameters will only be accumulated on model and not + A context manager to stop gradient synchronization. Within no_sync(), + gradients of parameters will only be accumulated on model and not synchronized util the first forward-backward out of this context. Examples: @@ -744,7 +744,7 @@ class DataParallel(layers.Layer): def __init__(self): super(SimpleNet, self).__init__() self._linear = nn.Linear(10, 1) - + def forward(self, x): return self._linear(x) @@ -782,7 +782,7 @@ class DataParallel(layers.Layer): reason="This method does not need to be called anymore.") def scale_loss(self, loss): """ - Deprecated method, now ``scale_loss`` is an empty method, + Deprecated method, now ``scale_loss`` is an empty method, keep this method just for compatibility. 
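
The ``no_sync`` behaviour described above (accumulate locally, synchronize on the first backward outside the context) amounts to simple gradient accumulation. The snippet below is an illustrative sketch only; it assumes the process group has been started by ``paddle.distributed.launch`` or ``spawn``:

.. code-block:: python

    import paddle
    import paddle.nn as nn
    import paddle.distributed as dist

    dist.init_parallel_env()
    model = paddle.DataParallel(nn.Linear(10, 1))
    opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())

    x1 = paddle.randn([4, 10], 'float32')
    x2 = paddle.randn([4, 10], 'float32')

    with model.no_sync():
        model(x1).mean().backward()   # gradients only accumulate on this card
    model(x2).mean().backward()       # gradients are all-reduced here

    opt.step()
    opt.clear_grad()
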
""" return loss @@ -791,7 +791,7 @@ class DataParallel(layers.Layer): reason="This method does not need to be called anymore.") def apply_collective_grads(self): """ - Deprecated method, now ``apply_collective_grads`` is an empty method, + Deprecated method, now ``apply_collective_grads`` is an empty method, keep this method just for compatibility. """ return @@ -838,7 +838,7 @@ class DataParallel(layers.Layer): Parameters: state_dict(dict) : Dict contains all the parameters and persistable buffers. - use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter or buffer name as key. + use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter or buffer name as key. Default: True Returns: None diff --git a/python/paddle/fluid/dygraph/rnn.py b/python/paddle/fluid/dygraph/rnn.py index 837287faa0f..e7053366297 100644 --- a/python/paddle/fluid/dygraph/rnn.py +++ b/python/paddle/fluid/dygraph/rnn.py @@ -66,18 +66,18 @@ class LSTMCell(Layer): is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|None): The parameter attribute for the bias of LSTMCell. - If it is set to None or one attribute of ParamAttr, LSTMCell will + If it is set to None or one attribute of ParamAttr, LSTMCell will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized as zero. Default: None. gate_activation (function|None): The activation function for gates (actGate). Default: 'fluid.layers.sigmoid' activation (function|None): The activation function for cells (actNode). Default: 'fluid.layers.tanh' - forget_bias(float|1.0): forget bias used when computing forget gate. This + forget_bias(float|1.0): forget bias used when computing forget gate. This is not used in default LSTMCell implementation (CUDNN compatiable) use_cudnn_impl(bool|True): whether to use CUDNN compatible LSTMCell dtype(string): data type used in this cell - + Returns: None @@ -107,7 +107,7 @@ class LSTMCell(Layer): step_input_var = fluid.dygraph.to_variable(step_input_np) pre_hidden_var = fluid.dygraph.to_variable(pre_hidden_np) pre_cell_var = fluid.dygraph.to_variable(pre_cell_np) - new_hidden, new_cell = cudnn_lstm(step_input_var, pre_hidden_var, pre_cell_var) + new_hidden, new_cell = cudnn_lstm(step_input_var, pre_hidden_var, pre_cell_var) """ @@ -273,7 +273,7 @@ class GRUCell(Layer): is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|None): The parameter attribute for the bias of GRUCell. - If it is set to None or one attribute of ParamAttr, GRUCell will + If it is set to None or one attribute of ParamAttr, GRUCell will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. gate_activation (function|None): The activation function for gates (actGate). @@ -282,7 +282,7 @@ class GRUCell(Layer): Default: 'fluid.layers.tanh' use_cudnn_impl(bool|True): whether to use CUDNN compatible LSTMCell dtype(string): data type used in this cell - + Returns: None diff --git a/python/paddle/fluid/dygraph/static_runner.py b/python/paddle/fluid/dygraph/static_runner.py index e8738da07e9..f312b9e1101 100644 --- a/python/paddle/fluid/dygraph/static_runner.py +++ b/python/paddle/fluid/dygraph/static_runner.py @@ -26,7 +26,7 @@ class StaticModelRunner(object): and then performing fine-tune training or inference. .. 
note:: - This is a temporary API, which will be deprecated later, please use + This is a temporary API, which will be deprecated later, please use `fluid.dygraph.jit.load` to achieve the same function. """ diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index 4627d6d11e7..932e3ab207e 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -95,11 +95,11 @@ name_mapping = { class Tracer(core.Tracer): """ :api_attr: imperative - - Tracer is used to execute and record the operators executed, to construct the + + Tracer is used to execute and record the operators executed, to construct the computation graph in dygraph model. Tracer has two mode, :code:`train_mode` - and :code:`eval_mode`. In :code:`train_mode`, Tracer would add backward network - automatically and perform AutoGrad by method :code:`loss.backward()`. + and :code:`eval_mode`. In :code:`train_mode`, Tracer would add backward network + automatically and perform AutoGrad by method :code:`loss.backward()`. In :code:`eval_mode`, Tracer would not add backward network. This is a low level API, users don't need to use it directly. diff --git a/python/paddle/fluid/dygraph_utils.py b/python/paddle/fluid/dygraph_utils.py index 849191f5463..d93915e8bb5 100644 --- a/python/paddle/fluid/dygraph_utils.py +++ b/python/paddle/fluid/dygraph_utils.py @@ -25,7 +25,7 @@ def _append_activation_in_dygraph(input, """Append activation in dygraph mode. Args: - input: the input variable. + input: the input variable. act: activation type use_mkldnn: if use mkldnn use_cudnn: if use cudnn @@ -50,7 +50,7 @@ def _append_bias_in_dygraph(input, bias=None, axis=1, use_mkldnn=False): """Append bias operation in dygraph mode. Args: - input: the input variable. + input: the input variable. bias: the bias to be appended axis: the axis to perform operation use_mkldnn: whether to use mkldnn diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py index 510733d4c1c..52e9d91b932 100644 --- a/python/paddle/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -127,7 +127,7 @@ class Evaluator(object): class ChunkEvaluator(Evaluator): """ - Warning: This would be deprecated in the future. Please use fluid.metrics.ChunkEvaluator + Warning: This would be deprecated in the future. Please use fluid.metrics.ChunkEvaluator instead. Accumulate counter numbers output by chunk_eval from mini-batches and diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 9a839acde04..58baf0928d9 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -78,7 +78,7 @@ def _switch_scope(scope): @signature_safe_contextmanager def scope_guard(scope): """ - + This function switches scope through python `with` statement. Scope records the mapping between variable names and variables ( :ref:`api_guide_Variable` ), similar to brackets in programming languages. @@ -96,7 +96,7 @@ def scope_guard(scope): None Examples: - + .. code-block:: python import paddle @@ -164,10 +164,10 @@ def dtype_is_compatible_with(first, second): """ Returns True if the first dtype can be compatible the second one. Currently, we require the two dtype's have to be same. - + Args: dtype (np.dtype|VarType|str): The type of data: float32, int64, etc. - + Returns: True if the two types are same. """ @@ -223,7 +223,7 @@ def check_feed_shape_type(var, feed, num_places=1): 2. Each non-negative number of the two dimensions are same. 3. 
For negative number or 'None' in a dimension, it means unknown so it is compatible with any number. - + Args: var (Variable): the Variable object feed (LoDTensor): the fed value, which must be a LoDTensor @@ -621,7 +621,7 @@ class _StandaloneExecutor(object): Args: feed_names(list): This parameter represents the input names of the model. fetch_list(list): This parameter represents the Tensors that need to be returned - after the model runs. The default is None. + after the model runs. The default is None. return_numpy(bool): This parameter indicates whether convert the fetched Tensors (the Tensor specified in the fetch list) to numpy.ndarray. if it is False, the type of the return value is a list of :code:`LoDTensor`. The default is True. @@ -642,10 +642,10 @@ class _StandaloneExecutor(object): def _update_feed(self, feed): """ - Update the feed dict, remove the feed item which is pruned in program. + Update the feed dict, remove the feed item which is pruned in program. Notes: This is a very low level API. Users should not use this API - directly. + directly. Args: feed(list|dict): feed dict or list. @@ -825,10 +825,10 @@ class Executor(object): will set the default device according to its installation version. If Paddle is CPU version, the default device would be set to `CPUPlace()` . If Paddle is GPU version, the default device would be set to `CUDAPlace(0)` . Default is None. - If ``place`` is string, it can be ``cpu``, and ``gpu:x``, where ``x`` + If ``place`` is string, it can be ``cpu``, and ``gpu:x``, where ``x`` is the index of the GPUs. Note: users only pass one Place or None to initialize Executor when using multiple-cards. Other APIs will override the cards. See - `document for multiple-cards `_ + `document for multiple-cards `_ Returns: Executor @@ -1007,7 +1007,7 @@ class Executor(object): Returns: optimize_ops(list): The optimize operators splited from fetch_list. - fetch_list(list): The updated fetch_list which does not contain optimize operators. + fetch_list(list): The updated fetch_list which does not contain optimize operators. """ _optimize_ops = [] _fetch_list = [] @@ -1055,12 +1055,12 @@ class Executor(object): optimize_ops=None): """ Prune operators and variables which are not needed to generate - :code:`fetch_list` and optimize operators. - Prune operators and variables which are needed - to generate variables to be feeded. + :code:`fetch_list` and optimize operators. + Prune operators and variables which are needed + to generate variables to be feeded. Notes: This is a very low level API. Users should not use this API - directly. + directly. Args: program(Program): the origin program @@ -1114,10 +1114,10 @@ class Executor(object): @classmethod def _update_feed(cls, program, feed): """ - Update the feed dict, remove the feed item which is pruned in program. + Update the feed dict, remove the feed item which is pruned in program. Notes: This is a very low level API. Users should not use this API - directly. + directly. Args: program(Program): the pruned program. @@ -1288,12 +1288,12 @@ class Executor(object): so the length of this list should be equal to the number of places. The default is None. fetch_list(list): This parameter represents the Tensors that need to be returned - after the model runs. The default is None. + after the model runs. The default is None. feed_var_name(str): This parameter represents the name of the input Tensor of the feed operator. The default is "feed". 
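
A small static-graph program makes the ``feed`` and ``fetch_list`` arguments described above concrete; a minimal CPU sketch (the variable names are arbitrary):

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.static as static

    paddle.enable_static()

    x = static.data(name='x', shape=[None, 2], dtype='float32')
    y = paddle.mean(x)

    exe = static.Executor(paddle.CPUPlace())
    exe.run(static.default_startup_program())

    out, = exe.run(static.default_main_program(),
                   feed={'x': np.ones((4, 2), 'float32')},
                   fetch_list=[y])
    print(out)   # 1.0, returned as numpy because return_numpy defaults to True
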
fetch_var_name(str): This parameter represents the name of the output Tensor of the fetch operator. The default is "fetch". - scope(Scope): the scope used to run this program, you can switch + scope(Scope): the scope used to run this program, you can switch it to different scope. default is :code:`paddle.static.global_scope()` return_numpy(bool): This parameter indicates whether convert the fetched Tensors (the Tensor specified in the fetch list) to numpy.ndarray. if it is False, @@ -1314,14 +1314,14 @@ class Executor(object): results are variant, please set :code:`return_merged` as False, which denotes that the fetched results will not be merged. The default is True, but it is just for the compatibility, and may use False as default value in the future version. - use_prune(bool): This parameter indicates whether the input :code:`Program` will be pruned. + use_prune(bool): This parameter indicates whether the input :code:`Program` will be pruned. If the parameter is True, the program will be pruned accroding to the given feed and fetch_list, - which means the operators and variables in program that generate :code:`feed` and are not - needed to generate :code:`fetch_list` will be pruned. The default is False, which means the + which means the operators and variables in program that generate :code:`feed` and are not + needed to generate :code:`fetch_list` will be pruned. The default is False, which means the program will not pruned and all the operators and variables will be executed during running. - Note that if the tuple returned from :code:`Optimizer.minimize()` is passed to :code:`fetch_list`, + Note that if the tuple returned from :code:`Optimizer.minimize()` is passed to :code:`fetch_list`, :code:`use_prune` will be overrided to True, and the program will be pruned. - + Returns: List: The fetched result list. @@ -2617,7 +2617,7 @@ class Executor(object): for each run. default is global_scope thread(int): number of thread a user wants to run in this function. Default is 0, which means using thread num of dataset - debug(bool): whether a user wants to run train_from_dataset + debug(bool): whether a user wants to run train_from_dataset fetch_list(Tensor List): fetch Tensor list, each variable will be printed during training fetch_info(String List): print information for each Tensor, its length should be equal @@ -2627,9 +2627,9 @@ class Executor(object): Returns: None - + Examples: - + .. code-block:: python import paddle diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 82a887be414..a4400d6272f 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -283,15 +283,15 @@ def ipu_shard_guard(index=-1, stage=-1): index(int, optional): Specify which ipu the Tensor is computed on, (such as '0, 1, 2, 3'). The default value is -1, which means the Op only run on IPU 0. stage(int, optional): Specify the computation order of the sharded model(such as '0, 1, 2, 3'). - The sharded model will be computed from small to large. The default value is -1, + The sharded model will be computed from small to large. The default value is -1, which means no pipelining computation order and run Ops in terms of graph. - + **Note**: - Only if the enable_manual_shard=True, the 'index' is able to be set not -1. Please refer - to :code:`paddle.static.IpuStrategy` . - Only if the enable_pipelining=True, the 'stage' is able to be set not -1. Please refer + Only if the enable_manual_shard=True, the 'index' is able to be set not -1. 
Please refer + to :code:`paddle.static.IpuStrategy` . + Only if the enable_pipelining=True, the 'stage' is able to be set not -1. Please refer to :code:`paddle.static.IpuStrategy` . - A index is allowed to match none stage or a stage. A stage is only allowed to match a new or + A index is allowed to match none stage or a stage. A stage is only allowed to match a new or duplicated index. Examples: @@ -336,7 +336,7 @@ def set_ipu_shard(call_func, index=-1, stage=-1): index(int, optional): Specify which ipu the Tensor is computed on, (such as ‘0, 1, 2, 3’). The default value is -1, which means the Op only run on IPU 0. stage(int, optional): Specify the computation order of the sharded model(such as ‘0, 1, 2, 3’). - The sharded model will be computed from small to large. The default value is -1, + The sharded model will be computed from small to large. The default value is -1, which means no pipelining computation order and run Ops in terms of graph. Returns: @@ -639,12 +639,12 @@ def _set_expected_place(place): # TODO(zhiqiu): remove this function. def _var_base_to_np(var_base): - """ - convert VarBase tp numpy + """ + convert VarBase tp numpy - Args: - var_base(VarBase) : the VarBase to convert - Returns (np.ndarray): the np.ndarray contain the value of VarBase + Args: + var_base(VarBase) : the VarBase to convert + Returns (np.ndarray): the np.ndarray contain the value of VarBase """ warnings.warn( @@ -742,14 +742,14 @@ def disable_signal_handler(): Paddle installs signal handlers at C++ level to log debug information upon failing. However, conflicts can happen if another python module is making use of such signal. Such being the case, one may disblae paddle signal handler via this interface. - + Known frameworks that require disabling signal handler includes: 1. TVM 2. ADLIK Make sure you called paddle.disable_signal_handler() before using above mentioned frameworks. - Returns: None + Returns: None Examples: .. code-block:: python @@ -822,7 +822,7 @@ def cuda_places(device_ids=None): If :code:`device_ids` is not None, it should be the device ids of GPUs. For example, if :code:`device_ids=[0,1,2]`, - the returned list would be + the returned list would be [paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)]. Parameters: @@ -832,14 +832,14 @@ def cuda_places(device_ids=None): list of paddle.CUDAPlace: Created GPU place list. Examples: - + .. code-block:: python import paddle import paddle.static as static # required: gpu - + paddle.enable_static() cuda_places = static.cuda_places() @@ -867,9 +867,9 @@ def xpu_places(device_ids=None): xpu places would be returned. If :code:`device_ids` is not None, it should be the device ids of XPUs. For example, if :code:`device_ids=[0,1,2]`, - the returned list would be + the returned list would be [paddle.XPUPlace(0), paddle.XPUPlace(1), paddle.XPUPlace(2)]. - + Parameters: device_ids (list or tuple of int, optional): list of XPU device ids. Returns: @@ -881,7 +881,7 @@ def xpu_places(device_ids=None): import paddle import paddle.static as static - + paddle.enable_static() xpu_places = static.xpu_places() """ @@ -898,7 +898,7 @@ def npu_places(device_ids=None): """ **Note**: For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device. - + This function creates a list of :code:`paddle.NPUPlace` objects. If :code:`device_ids` is None, environment variable of :code:`FLAGS_selected_npus` would be checked first. 
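
The place-list helpers above all follow the same pattern: with explicit ids they ignore the corresponding environment variable, otherwise they fall back to it. A short sketch for the CPU and GPU variants (the explicit count and ids are illustrative):

.. code-block:: python

    import paddle
    import paddle.static as static

    paddle.enable_static()

    # four CPU places, regardless of the CPU_NUM environment variable
    cpu_places = static.cpu_places(device_count=4)

    # on a GPU build, explicit ids avoid depending on FLAGS_selected_gpus
    # cuda_places = static.cuda_places(device_ids=[0, 1])
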
For example, if @@ -908,9 +908,9 @@ def npu_places(device_ids=None): npu places would be returned. If :code:`device_ids` is not None, it should be the device ids of NPUs. For example, if :code:`device_ids=[0,1,2]`, - the returned list would be + the returned list would be [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. - + Parameters: device_ids (list or tuple of int, optional): list of NPU device ids. Returns: @@ -922,7 +922,7 @@ def npu_places(device_ids=None): import paddle import paddle.static as static - + paddle.enable_static() npu_places = static.npu_places() """ @@ -940,7 +940,7 @@ def cpu_places(device_count=None): This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list. If :code:`device_count` is None, the device count would - be determined by environment variable :code:`CPU_NUM`. + be determined by environment variable :code:`CPU_NUM`. If :code:`CPU_NUM` is not set, the default value is 1, i.e. CPU_NUM=1. :code:`CPU_NUM` indicates the number of devices used in the current task. @@ -953,7 +953,7 @@ def cpu_places(device_count=None): list of paddle.CPUPlace: Created list of CPU places. Examples: - + .. code-block:: python import paddle @@ -974,7 +974,7 @@ def cuda_pinned_places(device_count=None): This function creates a list of :code:`fluid.CUDAPinnedPlace` objects. If :code:`device_count` is None, the device count would - be determined by environment variable :code:`CPU_NUM`. + be determined by environment variable :code:`CPU_NUM`. If :code:`CPU_NUM` is not set, the default value is 1, i.e. CPU_NUM=1. :code:`CPU_NUM` indicates the number of devices used in the current task. @@ -1077,7 +1077,7 @@ def name_scope(prefix=None): Generate hierarchical name prefix for the operators in Static Graph. - Note: + Note: This should only used for debugging and visualization purpose. Don't use it for serious analysis such as graph/program transformations. Don't use it in dygraph, since it will cause memory leak. @@ -1086,7 +1086,7 @@ def name_scope(prefix=None): prefix(str, optional): prefix. Default is none. Examples: - + .. code-block:: python import paddle @@ -1103,7 +1103,7 @@ def name_scope(prefix=None): with paddle.static.name_scope("s4"): g = f - 1 - # Op are created in the default main program. + # Op are created in the default main program. for op in paddle.static.default_main_program().block(0).ops: # elementwise_add is created in /s1/ if op.type == 'elementwise_add': @@ -1758,7 +1758,7 @@ class Variable(object): def element_size(self): """ Returns the size in bytes of an element in the Tensor. - + Examples: .. code-block:: python @@ -2064,7 +2064,7 @@ class Variable(object): def clone(self): """ Returns a new static Variable, which is the clone of the original static - Variable. It remains in the current graph, that is, the cloned Variable + Variable. It remains in the current graph, that is, the cloned Variable provides gradient propagation. Calling ``out = tensor.clone()`` is same as ``out = assign(tensor)`` . @@ -2116,7 +2116,7 @@ class Variable(object): key(str): Key for this information. value(object): The value associated to the key. - Returns: + Returns: None """ if not hasattr(self, "_info"): @@ -2130,7 +2130,7 @@ class Variable(object): Args: key(str): Key for this information. - Returns: + Returns: object """ if hasattr(self, "_info") and key in self._info: @@ -2293,10 +2293,10 @@ class Variable(object): def get_value(self, scope=None): """ - Get the value of variable in given scope. 
+ Get the value of variable in given scope. Args: - scope(Scope, optional) : If `scope` is None, it will be set to global scope + scope(Scope, optional) : If `scope` is None, it will be set to global scope obtained through 'paddle.static.global_scope()'. Otherwise, use `scope`. Default: None @@ -2307,7 +2307,7 @@ class Variable(object): .. code-block:: python import paddle - import paddle.static as static + import paddle.static as static import numpy as np paddle.enable_static() @@ -2352,22 +2352,22 @@ class Variable(object): def set_value(self, value, scope=None): ''' - Set the value to the tensor in given scope. + Set the value to the tensor in given scope. Args: value(Tensor/ndarray) : The value to be set. - scope(Scope, optional) : If `scope` is None, it will be set to global scope + scope(Scope, optional) : If `scope` is None, it will be set to global scope obtained through 'paddle.static.global_scope()'. Otherwise, use `scope`. Default: None Returns: None - + Examples: .. code-block:: python import paddle - import paddle.static as static + import paddle.static as static import numpy as np paddle.enable_static() @@ -3849,7 +3849,7 @@ class Block(object): def _insert_op_without_sync(self, index, *args, **kwargs): """ - Insert an Operator according to the giving arguments, + Insert an Operator according to the giving arguments, without sync_with_cpp to meke the compilation faster. Args: @@ -5461,8 +5461,8 @@ class Program(object): def clone(self, for_test=False): """ .. note::: - 1. :code:`Program.clone()` method DOES NOT clone :ref:`api_paddle_io_DataLoader` . - 2. Recommend you to use :code:`clone` before using :code:`Opimizer.minimize` . + 1. :code:`Program.clone()` method DOES NOT clone :ref:`api_paddle_io_DataLoader` . + 2. Recommend you to use :code:`clone` before using :code:`Opimizer.minimize` . 3. This API has no effect in Dygraph Mode. Create a new Program with forward content of original one when ``for_test=True``. @@ -5690,8 +5690,8 @@ class Program(object): def _prune_with_input(self, feeded_var_names, targets): """ Prune operators and variables which are not needed to generate - :code:`targets`. Prune operators and variables which are needed - to generate feeded_var + :code:`targets`. Prune operators and variables which are needed + to generate feeded_var Notes: This is a very low level API. Users should not use this API directly. This API is in flux and not stable. @@ -5957,7 +5957,7 @@ class Program(object): def parse_from_string(binary_str): """ .. note:: - 1. All information about parameters will be lost after serialization; + 1. All information about parameters will be lost after serialization; 2. This API has no effect in Dygraph mode. Deserialize a Program from `protobuf `_ binary string. @@ -6022,7 +6022,7 @@ class Program(object): The default random seed for random operators in Program. ``0`` means get the random seed from random device. - .. note:: + .. note:: It must be set before the operators have been added. Returns: @@ -6060,7 +6060,7 @@ class Program(object): """ The number of :ref:`api_guide_Block_en` in this Program. - .. note:: + .. note:: This API has no effect in Dygraph mode. Returns: @@ -6268,8 +6268,8 @@ class Program(object): Args: other(Program): Other program pruned_origin_block_id_map(dict{int:int}): A dict which maps the block id in program - self to the block id in program other. For example, {0:0, 1:1, 2:3} means block 0 in self is - cloned from block 0 in other, etc. 
Default is None, which means default mapped, + self to the block id in program other. For example, {0:0, 1:1, 2:3} means block 0 in self is + cloned from block 0 in other, etc. Default is None, which means default mapped, {0:0, 1:1,..., n:n}. Returns: @@ -6375,12 +6375,12 @@ class Program(object): This function MUST called after run start_up_program Args: - mode(str, optional): Source of the obtained parameters and buffers. - 'opt' : The return value only contains the variable in the optimizer. - 'param' : The return value only contains the variable in the network, not the variable in the optimizer. + mode(str, optional): Source of the obtained parameters and buffers. + 'opt' : The return value only contains the variable in the optimizer. + 'param' : The return value only contains the variable in the network, not the variable in the optimizer. 'all' : The return value contains the variable in the network and optimizer. Default: 'all' - scope(Scope, optional) : If scope is None, state_dict will be set to global scope + scope(Scope, optional) : If scope is None, state_dict will be set to global scope obtained through 'paddle.static.global_scope()'. Otherwise, value will be set to scope. Default: None @@ -6467,20 +6467,20 @@ class Program(object): def set_state_dict(self, state_dict, scope=None): """ - Set parameters and persistable buffers in state_dict to program. + Set parameters and persistable buffers in state_dict to program. An exception will throw if shape or dtype of the parameters is not match. - + .. note:: This function MUST called after run start_up_program Args: - state_dict(dict): the dict store parameters and persistable buffers. + state_dict(dict): the dict store parameters and persistable buffers. The key is the name of the parameter or the name of the buffer. The value is the tensor of this variable in the given scope. - scope(Scope, optional) : If scope is None, state_dict will be set to global scope + scope(Scope, optional) : If scope is None, state_dict will be set to global scope obtained through 'paddle.static.global_scope()'. Otherwise, value will be set to scope. Default: None - + Returns: None @@ -6556,7 +6556,7 @@ class Parameter(Variable): be applied on the parameter. Default: None do_model_average(bool): True if the model average strategy will be applied on this parameter. - need_clip (bool): Whether the parameter gradient need to be cliped + need_clip (bool): Whether the parameter gradient need to be cliped in optimizer. Default is True. """ @@ -6645,8 +6645,8 @@ class Parameter(Variable): class ParamBase(core.VarBase): """ - ParamBase is derived from Tensor( Which is the concept in Dygraph Mode). - A ParamBase is a persistable Tensor, and will be updated by optimizers + ParamBase is derived from Tensor( Which is the concept in Dygraph Mode). + A ParamBase is a persistable Tensor, and will be updated by optimizers after each iteration. The training of a neural network is essentially the updating of its ParamBase. @@ -6664,7 +6664,7 @@ class ParamBase(core.VarBase): be applied on the ParamBase. Default: None do_model_average(bool): True if the model average strategy will be applied on this ParamBase. - need_clip (bool): Whether the parameter gradient need to be cliped + need_clip (bool): Whether the parameter gradient need to be cliped in optimizer. Default is True. """ @@ -6791,8 +6791,8 @@ else: class EagerParamBase(_core_eager_eagertensor): """ - EagerParamBase is derived from Tensor( Which is the concept in Eager-Dygraph Mode). 
- A EagerParamBase is a persistable Tensor, and will be updated by optimizers + EagerParamBase is derived from Tensor( Which is the concept in Eager-Dygraph Mode). + A EagerParamBase is a persistable Tensor, and will be updated by optimizers after each iteration. The training of a neural network is essentially the updating of its EagerParamBase. @@ -6810,7 +6810,7 @@ class EagerParamBase(_core_eager_eagertensor): be applied on the EagerParamBase. Default: None do_model_average(bool): True if the model average strategy will be applied on this EagerParamBase. - need_clip (bool): Whether the parameter gradient need to be cliped + need_clip (bool): Whether the parameter gradient need to be cliped in optimizer. Default is True. """ @@ -6963,7 +6963,7 @@ def default_startup_program(): Get default/global startup program. The :code:`paddle.nn` function will append the initialization operators into startup program. - The :code:`startup_program` will initialize the parameters by the OPs. + The :code:`startup_program` will initialize the parameters by the OPs. This method will return the default or the current startup program. Users can use :ref:`api_paddle_fluid_framework_program_guard` to switch :ref:`api_paddle_fluid_framework_Program` . @@ -6971,7 +6971,7 @@ def default_startup_program(): Returns: Program: current default startup program. - Returns type: + Returns type: Examples: .. code-block:: python @@ -6989,13 +6989,13 @@ def default_startup_program(): def default_main_program(): """ - This API can be used to get ``default main program`` which store the + This API can be used to get ``default main program`` which store the descriptions of Ops and tensors. - For example ``z = paddle.add(x, y)`` will create a new ``add`` - Op and a new ``z`` tensor, and they will be recorded in ``default main program`` . + For example ``z = paddle.add(x, y)`` will create a new ``add`` + Op and a new ``z`` tensor, and they will be recorded in ``default main program`` . - The ``default main program`` is the default value for ``Program`` parameter in + The ``default main program`` is the default value for ``Program`` parameter in a lot of APIs. For example, the :code:`Executor.run()` will execute the :code:`default_main_program` when the program is not specified. @@ -7065,8 +7065,8 @@ def program_guard(main_program, startup_program=None): Args: main_program(Program): New main program inside ``with`` statement. - startup_program(Program, optional): New startup program inside ``with`` - statement. :code:`None` means not changing startup program, + startup_program(Program, optional): New startup program inside ``with`` + statement. :code:`None` means not changing startup program, default_startup_program is still used. Default: None. @@ -7173,7 +7173,7 @@ def switch_device(device): @signature_safe_contextmanager def device_guard(device=None): """ - + Note: The API only supports static mode. @@ -7181,7 +7181,7 @@ def device_guard(device=None): Args: device(str|None): Specify the device to use in the context. It should be ``cpu``, - ``gpu`` or ``gpu:x``, where ``x`` is the index of the GPUs. + ``gpu`` or ``gpu:x``, where ``x`` is the index of the GPUs. When it is set to 'cpu' or 'gpu', all OPs created in the context will be placed on CPUPlace or CUDAPlace. When 'gpu' is set and the program runs on single-card, the device index will be the same as the device on which the @@ -7189,9 +7189,9 @@ def device_guard(device=None): assigned devices. Examples: - + .. 
code-block:: python - + # required: gpu import paddle diff --git a/python/paddle/fluid/generator.py b/python/paddle/fluid/generator.py index 7ce2d6a4bf3..5bbe7a0e12c 100644 --- a/python/paddle/fluid/generator.py +++ b/python/paddle/fluid/generator.py @@ -27,7 +27,7 @@ class Generator(core.Generator): Create a generator object which manages the random number generation. ( Experimental Feature ) Parameters: - place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str,optional): The place to allocate Tensor. Can be + place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str,optional): The place to allocate Tensor. Can be CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is string, it can be ``cpu`` and ``gpu:x``, where ``x`` is the index of the GPUs. diff --git a/python/paddle/fluid/incubate/data_generator/__init__.py b/python/paddle/fluid/incubate/data_generator/__init__.py index 0ef851f52e7..60bfa9eb110 100644 --- a/python/paddle/fluid/incubate/data_generator/__init__.py +++ b/python/paddle/fluid/incubate/data_generator/__init__.py @@ -41,7 +41,7 @@ class DataGenerator(object): ''' Set batch size of current DataGenerator This is necessary only if a user wants to define generator_batch - + Example: .. code-block:: python import paddle.fluid.incubate.data_generator as dg @@ -57,7 +57,7 @@ class DataGenerator(object): yield ("words", s[1].extend([s[1][0]])) mydata = MyData() mydata.set_batch(128) - + ''' self.batch_size_ = batch_size @@ -95,12 +95,12 @@ class DataGenerator(object): def run_from_stdin(self): ''' This function reads the data row from stdin, parses it with the - process function, and further parses the return value of the + process function, and further parses the return value of the process function with the _gen_str function. The parsed data will be wrote to stdout and the corresponding protofile will be generated. Example: - + .. code-block:: python import paddle.fluid.incubate.data_generator as dg class MyData(dg.DataGenerator): @@ -144,16 +144,16 @@ class DataGenerator(object): def generate_sample(self, line): ''' - This function needs to be overridden by the user to process the + This function needs to be overridden by the user to process the original data row into a list or tuple. Args: line(str): the original data row Returns: Returns the data processed by the user. - The data format is list or tuple: - [(name, [feasign, ...]), ...] + The data format is list or tuple: + [(name, [feasign, ...]), ...] or ((name, [feasign, ...]), ...) - + For example: [("words", [1926, 08, 17]), ("label", [1])] or (("words", [1926, 08, 17]), ("label", [1])) @@ -259,13 +259,13 @@ class MultiSlotDataGenerator(DataGenerator): user, outputting data that can be directly read by the MultiSlotDataFeed, and updating proto_info information. The input line will be in this format: - >>> [(name, [feasign, ...]), ...] + >>> [(name, [feasign, ...]), ...] >>> or ((name, [feasign, ...]), ...) The output will be in this format: >>> [ids_num id1 id2 ...] ... The proto_info will be in this format: >>> [(name, type), ...] 
- + For example, if the input is like this: >>> [("words", [1926, 08, 17]), ("label", [1])] >>> or (("words", [1926, 08, 17]), ("label", [1])) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py index 0018b73e264..a227cb3037d 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/heter_trainer_pass.py @@ -57,7 +57,7 @@ def split_trainer_ops_pass(program, config, default_device="cpu"): split cpu-trainer program from origin-program 1. find heter op (located on different device) 2. find input&output of every heter-block - 3. create cpu-trainer program, add send&recv op + 3. create cpu-trainer program, add send&recv op """ # Todo: support user define default_device (MrChengmo) default_device_ = default_device diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py index 38a4a14b02f..ed2c93f4e72 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py @@ -49,7 +49,7 @@ def _same_or_split_var(p_name, var_name): def _get_optimizer_input_shape(op_type, varkey, orig_shape, param_shape): """ Returns the shape for optimizer inputs that need to be reshaped when - Param and Grad is split to multiple servers. + Param and Grad is split to multiple servers. """ # HACK(typhoonzero) : Should use functions of corresponding optimizer in # optimizer.py to get the shape, do not bind this in the transpiler. diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 18755212cc1..e34688f2a51 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -794,7 +794,7 @@ def find_heter_ops(program, default_device="cpu"): if no_grad_var in var2idx: """ insert sum op & remove sum op from var2idx and origin place - + """ op_list = list(block.ops) sum_op = op_list[var2idx[no_grad_var]] @@ -1442,7 +1442,7 @@ def union_forward_gradient_op(program_block_ops_list): block_length = len(program_block_ops_list) ''' ## get the final part - final_part_idx = -1 + final_part_idx = -1 for i in range(block_length): op_list = program_block_ops_list[i] for op in op_list: @@ -1451,7 +1451,7 @@ def union_forward_gradient_op(program_block_ops_list): break if final_part_idx != -1: break - + ## eliminate wrong partition because of sum op ## lookup_table_v2_grad ## every looup_table_v2_grad op block should follow a sum op @@ -1470,9 +1470,9 @@ def union_forward_gradient_op(program_block_ops_list): if forward_op_type in SPARSE_OP_TYPE_DICT.keys() \ and op.attr('remote_prefetch') is True: param_name = op.input(SPARSE_OP_TYPE_DICT[forward_op_type])[0] - - var2idx[] = [i,j] ## - + + var2idx[] = [i,j] ## + ''' union_program_block_ops_list = [] diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 3d625d47f30..ef6c34b23fd 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -1106,7 +1106,7 @@ class 
DownpourOptimizer(DistributedOptimizer): in `parameter_list`. parameter_list (list): list of Variables to update. no_grad_set (set|None): set of Variables should be ignored. - program_mode (str|"all_reduce"): grad action for grogram when use_ps_gpu. + program_mode (str|"all_reduce"): grad action for grogram when use_ps_gpu. Returns: tuple: (optimize_ops, params_grads) which are, list of operators appended; and list of (param, grad) Variables pair for optimization. diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index 48ce51b3724..365043f92d0 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -1582,8 +1582,8 @@ class FleetUtil(object): def parse_program_proto(self, prog_path, is_text, output_dir): """ - Parse program.proto into a more readable format. - This function will generate three files: + Parse program.proto into a more readable format. + This function will generate three files: output_dir/vars_all.log, output_dir/vars_persistable.log, output_dir/ops.log. diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py index fb1b36e33c5..3f45842f1ad 100644 --- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py +++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py @@ -119,8 +119,8 @@ class HDFSClient(FS): @_handle_errors() def ls_dir(self, fs_path): - """ - list directory under fs_path, and only give the pure name, not include the fs_path + """ + list directory under fs_path, and only give the pure name, not include the fs_path """ if not self.is_exist(fs_path): return [], [] diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 26ed67f6e8c..8ddb94efc0d 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -127,7 +127,7 @@ class ConstantInitializer(Initializer): """Implements the constant initializer Args: - value (float32): constant value to initialize the variable + value (float32): constant value to initialize the variable Examples: .. code-block:: python @@ -1169,11 +1169,11 @@ def set_global_initializer(weight_init, bias_init=None): After this API is invoked, the global initializer will takes effect in subsequent code. - The model parameters include ``weight`` and ``bias`` . In the framework, they correspond + The model parameters include ``weight`` and ``bias`` . In the framework, they correspond to ``paddle.ParamAttr`` , which is inherited from ``paddle.Tensor`` , and is a persistable Variable. - This API only takes effect for model parameters, not for variables created through apis such as + This API only takes effect for model parameters, not for variables created through apis such as :ref:`api_fluid_layers_create_global_var` , :ref:`api_fluid_layers_create_tensor`. - + If the initializer is also set up by ``param_attr`` or ``bias_attr`` when creating a network layer, the global initializer setting here will not take effect because it has a lower priority. @@ -1181,7 +1181,7 @@ def set_global_initializer(weight_init, bias_init=None): Args: weight_init (Initializer): set the global initializer for ``weight`` of model parameters. - bias_init (Initializer, optional): set the global initializer for ``bias`` of model parameters. + bias_init (Initializer, optional): set the global initializer for ``bias`` of model parameters. Default: None. 
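
A short sketch of the behaviour described above: the global default applies to layers created after the call, and a per-layer ``weight_attr``/``bias_attr`` still has higher priority. The initializers and layer sizes are illustrative:

.. code-block:: python

    import paddle
    import paddle.nn as nn

    # every parameter created from here on defaults to these initializers
    nn.initializer.set_global_initializer(nn.initializer.Uniform(), nn.initializer.Constant())

    x = paddle.randn([2, 10], 'float32')
    linear1 = nn.Linear(10, 4)          # weight ~ Uniform, bias = 0
    y = linear1(x)

    # an explicit attr overrides the global setting for this layer only
    linear2 = nn.Linear(10, 4, weight_attr=nn.initializer.XavierUniform())

    # cancel the global initializer
    nn.initializer.set_global_initializer(None)
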
Returns: @@ -1204,7 +1204,7 @@ def set_global_initializer(weight_init, bias_init=None): # If set param_attr/bias_attr too, global initializer will not take effect # The weight of conv2 is initialized by Xavier # The bias of conv2 is initialized by Normal - conv2 = nn.Conv2D(4, 6, (3, 3), + conv2 = nn.Conv2D(4, 6, (3, 3), weight_attr=nn.initializer.XavierUniform(), bias_attr=nn.initializer.Normal()) y_var2 = conv2(x_var) @@ -1240,13 +1240,13 @@ def _global_bias_initializer(): def calculate_gain(nonlinearity, param=None): """ - Get the recommended ``gain`` value of some nonlinearity function. ``gain`` value can be used in some + Get the recommended ``gain`` value of some nonlinearity function. ``gain`` value can be used in some ``paddle.nn.initializer`` api to adjust the initialization value. Args: - nonlinearity(str): name of nonlinearity activation function. If it is a linear function, such as: + nonlinearity(str): name of nonlinearity activation function. If it is a linear function, such as: `linear/conv1d/conv2d/conv3d/conv1d_transpose/conv2d_transpose/conv3d_transpose` , 1.0 will be returned. - param(bool|int|float, optional): optional parameter for somme nonlinearity function. Now, it only applies to + param(bool|int|float, optional): optional parameter for somme nonlinearity function. Now, it only applies to 'leaky_relu'. Default: None, it will be calculated as 0.01 in the formula. Returns: diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py index 502a89ec36d..07159e19136 100644 --- a/python/paddle/fluid/input.py +++ b/python/paddle/fluid/input.py @@ -63,7 +63,7 @@ def one_hot(input, depth, allow_out_of_range=False): output: Out.shape = [4, 4] Out.data = [[0., 1., 0., 0.], - [0., 1., 0., 0.], + [0., 1., 0., 0.], [0., 0., 0., 0.], # This id is 5, which goes beyond depth, so set it all-zeros data. [1., 0., 0., 0.]] @@ -76,7 +76,7 @@ def one_hot(input, depth, allow_out_of_range=False): allow_out_of_range = False output: Throw an exception for Illegal value - The second dimension in X is 5, which is greater than depth. + The second dimension in X is 5, which is greater than depth. Allow_out_of_range =False means that does not allow the word id to exceed depth, so it throws an exception. @@ -84,7 +84,7 @@ def one_hot(input, depth, allow_out_of_range=False): Args: input(Variable): Tensor or LoDTensor with shape :math:`[N_1, N_2, ..., N_k]` , which contains at least one dimension. The data type is int32 or int64. - depth(int): An integer defining the depth of the one hot dimension. If input + depth(int): An integer defining the depth of the one hot dimension. If input is word id, depth is generally the dictionary size. allow_out_of_range(bool): A bool value indicating whether the input indices could be out of range :math:`[0, depth)` . When input indices are @@ -143,14 +143,14 @@ def embedding(input, r""" :api_attr: Static Graph - The operator is used to lookup embeddings vector of ids provided by :attr:`input` . + The operator is used to lookup embeddings vector of ids provided by :attr:`input` . It automatically constructs a 2D embedding matrix based on the input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` . The shape of output Tensor is generated by appending an emb_size dimension to the last dimension of the input Tensor shape. - **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` , + **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` , otherwise the program will throw an exception and exit. .. 
code-block:: text @@ -168,12 +168,12 @@ def embedding(input, [[0.345249859, 0.124939536, ..., 0.194353745], [0.945345345, 0.435394634, ..., 0.435345365]], - + [[0.945345345, 0.435394634, ..., 0.435345365], [0.0, 0.0, ..., 0.0 ]]] # padding data The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 It will pad all-zero data when ids is 127. - + Case 2: input is a LoDTensor with 1-level LoD. padding_idx = 0 @@ -198,19 +198,19 @@ def embedding(input, size(tuple|list): The shape of lookup table parameter. It should have two elements which indicates the size of the dictionary of embeddings and the size of each embedding vector respectively. is_sparse(bool): The flag indicating whether to use sparse update. This parameter only - affects the performance of the backwards gradient update. It is recommended to set + affects the performance of the backwards gradient update. It is recommended to set True because sparse update is faster. But some optimizer does not support sparse update In these case, is_sparse must be False. Default: False. is_distributed(bool): Whether to store the embedding matrix in a distributed manner. Only used in multi-machine distributed CPU training. Default: False. - padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). + padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. If set None, it makes no effect to output. Default: None. param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. In addition, - user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. + user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs to be transformed into numpy format, and the shape of local word vector should be consistent with :attr:`size` . dtype(str): It refers to the data type of output Tensor. @@ -225,28 +225,28 @@ def embedding(input, import paddle import numpy as np paddle.enable_static() - + x = paddle.static.data(name="x", shape = [2, 4], dtype=np.int64) embedding = paddle.nn.Embedding(10, 3, weight_attr=paddle.nn.initializer.Constant(value=1.0)) adam = paddle.optimizer.SGD(parameters=[embedding.weight], learning_rate=0.01) output = embedding(x) m_output=paddle.mean(output) - + adam.minimize(m_output) - + place = paddle.CPUPlace() exe = paddle.static.Executor(place) exe.run(paddle.static.default_startup_program()) - + x = np.array([[7, 2, 4, 5],[4, 3, 2, 9]], dtype=np.int64) - + # x is a Numpy. # x.data = [[7, 2, 4, 5], [4, 3, 2, 9]] # x.shape = [2, 4] - + out, = exe.run(paddle.static.default_main_program(), feed={'x':x}, fetch_list=[output]) - + # out is a Numpy. # out.data = [[1., 1., 1.], # [1., 1., 1.], @@ -267,18 +267,18 @@ def embedding(input, import numpy as np x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64) - + # x is a Tensor. 
# x.data = [[3], [4], [5]] # x.shape = [3, 1] x = paddle.to_tensor(x_data, stop_gradient=False) - + # embedding weight shape = [10, 3] embedding = paddle.nn.Embedding(10, 3, sparse=True) - + # embedding weight data = [10, 3] w0 = np.full(shape=(10, 3), fill_value=2).astype(np.float32) - + # embedding.weight.shape = [10, 3] # embedding.weight.data = # [[2., 2., 2.], @@ -292,18 +292,18 @@ def embedding(input, # [2., 2., 2.], # [2., 2., 2.]] embedding.weight.set_value(w0) - + adam = paddle.optimizer.Adam( parameters=[embedding.weight], learning_rate=0.01) adam.clear_grad() - + # out is Tensor # out.shape: [3, 1, 3] # out.layout: NCHW # out.dtype: float # out.data: [2 2 2 2 2 2 2 2 2] out = embedding(x) - + out.backward() adam.step() diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 98d7fa6a037..299275be5e2 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -56,7 +56,7 @@ def run_check(): fluid.install_check.run_check() # If installed successfully, output may be - # Running Verify Fluid Program ... + # Running Verify Fluid Program ... # W0805 04:24:59.496919 35357 device_context.cc:268] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.2, Runtime API Version: 10.1 # W0805 04:24:59.505594 35357 device_context.cc:276] device: 0, cuDNN Version: 7.6. # Your Paddle Fluid works well on SINGLE GPU or CPU. diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 0f69949018d..567ab0c1a84 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -662,8 +662,8 @@ def _save_distributed_persistables(executor, dirname, main_program): @dygraph_not_support def save_persistables(executor, dirname, main_program=None, filename=None): """ - Save all persistable variables from :code:`main_program` to - the folder :code:`dirname` or file :code:`filename`. You can refer to + Save all persistable variables from :code:`main_program` to + the folder :code:`dirname` or file :code:`filename`. You can refer to :ref:`api_guide_model_save_reader_en` for more details. And then saves these persistables variables to the folder :code:`dirname` or file :code:`filename`. @@ -681,7 +681,7 @@ def save_persistables(executor, dirname, main_program=None, filename=None): dirname(str, optional): The saving directory path. When you need to save the parameter to the memory, set it to None. main_program(Program, optional): The program whose persistbale variables will - be saved. You can refer to + be saved. You can refer to :ref:`api_guide_Program_en` for more details. If it is None, the default main program will be used. @@ -1458,7 +1458,7 @@ def load_inference_model(dirname, Default: ``None``. params_filename(str, optional): It is only used for the case that all parameters were saved in a single binary file. One of the following: - - The name of file to load all parameters. + - The name of file to load all parameters. - When ``dirname`` is ``None``, it must be set to a string containing all the parameters. - If parameters were saved in separate files, set it as ``None``. Default: ``None``. @@ -1809,7 +1809,7 @@ def _legacy_save(param_dict, model_path, protocol=2): @static_only def save(program, model_path, protocol=4, **configs): """ - + This function save parameters, optimizer information and network description to model_path. The parameters contains all the trainable Tensor, will save to a file with suffix ".pdparams". 
@@ -1821,7 +1821,7 @@ def save(program, model_path, protocol=4, **configs): model_path(str): the file prefix to save the program. The format is "dirname/file_prefix". If file_prefix is empty str. A exception will be raised protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. Default: 4 - configs(dict, optional) : optional keyword arguments. + configs(dict, optional) : optional keyword arguments. Returns: None @@ -2131,7 +2131,7 @@ def load_program_state(model_path, var_list=None): state_dict(dict): the dict store Parameter and optimizer information Examples: - + .. code-block:: python import paddle diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 6275dff31ad..0b3b0d08303 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -42,7 +42,7 @@ __all__ = [ def select_output(input, outputs, mask): """ - **select_output** + **select_output** This API takes in one input and multiple outputs and an integer mask. It selects the output specified by the mask and copy the input to selected output. It is useful in control flow. @@ -92,7 +92,7 @@ def _select_input_infer_shape(first_shape, second_shape): def select_input(inputs, mask): """ **select_input** - + This API takes in multiple inputs and uses an integer mask to select one input to output. It is useful in control flow. @@ -334,7 +334,7 @@ def Print(input, print_tensor_layout (bool, optional): Print the tensor layout. Default: True. print_tensor_lod (bool, optional): Print the tensor lod. Default: True. print_phase (str): Which phase to displace, including 'forward', - 'backward' and 'both'. Default: 'both'. If set to 'backward', will + 'backward' and 'both'. Default: 'both'. If set to 'backward', will only print the gradients of input tensor; If set to 'both', will both print the input tensor itself and the gradients of input tensor. @@ -348,11 +348,11 @@ def Print(input, Examples: .. code-block:: python - + import paddle paddle.enable_static() - + x = paddle.full(shape=[2, 3], fill_value=3, dtype='int64') out = paddle.static.Print(x, message="The content of input layer:") @@ -576,7 +576,7 @@ class StaticRNN(object): hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') # use hidden to update prev rnn.update_memory(prev, hidden) - # mark hidden as output + # mark hidden as output rnn.step_output(hidden) # get StaticrNN final output result = rnn() @@ -1077,7 +1077,7 @@ def get_inputs_outputs_in_block(current_block, inner_inputs, inner_outputs, class While(object): """ :api_attr: Static Graph - + while loop control flow. Repeat while body until cond is False. Note: @@ -1097,7 +1097,7 @@ class While(object): Examples 1: .. 
code-block:: python - + import paddle.fluid as fluid import numpy as np @@ -1285,7 +1285,7 @@ def while_loop(cond, body, loop_vars, is_test=False, name=None): i = paddle.full(shape=[1], fill_value=0, dtype='int64') # loop counter ten = paddle.full(shape=[1], fill_value=10, dtype='int64') # loop length i, ten = paddle.static.nn.while_loop(cond, body, [i, ten]) - + exe = paddle.static.Executor(paddle.CPUPlace()) res = exe.run(main_program, feed={}, fetch_list=[i]) print(res) # [array([10])] @@ -1365,7 +1365,7 @@ def _deal_with_undefined_var(output_vars, loop_vars): (Variable, ) + support_ret_buildin_type) or o_var is None: return create_undefined_variable() if is_sequence(o_var): - """ + """ Create a complex container class inside the body of while, including Python list and python Dict """ return map_structure(lambda x: create_undefined_variable(), o_var) @@ -1618,8 +1618,8 @@ def array_write(x, i, array=None): Tensor or LoDTensor. Data type: float32, float64, int32, int64. i (Variable): 1-D Tensor with shape [1], which represents the position into which ``x`` is written. Data type: int64. - array (LoDTensorArray, optional): The LoDTensorArray into which ``x`` is written. - The default value is None, when a new LoDTensorArray will be created and returned + array (LoDTensorArray, optional): The LoDTensorArray into which ``x`` is written. + The default value is None, when a new LoDTensorArray will be created and returned as a result. Returns: @@ -1651,8 +1651,8 @@ def array_write(x, i, array=None): # the output is 2-D Tensor with shape [3,2], which is tmp above. # dtype is the corresponding C++ data type, which may vary in different environments. - # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, - # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, + # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, + # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, # and '__int64' on Windows. They both represent 64-bit integer variables. """ @@ -1707,7 +1707,7 @@ def array_write(x, i, array=None): def create_array(dtype, initialized_list=None): """ This OP creates an LOD_TENSOR_ARRAY. It is used as - the input of :ref:`api_fluid_layers_array_read` and + the input of :ref:`api_fluid_layers_array_read` and :ref:`api_fluid_layers_array_write`. Also it can be used with :ref:`api_fluid_layers_While` to create RNN network. @@ -1824,7 +1824,7 @@ def less_equal(x, y, cond=None, name=None): This OP returns the truth value of :math:`x <= y` elementwise, which is equivalent function to the overloaded operator `<=`. Args: - x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. + x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. y(Variable): Second input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. cond(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of *less_equal*. if cond is None, a new Varibale will be created to store the result. @@ -1879,7 +1879,7 @@ def greater_than(x, y, cond=None, name=None): This OP returns the truth value of :math:`x > y` elementwise, which is equivalent function to the overloaded operator `>`. Args: - x(Variable): First input to compare which is N-D tensor. 
The input data type should be float32, float64, int32, int64. + x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. y(Variable): Second input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. cond(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of *greater_than*. if cond is None, a new Varibale will be created to store the result. @@ -1936,7 +1936,7 @@ def greater_equal(x, y, cond=None, name=None): This OP returns the truth value of :math:`x >= y` elementwise, which is equivalent function to the overloaded operator `>=`. Args: - x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. + x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. y(Variable): Second input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. cond(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of *greater_equal*. if cond is None, a new Varibale will be created to store the result. @@ -1989,7 +1989,7 @@ def equal(x, y, cond=None, name=None): Args: x(Variable): Tensor, data type is float32, float64, int32, int64. y(Variable): Tensor, data type is float32, float64, int32, int64. - cond(Variable, optional): Optional output which can be any created + cond(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of *equal*. if cond is None, a new Varibale will be created to store the result. name(str, optional): The default value is None. Normally there is no need for @@ -2045,7 +2045,7 @@ def not_equal(x, y, cond=None, name=None): This OP returns the truth value of :math:`x != y` elementwise, which is equivalent function to the overloaded operator `!=`. Args: - x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. + x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. y(Variable): Second input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. cond(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of *not_equal*. if cond is None, a new Varibale will be created to store the result. @@ -2059,7 +2059,7 @@ def not_equal(x, y, cond=None, name=None): .. code-block:: python import paddle.fluid as fluid - + label = fluid.layers.data(name='label', shape=[1], dtype='int64') limit = fluid.layers.fill_constant(shape=[1], value=1, dtype='int64') out = fluid.layers.not_equal(x=label, y=limit) @@ -2087,9 +2087,9 @@ def not_equal(x, y, cond=None, name=None): def array_read(array, i): """ - This OP is used to read data at the specified position from the input array + This OP is used to read data at the specified position from the input array :ref:`api_fluid_LoDTensorArray` . ``array`` is the input array and ``i`` - is the specified read position. This OP is often used together with + is the specified read position. This OP is often used together with :ref:`api_fluid_layers_array_write` OP. Case 1: @@ -2142,8 +2142,8 @@ def array_read(array, i): # the output is 2-D Tensor with shape [3,2]. 
# dtype is the corresponding C++ data type, which may vary in different environments. - # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, - # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, + # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, + # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, # and '__int64' on Windows. They both represent 64-bit integer variables. """ if _non_static_mode(): @@ -2219,7 +2219,7 @@ def shrink_memory(x, i, table): def array_length(array): """ This OP is used to get the length of the input array :ref:`api_fluid_LoDTensorArray` . - It can be used together with :ref:`api_fluid_layers_array_read` , :ref:`api_fluid_layers_array_write` , + It can be used together with :ref:`api_fluid_layers_array_read` , :ref:`api_fluid_layers_array_write` , :ref:`api_fluid_layers_While` OP to traverse, read and write LoDTensorArray. Args: @@ -2253,12 +2253,12 @@ def array_length(array): # shape: [1,] # dtype: l # data: 11, - + # 1-D Tensor with shape [1], whose value is 11. It means that the length of LoDTensorArray # is 11. # dtype is the corresponding C++ data type, which may vary in different environments. - # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, - # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, + # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, + # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, # and '__int64' on Windows. They both represent 64-bit integer variables. """ @@ -2500,8 +2500,8 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): or both return ``None`` if user doens't like to return anything. A nest structure of tensors in PaddlePaddle is tensor(s), or tuple of tensors, or list of tensors. - - Note: + + Note: 1. The tuples or lists returned by ``true_fn`` and ``false_fn`` must have the same shape because of dataflow model of PaddlePaddle while the tensors in the tuples or the lists can have different shapes. @@ -2509,7 +2509,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): 2. This API could be used under both static mode or dygraph mode. If it is in dygraph mode, the API only runs one branch based on condition. - 3. If it is in static mode, any tensors or operations created outside + 3. If it is in static mode, any tensors or operations created outside or inside of ``true_fn`` and ``false_fn`` will be in net building regardless of which branch is selected at runtime. This has frequently surprised users who expected a lazy semantics. For example: @@ -2538,9 +2538,9 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): name(str, optional): The default value is ``None`` . Normally users don't have to set this parameter. For more information, please refer to :ref:`api_guide_Name` . - return_names(sequence of string, optional): The default value is ``None`` . - Normally users don't have to set this parameters. A sequence of strings - to represents the name of returned vars. The structure of sequence must + return_names(sequence of string, optional): The default value is ``None`` . + Normally users don't have to set this parameters. A sequence of strings + to represents the name of returned vars. 
The structure of sequence must be same with return values of true_fn and false_fn. Returns: @@ -2586,7 +2586,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): # ret is a tuple containing 2 tensors # ret[0] = [[1 1]] # ret[1] = [[ True True True] - # [ True True True]] + # [ True True True]] """ if _non_static_mode(): @@ -2655,7 +2655,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): if return_names is None: return_names = ["no name"] * len(to_sequence(true_output)) else: - """ + """ dy2static will set the return_names and expand the return values to UndefinedVar. """ true_output, false_output = expand_undefined_var( @@ -2862,11 +2862,11 @@ class Switch(object): """ :api_attr: Static Graph - This class is used to implement Switch branch control function. - Switch branch contains several case branches and one default branch. - Switch control flow checks whether the case branch conditions are satisfied in turn, - and only executes the statement after the first case branch that satisfies the conditions. - If there is no case branch that satisfies the condition, + This class is used to implement Switch branch control function. + Switch branch contains several case branches and one default branch. + Switch control flow checks whether the case branch conditions are satisfied in turn, + and only executes the statement after the first case branch that satisfies the conditions. + If there is no case branch that satisfies the condition, only the statement following the default branch is executed. Note: @@ -2875,7 +2875,7 @@ class Switch(object): Member Functions: case(condition): The case branch of Switch whose parameter cond is a scalar Variable of bool type. Only if the cond of the current case branch is True and the cond of the previous case branch is False, the statement after the case branch will be executed, and the statement after the case branch will not be executed. - + default(): The default branch of Switch. When cond of all case branches is False, the statement after default branch is executed. Case and default functions can only be used inside the scope of Switch, as shown below: @@ -2897,7 +2897,7 @@ class Switch(object): Examples: .. code-block:: python - + import paddle.fluid as fluid lr = fluid.layers.create_global_var( @@ -3031,7 +3031,7 @@ class IfElse(object): IfElse OP is different from other OPs in usage, which may cause some users confusion. Here is a simple example to illustrate this OP. .. code-block:: python - + # The following code completes the function: subtract 10 from the data greater than 0 in x, add 10 to the data less than 0 in x, and sum all the data. import numpy as np import paddle.fluid as fluid @@ -3041,7 +3041,7 @@ class IfElse(object): x_d = np.array([[3], [1], [-2], [-3]]).astype(np.float32) y_d = np.zeros((4, 1)).astype(np.float32) - + # Compare the size of x, y pairs of elements, output cond, cond is shape [4, 1], data type bool 2-D tensor. # Based on the input data x_d, y_d, it can be inferred that the data in cond are [[true], [true], [false], [false]]. cond = fluid.layers.greater_than(x, y) @@ -3060,7 +3060,7 @@ class IfElse(object): ie.output(out_1) # According to cond condition, the data processed in the two blocks are merged. The output here is output, the type is List, and the element type in List is Variable. 
- output = ie() # [array([[-7.], [-9.], [ 8.], [ 7.]], dtype=float32)] + output = ie() # [array([[-7.], [-9.], [ 8.], [ 7.]], dtype=float32)] # Get the first Variable in the output List and add all elements. out = fluid.layers.reduce_sum(output[0]) @@ -3070,7 +3070,7 @@ class IfElse(object): res = exe.run(fluid.default_main_program(), feed={"x":x_d, "y":y_d}, fetch_list=[out]) print(res) - # [array([-1.], dtype=float32)] + # [array([-1.], dtype=float32)] Args: cond (Variable): cond is a 2-D Tensor with shape [N, 1] and data type bool, representing the corresponding execution conditions of N input data. The data type is bool. @@ -3081,7 +3081,7 @@ class IfElse(object): Internal Functions: The block is constructed by calling the ``with ie. true_block()`` function in the object, and the computational logic under condition true is put into the block. If no corresponding block is constructed, the input data in the corresponding conditional dimension is unchanged. - + The block is constructed by calling the ``with ie. false_block()`` function in the object, and the computational logic under condition false is put into the block. If no corresponding block is constructed, the input data in the corresponding conditional dimension is unchanged. ``Out = ie. input (x)`` will take out the data of the corresponding conditional dimension in X and put it into out, supporting the internal processing of multiple inputs in block. @@ -3785,7 +3785,7 @@ class DynamicRNN(object): Returns: None - + Raises: ValueError: When :code:`update_memory()` is called outside :code:`block()` . TypeError: When :code:`ex_mem` or :code:`new_mem` is not a Variable. @@ -4014,7 +4014,7 @@ def reorder_lod_tensor_by_rank(x, rank_table): Args: x(${x_type}): ${x_comment}. rank_table(${rank_table_type}): ${rank_table_comment}. - + Returns: out(${out_type}): ${out_comment}. diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3300a9fc492..5bec744542d 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -140,9 +140,9 @@ def retinanet_target_assign(bbox_pred, :math:`[xmin, ymin]` is the left top coordinate of the anchor box, :math:`[xmax, ymax]` is the right bottom coordinate of the anchor box. The data type of :attr:`anchor_box` is float32 or float64. Please refer - to the OP :ref:`api_fluid_layers_anchor_generator` + to the OP :ref:`api_fluid_layers_anchor_generator` for the generation of :attr:`anchor_box`. - anchor_var(Variable): A 2-D Tensor with shape :math:`[M,4]` represents the expanded + anchor_var(Variable): A 2-D Tensor with shape :math:`[M,4]` represents the expanded factors of anchor locations used in loss function. :math:`M` is number of all anchors of one image, each anchor possesses a 4-vector expanded factor. The data type of :attr:`anchor_var` is float32 or float64. Please refer @@ -181,7 +181,7 @@ def retinanet_target_assign(bbox_pred, Returns: A tuple with 6 Variables: - + **predict_scores** (Variable): A 2-D Tensor with shape :math:`[F+B, C]` represents category prediction belonging to positive and negative samples. :math:`F` is the number of positive samples in a mini-batch, :math:`B` is the number @@ -357,7 +357,7 @@ def rpn_target_assign(bbox_pred, if the input is image feature map, they are close to the origin of the coordinate system. [xmax, ymax] is the right bottom coordinate of the anchor box. The data type can be float32 or float64. 
- anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded + anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded variances of anchors. The data type can be float32 or float64. gt_boxes (Variable): The ground-truth bounding boxes (bboxes) are a 2D LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth @@ -382,7 +382,7 @@ def rpn_target_assign(bbox_pred, Returns: tuple: A tuple(predicted_scores, predicted_location, target_label, - target_bbox, bbox_inside_weight) is returned. The predicted_scores + target_bbox, bbox_inside_weight) is returned. The predicted_scores and predicted_location is the predicted result of the RPN. The target_label and target_bbox is the ground truth, respectively. The predicted_location is a 2D Tensor with shape @@ -484,12 +484,12 @@ def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25): `Focal Loss `_ is used to address the foreground-background class imbalance existed on the training phase of many computer vision tasks. This OP computes the sigmoid value for each element in the input tensor :attr:`x`, after which focal loss is - measured between the sigmoid value and target label. + measured between the sigmoid value and target label. The focal loss is given as followed: .. math:: - + \\mathop{loss_{i,\\,j}}\\limits_{i\\in\\mathbb{[0,\\,N-1]},\\,j\\in\\mathbb{[0,\\,C-1]}}=\\left\\{ \\begin{array}{rcl} - \\frac{1}{fg\_num} * \\alpha * {(1 - \\sigma(x_{i,\\,j}))}^{\\gamma} * \\log(\\sigma(x_{i,\\,j})) & & {(j +1) = label_{i,\\,0}} \\\\ @@ -498,7 +498,7 @@ def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25): We know that - + .. math:: \\sigma(x_j) = \\frac{1}{1 + \\exp(-x_j)} @@ -524,7 +524,7 @@ def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25): is set to 0.25. Returns: - Variable(the data type is float32 or float64): + Variable(the data type is float32 or float64): A 2-D tensor with shape :math:`[N, C]`, which is the focal loss of each element in the input tensor :attr:`x`. @@ -533,22 +533,22 @@ def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25): import numpy as np import paddle.fluid as fluid - + num_classes = 10 # exclude background image_width = 16 image_height = 16 batch_size = 32 max_iter = 20 - - + + def gen_train_data(): x_data = np.random.uniform(0, 255, (batch_size, 3, image_height, image_width)).astype('float64') label_data = np.random.randint(0, num_classes, (batch_size, 1)).astype('int32') return {"x": x_data, "label": label_data} - - + + def get_focal_loss(pred, label, fg_num, num_classes): pred = fluid.layers.reshape(pred, [-1, num_classes]) label = fluid.layers.reshape(label, [-1, 1]) @@ -557,8 +557,8 @@ def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25): pred, label, fg_num, gamma=2.0, alpha=0.25) loss = fluid.layers.reduce_sum(loss) return loss - - + + def build_model(mode='train'): x = fluid.data(name="x", shape=[-1, 3, -1, -1], dtype='float64') output = fluid.layers.pool2d(input=x, pool_type='avg', global_pooling=True) @@ -586,8 +586,8 @@ def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25): # output of the final fc layer should be connected to a sigmoid layer. pred = fluid.layers.sigmoid(output) return pred - - + + loss = build_model('train') moment_optimizer = fluid.optimizer.MomentumOptimizer( learning_rate=0.001, momentum=0.9) @@ -681,7 +681,7 @@ def detection_output(loc, Returns: A tuple with two Variables: (Out, Index) if return_index is True, - otherwise, a tuple with one Variable(Out) is returned. 
+ otherwise, a tuple with one Variable(Out) is returned. Out (Variable): The detection outputs is a LoDTensor with shape [No, 6]. Data type is the same as input (loc). Each row has six values: @@ -836,7 +836,7 @@ def box_coder(prior_box, **Box Coder Layer** Encode/Decode the target bounding box with the priorbox information. - + The Encoding schema described below: .. math:: @@ -845,78 +845,78 @@ def box_coder(prior_box, oy = (ty - py) / ph / pyv - ow = \log(\abs(tw / pw)) / pwv + ow = \log(\abs(tw / pw)) / pwv - oh = \log(\abs(th / ph)) / phv + oh = \log(\abs(th / ph)) / phv The Decoding schema described below: - + .. math:: - + ox = (pw * pxv * tx * + px) - tw / 2 oy = (ph * pyv * ty * + py) - th / 2 ow = \exp(pwv * tw) * pw + tw / 2 - oh = \exp(phv * th) * ph + th / 2 + oh = \exp(phv * th) * ph + th / 2 - where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, - width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote - the priorbox's (anchor) center coordinates, width and height. `pxv`, - `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, - `ow`, `oh` denote the encoded/decoded coordinates, width and height. + where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, + width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote + the priorbox's (anchor) center coordinates, width and height. `pxv`, + `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, + `ow`, `oh` denote the encoded/decoded coordinates, width and height. - During Box Decoding, two modes for broadcast are supported. Say target - box has shape [N, M, 4], and the shape of prior box can be [N, 4] or - [M, 4]. Then prior box will broadcast to target box along the - assigned axis. + During Box Decoding, two modes for broadcast are supported. Say target + box has shape [N, M, 4], and the shape of prior box can be [N, 4] or + [M, 4]. Then prior box will broadcast to target box along the + assigned axis. Args: - prior_box(Variable): Box list prior_box is a 2-D Tensor with shape + prior_box(Variable): Box list prior_box is a 2-D Tensor with shape [M, 4] holds M boxes and data type is float32 or float64. Each box - is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the + is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate of the anchor box, if the input is image feature - map, they are close to the origin of the coordinate system. - [xmax, ymax] is the right bottom coordinate of the anchor box. - prior_box_var(List|Variable|None): prior_box_var supports three types - of input. One is variable with shape [M, 4] which holds M group and - data type is float32 or float64. The second is list consist of - 4 elements shared by all boxes and data type is float32 or float64. - Other is None and not involved in calculation. - target_box(Variable): This input can be a 2-D LoDTensor with shape - [N, 4] when code_type is 'encode_center_size'. This input also can - be a 3-D Tensor with shape [N, M, 4] when code_type is - 'decode_center_size'. Each box is represented as - [xmin, ymin, xmax, ymax]. The data type is float32 or float64. - This tensor can contain LoD information to represent a batch of inputs. + map, they are close to the origin of the coordinate system. + [xmax, ymax] is the right bottom coordinate of the anchor box. + prior_box_var(List|Variable|None): prior_box_var supports three types + of input. One is variable with shape [M, 4] which holds M group and + data type is float32 or float64. 
The second is list consist of + 4 elements shared by all boxes and data type is float32 or float64. + Other is None and not involved in calculation. + target_box(Variable): This input can be a 2-D LoDTensor with shape + [N, 4] when code_type is 'encode_center_size'. This input also can + be a 3-D Tensor with shape [N, M, 4] when code_type is + 'decode_center_size'. Each box is represented as + [xmin, ymin, xmax, ymax]. The data type is float32 or float64. + This tensor can contain LoD information to represent a batch of inputs. code_type(str): The code type used with the target box. It can be - `encode_center_size` or `decode_center_size`. `encode_center_size` + `encode_center_size` or `decode_center_size`. `encode_center_size` by default. box_normalized(bool): Whether treat the priorbox as a normalized box. Set true by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - axis(int): Which axis in PriorBox to broadcast for box decode, - for example, if axis is 0 and TargetBox has shape [N, M, 4] and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + axis(int): Which axis in PriorBox to broadcast for box decode, + for example, if axis is 0 and TargetBox has shape [N, M, 4] and PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4] - for decoding. It is only valid when code type is - `decode_center_size`. Set 0 by default. + for decoding. It is only valid when code type is + `decode_center_size`. Set 0 by default. Returns: Variable: - output_box(Variable): When code_type is 'encode_center_size', the - output tensor of box_coder_op with shape [N, M, 4] representing the - result of N target boxes encoded with M Prior boxes and variances. - When code_type is 'decode_center_size', N represents the batch size + output_box(Variable): When code_type is 'encode_center_size', the + output tensor of box_coder_op with shape [N, M, 4] representing the + result of N target boxes encoded with M Prior boxes and variances. + When code_type is 'decode_center_size', N represents the batch size and M represents the number of decoded boxes. Examples: - + .. code-block:: python - + import paddle.fluid as fluid import paddle paddle.enable_static() @@ -1002,7 +1002,7 @@ def polygon_box_transform(input, name=None): Examples: .. code-block:: python - + import paddle.fluid as fluid input = fluid.data(name='input', shape=[4, 10, 5, 5], dtype='float32') out = fluid.layers.polygon_box_transform(input) @@ -1038,23 +1038,23 @@ def yolov3_loss(x, ${comment} Args: - x (Variable): ${x_comment}The data type is float32 or float64. + x (Variable): ${x_comment}The data type is float32 or float64. gt_box (Variable): groud truth boxes, should be in shape of [N, B, 4], - in the third dimension, x, y, w, h should be stored. + in the third dimension, x, y, w, h should be stored. x,y is the center coordinate of boxes, w, h are the - width and height, x, y, w, h should be divided by + width and height, x, y, w, h should be divided by input image height to scale to [0, 1]. - N is the batch number and B is the max box number in - an image.The data type is float32 or float64. + N is the batch number and B is the max box number in + an image.The data type is float32 or float64. gt_label (Variable): class id of ground truth boxes, should be in shape - of [N, B].The data type is int32. + of [N, B].The data type is int32. 
anchors (list|tuple): ${anchors_comment} anchor_mask (list|tuple): ${anchor_mask_comment} class_num (int): ${class_num_comment} ignore_thresh (float): ${ignore_thresh_comment} downsample_ratio (int): ${downsample_ratio_comment} - name (string): The default value is None. Normally there is no need - for user to set this property. For more information, + name (string): The default value is None. Normally there is no need + for user to set this property. For more information, please refer to :ref:`api_guide_Name` gt_score (Variable): mixup score of ground truth boxes, should be in shape of [N, B]. Default None. @@ -1087,7 +1087,7 @@ def yolov3_loss(x, anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] anchor_mask = [0, 1, 2] loss = fluid.layers.yolov3_loss(x=x, gt_box=gt_box, gt_label=gt_label, - gt_score=gt_score, anchors=anchors, + gt_score=gt_score, anchors=anchors, anchor_mask=anchor_mask, class_num=80, ignore_thresh=0.7, downsample_ratio=32) """ @@ -1174,23 +1174,23 @@ def yolo_box(x, ${comment} Args: - x (Variable): ${x_comment} The data type is float32 or float64. - img_size (Variable): ${img_size_comment} The data type is int32. + x (Variable): ${x_comment} The data type is float32 or float64. + img_size (Variable): ${img_size_comment} The data type is int32. anchors (list|tuple): ${anchors_comment} class_num (int): ${class_num_comment} conf_thresh (float): ${conf_thresh_comment} downsample_ratio (int): ${downsample_ratio_comment} clip_bbox (bool): ${clip_bbox_comment} scale_x_y (float): ${scale_x_y_comment} - name (string): The default value is None. Normally there is no need - for user to set this property. For more information, + name (string): The default value is None. Normally there is no need + for user to set this property. For more information, please refer to :ref:`api_guide_Name` iou_aware (bool): ${iou_aware_comment} iou_aware_factor (float): ${iou_aware_factor_comment} Returns: Variable: A 3-D tensor with shape [N, M, 4], the coordinates of boxes, - and a 3-D tensor with shape [N, M, :attr:`class_num`], the classification + and a 3-D tensor with shape [N, M, :attr:`class_num`], the classification scores of boxes. Raises: @@ -1209,7 +1209,7 @@ def yolo_box(x, x = fluid.data(name='x', shape=[None, 255, 13, 13], dtype='float32') img_size = fluid.data(name='img_size',shape=[None, 2],dtype='int64') anchors = [10, 13, 16, 30, 33, 23] - boxes,scores = fluid.layers.yolo_box(x=x, img_size=img_size, class_num=80, anchors=anchors, + boxes,scores = fluid.layers.yolo_box(x=x, img_size=img_size, class_num=80, anchors=anchors, conf_thresh=0.01, downsample_ratio=32) """ helper = LayerHelper('yolo_box', **locals()) @@ -1375,23 +1375,23 @@ def bipartite_match(dist_matrix, Args: dist_matrix(Variable): This input is a 2-D LoDTensor with shape - [K, M]. The data type is float32 or float64. It is pair-wise - distance matrix between the entities represented by each row and - each column. For example, assumed one entity is A with shape [K], - another entity is B with shape [M]. The dist_matrix[i][j] is the - distance between A[i] and B[j]. The bigger the distance is, the - better matching the pairs are. NOTE: This tensor can contain LoD - information to represent a batch of inputs. One instance of this + [K, M]. The data type is float32 or float64. It is pair-wise + distance matrix between the entities represented by each row and + each column. For example, assumed one entity is A with shape [K], + another entity is B with shape [M]. 
The dist_matrix[i][j] is the + distance between A[i] and B[j]. The bigger the distance is, the + better matching the pairs are. NOTE: This tensor can contain LoD + information to represent a batch of inputs. One instance of this batch can contain different numbers of entities. match_type(str, optional): The type of matching method, should be 'bipartite' or 'per_prediction'. None ('bipartite') by default. dist_threshold(float32, optional): If `match_type` is 'per_prediction', this threshold is to determine the extra matching bboxes based on the maximum distance, 0.5 by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - + Returns: Tuple: @@ -1872,16 +1872,16 @@ def prior_box( place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - + # prepare a batch of data input_data = np.random.rand(1,3,6,9).astype("float32") image_data = np.random.rand(1,3,9,12).astype("float32") - + box_out, var_out = exe.run(fluid.default_main_program(), feed={"input":input_data,"image":image_data}, fetch_list=[box,var], return_numpy=True) - + # print(box_out.shape) # (6, 9, 1, 4) # print(var_out.shape) @@ -1982,16 +1982,16 @@ def density_prior_box(input, name=None): r""" - This op generates density prior boxes for SSD(Single Shot MultiBox Detector) - algorithm. Each position of the input produce N prior boxes, N is - determined by the count of densities, fixed_sizes and fixed_ratios. - Boxes center at grid points around each input position is generated by - this operator, and the grid points is determined by densities and - the count of density prior box is determined by fixed_sizes and fixed_ratios. + This op generates density prior boxes for SSD(Single Shot MultiBox Detector) + algorithm. Each position of the input produce N prior boxes, N is + determined by the count of densities, fixed_sizes and fixed_ratios. + Boxes center at grid points around each input position is generated by + this operator, and the grid points is determined by densities and + the count of density prior box is determined by fixed_sizes and fixed_ratios. Obviously, the number of fixed_sizes is equal to the number of densities. - + For densities_i in densities: - + .. math:: N\_density_prior\_box = SUM(N\_fixed\_ratios * densities\_i^2) @@ -2002,11 +2002,11 @@ def density_prior_box(input, input(Variable): 4-D tensor(NCHW), the data type should be float32 of float64. image(Variable): 4-D tensor(NCHW), the input image data of PriorBoxOp, the data type should be float32 or float64. the layout is NCHW. - densities(list|tuple|None): The densities of generated density prior - boxes, this attribute should be a list or tuple of integers. + densities(list|tuple|None): The densities of generated density prior + boxes, this attribute should be a list or tuple of integers. Default: None. fixed_sizes(list|tuple|None): The fixed sizes of generated density - prior boxes, this attribute should a list or tuple of same + prior boxes, this attribute should a list or tuple of same length with :attr:`densities`. Default: None. 
fixed_ratios(list|tuple|None): The fixed ratios of generated density prior boxes, if this attribute is not set and :attr:`densities` @@ -2023,7 +2023,7 @@ def density_prior_box(input, flatten_to_2d(bool): Whether to flatten output prior boxes and variance to 2D shape, the second dim is 4. Default: False. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` - + Returns: Tuple: A tuple with two Variable (boxes, variances) @@ -2063,7 +2063,7 @@ def density_prior_box(input, place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - + # prepare a batch of data input_data = np.random.rand(1,3,6,9).astype("float32") image_data = np.random.rand(1,3,9,12).astype("float32") @@ -2252,7 +2252,7 @@ def multi_box_head(inputs, is the number of prior boxes. Data type is the same as input. mbox_conf (Variable): The predicted boxes' confidence of the inputs. - The layout is [N, num_priors, C], where ``N`` and ``num_priors`` + The layout is [N, num_priors, C], where ``N`` and ``num_priors`` has the same meaning as above. C is the number of Classes. Data type is the same as input. @@ -2466,28 +2466,28 @@ def anchor_generator(input, input(Variable): 4-D Tensor with shape [N,C,H,W]. The input feature map. anchor_sizes(float32|list|tuple, optional): The anchor sizes of generated anchors, given in absolute pixels e.g. [64., 128., 256., 512.]. - For instance, the anchor size of 64 means the area of this anchor + For instance, the anchor size of 64 means the area of this anchor equals to 64**2. None by default. - aspect_ratios(float32|list|tuple, optional): The height / width ratios + aspect_ratios(float32|list|tuple, optional): The height / width ratios of generated anchors, e.g. [0.5, 1.0, 2.0]. None by default. - variance(list|tuple, optional): The variances to be used in box - regression deltas. The data type is float32, [0.1, 0.1, 0.2, 0.2] by + variance(list|tuple, optional): The variances to be used in box + regression deltas. The data type is float32, [0.1, 0.1, 0.2, 0.2] by default. stride(list|tuple, optional): The anchors stride across width and height. The data type is float32. e.g. [16.0, 16.0]. None by default. offset(float32, optional): Prior boxes center offset. 0.5 by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and None - by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None + by default. Returns: Tuple: Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. H is the height of input, W is the width of input, - num_anchors is the box count of each position. + num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. - + Variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. H is the height of input, W is the width of input @@ -2563,26 +2563,26 @@ def roi_perspective_transform(input, """ **The** `rois` **of this op should be a LoDTensor.** - ROI perspective transform op applies perspective transform to map each roi into an + ROI perspective transform op applies perspective transform to map each roi into an rectangular region. Perspective transform is a type of transformation in linear algebra. 
Parameters: - input (Variable): 4-D Tensor, input of ROIPerspectiveTransformOp. The format of + input (Variable): 4-D Tensor, input of ROIPerspectiveTransformOp. The format of input tensor is NCHW. Where N is batch size, C is the number of input channels, H is the height of the feature, and W is the width of the feature. The data type is float32. - rois (Variable): 2-D LoDTensor, ROIs (Regions of Interest) to be transformed. - It should be a 2-D LoDTensor of shape (num_rois, 8). Given as - [[x1, y1, x2, y2, x3, y3, x4, y4], ...], (x1, y1) is the - top left coordinates, and (x2, y2) is the top right - coordinates, and (x3, y3) is the bottom right coordinates, + rois (Variable): 2-D LoDTensor, ROIs (Regions of Interest) to be transformed. + It should be a 2-D LoDTensor of shape (num_rois, 8). Given as + [[x1, y1, x2, y2, x3, y3, x4, y4], ...], (x1, y1) is the + top left coordinates, and (x2, y2) is the top right + coordinates, and (x3, y3) is the bottom right coordinates, and (x4, y4) is the bottom left coordinates. The data type is the - same as `input` + same as `input` transformed_height (int): The height of transformed output. transformed_width (int): The width of transformed output. spatial_scale (float): Spatial scale factor to scale ROI coords. Default: 1.0 - name(str, optional): The default value is None. - Normally there is no need for user to set this property. + name(str, optional): The default value is None. + Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: @@ -2831,8 +2831,8 @@ def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, gt_segm.append(np.array(polys).reshape(-1, 2)) gt_masks.append(gt_segm) batch_masks.append(gt_masks) - - + + place = fluid.CPUPlace() feeder = fluid.DataFeeder(place=place, feed_list=feeds) feeder.feed(batch_masks) @@ -2962,7 +2962,7 @@ def generate_proposals(scores, **Generate proposal Faster-RCNN** This operation proposes RoIs according to each box with their - probability to be a foreground object and + probability to be a foreground object and the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals could be used to train detection net. @@ -2971,9 +2971,9 @@ def generate_proposals(scores, 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) - 2. Calculate box locations as proposals candidates. + 2. Calculate box locations as proposals candidates. 3. Clip boxes to image - 4. Remove predicted boxes with small area. + 4. Remove predicted boxes with small area. 5. Apply NMS to get final proposals as output. Args: @@ -2985,8 +2985,8 @@ def generate_proposals(scores, represents the difference between predicted box location and anchor location. The data type must be float32. im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin - image information for N batch. Height and width are the input sizes - and scale is the ratio of network input size and original size. + image information for N batch. Height and width are the input sizes + and scale is the ratio of network input size and original size. The data type can be float32 or float64. anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, @@ -3004,13 +3004,13 @@ def generate_proposals(scores, width < min_size. The data type must be float32. `0.1` by default. 
eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, `adaptive_threshold = adaptive_threshold * eta` in each iteration. - return_rois_num(bool): When setting True, it will return a 1D Tensor with shape [N, ] that includes Rois's + return_rois_num(bool): When setting True, it will return a 1D Tensor with shape [N, ] that includes Rois's num of each image in one batch. The N is the image's num. For example, the tensor has values [4,5] that represents - the first image has 4 Rois, the second image has 5 Rois. It only used in rcnn model. - 'False' by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + the first image has 4 Rois, the second image has 5 Rois. It only used in rcnn model. + 'False' by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: tuple: @@ -3021,7 +3021,7 @@ def generate_proposals(scores, Examples: .. code-block:: python - + import paddle.fluid as fluid import paddle paddle.enable_static() @@ -3050,19 +3050,19 @@ def generate_proposals(scores, def box_clip(input, im_info, name=None): """ - + Clip the box into the size given by im_info For each input box, The formula is given as follows: - + .. code-block:: text xmin = max(min(xmin, im_w - 1), 0) - ymin = max(min(ymin, im_h - 1), 0) + ymin = max(min(ymin, im_h - 1), 0) xmax = max(min(xmax, im_w - 1), 0) ymax = max(min(ymax, im_h - 1), 0) - + where im_w and im_h are computed from im_info: - + .. code-block:: text im_h = round(height / scale) @@ -3071,24 +3071,24 @@ def box_clip(input, im_info, name=None): Args: input(Variable): The input Tensor with shape :math:`[N_1, N_2, ..., N_k, 4]`, the last dimension is 4 and data type is float32 or float64. - im_info(Variable): The 2-D Tensor with shape [N, 3] with layout - (height, width, scale) representing the information of image. + im_info(Variable): The 2-D Tensor with shape [N, 3] with layout + (height, width, scale) representing the information of image. Height and width are the input sizes and scale is the ratio of network input size and original size. The data type is float32 or float64. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Returns: Variable: - output(Variable): The clipped tensor with data type float32 or float64. + output(Variable): The clipped tensor with data type float32 or float64. The shape is same as input. - + Examples: .. code-block:: python - + import paddle.fluid as fluid import paddle paddle.enable_static() @@ -3123,7 +3123,7 @@ def retinanet_detection_output(bboxes, """ **Detection Output Layer for the detector RetinaNet.** - In the detector `RetinaNet `_ , many + In the detector `RetinaNet `_ , many `FPN `_ levels output the category and location predictions, this OP is to get the detection results by performing following steps: @@ -3131,7 +3131,7 @@ def retinanet_detection_output(bboxes, 1. For each FPN level, decode box predictions according to the anchor boxes from at most :attr:`nms_top_k` top-scoring predictions after thresholding detector confidence at :attr:`score_threshold`. - 2. Merge top predictions from all levels and apply multi-class non + 2. 
Merge top predictions from all levels and apply multi-class non maximum suppression (NMS) on them to get the final detections. Args: @@ -3167,7 +3167,7 @@ def retinanet_detection_output(bboxes, keep_top_k(int): Number of total bounding boxes to be kept per image after NMS step. Default value is set to 100, -1 means keeping all bounding boxes after NMS step. - nms_threshold(float): The Intersection-over-Union(IoU) threshold used to + nms_threshold(float): The Intersection-over-Union(IoU) threshold used to filter out boxes in NMS. nms_eta(float): The parameter for adjusting :attr:`nms_threshold` in NMS. Default value is set to 1., which represents the value of @@ -3279,7 +3279,7 @@ def multiclass_nms(bboxes, """ **Multiclass NMS** - + This operator is to do multi-class non maximum suppression (NMS) on boxes and scores. @@ -3310,9 +3310,9 @@ def multiclass_nms(bboxes, Then: iou = 4/11 > 0.3 - out.data = [[1, 0.3, 3.0, 4.0, 8.0, 5.0], + out.data = [[1, 0.3, 3.0, 4.0, 8.0, 5.0], [2, 0.4, 2.0, 3.0, 7.0, 5.0]] - + Out format is (label, confidence, xmin, ymin, xmax, ymax) Args: bboxes (Variable): Two types of bboxes are supported: @@ -3320,29 +3320,29 @@ def multiclass_nms(bboxes, [N, M, 4 or 8 16 24 32] represents the predicted locations of M bounding bboxes, N is the batch size. Each bounding box has four - coordinate values and the layout is + coordinate values and the layout is [xmin, ymin, xmax, ymax], when box size equals to 4. The data type is float32 or float64. 2. (LoDTensor) A 3-D Tensor with shape [M, C, 4] - M is the number of bounding boxes, C is the - class number. The data type is float32 or float64. + M is the number of bounding boxes, C is the + class number. The data type is float32 or float64. scores (Variable): Two types of scores are supported: 1. (Tensor) A 3-D Tensor with shape [N, C, M] represents the predicted confidence predictions. - N is the batch size, C is the class number, M is - number of bounding boxes. For each category there + N is the batch size, C is the class number, M is + number of bounding boxes. For each category there are total M scores which corresponding M bounding boxes. Please note, M is equal to the 2nd dimension - of BBoxes.The data type is float32 or float64. + of BBoxes.The data type is float32 or float64. 2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. M is the number of bbox, C is the class number. In this case, input BBoxes should be the second - case with shape [M, C, 4].The data type is float32 or float64. - background_label (int): The index of background label, the background + case with shape [M, C, 4].The data type is float32 or float64. + background_label (int): The index of background label, the background label will be ignored. If set to -1, then all categories will be considered. Default: 0 score_threshold (float): Threshold to filter out bounding boxes with - low confidence score. If not provided, + low confidence score. If not provided, consider all boxes. nms_top_k (int): Maximum number of detections to be kept according to the confidences after the filtering detections based @@ -3358,13 +3358,13 @@ def multiclass_nms(bboxes, Variable: A 2-D LoDTensor with shape [No, 6] represents the detections. Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] or A 2-D LoDTensor with shape [No, 10] represents the detections. - Each row has 10 values: - [label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the + Each row has 10 values: + [label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. 
No is the total number of detections. If there is no detected boxes for all images, lod will be set to {1} and Out only contains one value which is -1. - (After version 1.3, when no boxes detected, the lod is changed - from {0} to {1}) + (After version 1.3, when no boxes detected, the lod is changed + from {0} to {1}) Examples: @@ -3433,7 +3433,7 @@ def locality_aware_nms(bboxes, name=None): """ **Local Aware NMS** - + `Local Aware NMS `_ is to do locality-aware non maximum suppression (LANMS) on boxes and scores. @@ -3700,14 +3700,14 @@ def distribute_fpn_proposals(fpn_rois, rois_num=None, name=None): r""" - - **This op only takes LoDTensor as input.** In Feature Pyramid Networks - (FPN) models, it is needed to distribute all proposals into different FPN - level, with respect to scale of the proposals, the referring scale and the - referring level. Besides, to restore the order of proposals, we return an - array which indicates the original index of rois in current proposals. + + **This op only takes LoDTensor as input.** In Feature Pyramid Networks + (FPN) models, it is needed to distribute all proposals into different FPN + level, with respect to scale of the proposals, the referring scale and the + referring level. Besides, to restore the order of proposals, we return an + array which indicates the original index of rois in current proposals. To compute FPN level for each roi, the formula is given as follows: - + .. math:: roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} @@ -3718,36 +3718,36 @@ def distribute_fpn_proposals(fpn_rois, Args: - fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is + fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is float32 or float64. The input fpn_rois. - min_level(int32): The lowest level of FPN layer where the proposals come + min_level(int32): The lowest level of FPN layer where the proposals come from. max_level(int32): The highest level of FPN layer where the proposals come from. refer_level(int32): The referring level of FPN layer with specified scale. refer_scale(int32): The referring scale of FPN layer with specified level. - rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. + rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. The shape is [B] and data type is int32. B is the number of images. - If it is not None then return a list of 1-D Tensor. Each element + If it is not None then return a list of 1-D Tensor. Each element is the output RoIs' number of each image on the corresponding level and the shape is [B]. None by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: Tuple: - multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4] - and data type of float32 and float64. The length is + multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4] + and data type of float32 and float64. The length is max_level-min_level+1. The proposals in each FPN level. - restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is + restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is the number of total rois. The data type is int32. It is used to restore the order of fpn_rois. - rois_num_per_level(List): A list of 1-D Tensor and each Tensor is - the RoIs' number in each image on the corresponding level. 
The shape + rois_num_per_level(List): A list of 1-D Tensor and each Tensor is + the RoIs' number in each image on the corresponding level. The shape is [B] and data type of int32. B is the number of images @@ -3783,7 +3783,7 @@ def box_decoder_and_assign(prior_box, box_clip, name=None): """ - + ${comment} Args: prior_box(${prior_box_type}): ${prior_box_comment} @@ -3791,9 +3791,9 @@ def box_decoder_and_assign(prior_box, target_box(${target_box_type}): ${target_box_comment} box_score(${box_score_type}): ${box_score_comment} box_clip(${box_clip_type}): ${box_clip_comment} - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: Tuple: @@ -3857,9 +3857,9 @@ def collect_fpn_proposals(multi_rois, rois_num_per_level=None, name=None): """ - - **This OP only supports LoDTensor as input**. Concat multi-level RoIs - (Region of Interest) and select N RoIs with respect to multi_scores. + + **This OP only supports LoDTensor as input**. Concat multi-level RoIs + (Region of Interest) and select N RoIs with respect to multi_scores. This operation performs the following steps: 1. Choose num_level RoIs and scores as input: num_level = max_level - min_level @@ -3869,38 +3869,38 @@ def collect_fpn_proposals(multi_rois, 5. Re-sort RoIs by corresponding batch_id Args: - multi_rois(list): List of RoIs to collect. Element in list is 2-D - LoDTensor with shape [N, 4] and data type is float32 or float64, + multi_rois(list): List of RoIs to collect. Element in list is 2-D + LoDTensor with shape [N, 4] and data type is float32 or float64, N is the number of RoIs. - multi_scores(list): List of scores of RoIs to collect. Element in list + multi_scores(list): List of scores of RoIs to collect. Element in list is 2-D LoDTensor with shape [N, 1] and data type is float32 or float64, N is the number of RoIs. min_level(int): The lowest level of FPN layer to collect max_level(int): The highest level of FPN layer to collect post_nms_top_n(int): The number of selected RoIs - rois_num_per_level(list, optional): The List of RoIs' numbers. - Each element is 1-D Tensor which contains the RoIs' number of each - image on each level and the shape is [B] and data type is - int32, B is the number of images. If it is not None then return - a 1-D Tensor contains the output RoIs' number of each image and + rois_num_per_level(list, optional): The List of RoIs' numbers. + Each element is 1-D Tensor which contains the RoIs' number of each + image on each level and the shape is [B] and data type is + int32, B is the number of images. If it is not None then return + a 1-D Tensor contains the output RoIs' number of each image and the shape is [B]. Default: None - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: Variable: - fpn_rois(Variable): 2-D LoDTensor with shape [N, 4] and data type is - float32 or float64. Selected RoIs. + fpn_rois(Variable): 2-D LoDTensor with shape [N, 4] and data type is + float32 or float64. Selected RoIs. - rois_num(Tensor): 1-D Tensor contains the RoIs's number of each - image. The shape is [B] and data type is int32. 
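A rough NumPy sketch of the core selection, keeping the ``post_nms_top_n`` highest-scoring RoIs once all levels have been concatenated for one image (the batch re-sort is omitted; names and values are ours):

.. code-block:: python

    import numpy as np

    # RoIs and scores already concatenated across levels for one image (illustrative values)
    rois   = np.array([[0., 0., 10., 10.], [5., 5., 20., 20.], [1., 1., 4., 4.]], dtype='float32')
    scores = np.array([0.2, 0.9, 0.5], dtype='float32')
    post_nms_top_n = 2

    keep = np.argsort(-scores)[:post_nms_top_n]  # sort by score, keep the top post_nms_top_n
    print(rois[keep])                            # the RoIs scored 0.9 and 0.5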
B is the number of - images. + rois_num(Tensor): 1-D Tensor contains the RoIs's number of each + image. The shape is [B] and data type is int32. B is the number of + images. Examples: .. code-block:: python - + import paddle.fluid as fluid import paddle paddle.enable_static() @@ -3914,10 +3914,10 @@ def collect_fpn_proposals(multi_rois, name='score_'+str(i), shape=[None, 1], dtype='float32', lod_level=1)) fpn_rois = fluid.layers.collect_fpn_proposals( - multi_rois=multi_rois, + multi_rois=multi_rois, multi_scores=multi_scores, - min_level=2, - max_level=5, + min_level=2, + max_level=5, post_nms_top_n=2000) """ num_lvl = max_level - min_level + 1 diff --git a/python/paddle/fluid/layers/distributions.py b/python/paddle/fluid/layers/distributions.py index 757ba0dc885..52042a1f070 100644 --- a/python/paddle/fluid/layers/distributions.py +++ b/python/paddle/fluid/layers/distributions.py @@ -283,7 +283,7 @@ class Normal(Distribution): Examples: .. code-block:: python - + import numpy as np from paddle.fluid import layers from paddle.fluid.layers import Normal @@ -427,9 +427,9 @@ class Normal(Distribution): class Categorical(Distribution): r""" - Categorical distribution is a discrete probability distribution that - describes the possible results of a random variable that can take on - one of K possible categories, with the probability of each category + Categorical distribution is a discrete probability distribution that + describes the possible results of a random variable that can take on + one of K possible categories, with the probability of each category separately specified. The probability mass function (pmf) is: @@ -459,7 +459,7 @@ class Categorical(Distribution): b_logits_npdata = np.array([-0.102,-0.112], dtype="float32") b_logits_tensor = layers.create_tensor(dtype="float32") layers.assign(b_logits_npdata, b_logits_tensor) - + a = Categorical(a_logits_tensor) b = Categorical(b_logits_tensor) @@ -564,7 +564,7 @@ class MultivariateNormalDiag(Distribution): Examples: .. code-block:: python - + import numpy as np from paddle.fluid import layers from paddle.fluid.layers import MultivariateNormalDiag @@ -588,7 +588,7 @@ class MultivariateNormalDiag(Distribution): a = MultivariateNormalDiag(a_loc_tensor, a_scale_tensor) b = MultivariateNormalDiag(b_loc_tensor, b_scale_tensor) - + a.entropy() # [2.033158] with shape: [1] b.entropy() @@ -596,7 +596,7 @@ class MultivariateNormalDiag(Distribution): a.kl_divergence(b) # [0.06542051] with shape: [1] - + """ def __init__(self, loc, scale): diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index c24a0477ffc..1b8a9677f3e 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -54,13 +54,13 @@ def data(name, This operator creates the global variable. The global variables can be accessed by all the following operators in the graph. - Note: - :code:`paddle.fluid.layers.data` is deprecated as it will be removed in + Note: + :code:`paddle.fluid.layers.data` is deprecated as it will be removed in a later version. Please use :code:`paddle.fluid.data` . This :code:`paddle.fluid.layers.data` set shape and dtype at compile time but does NOT check the shape or the dtype of fed data, the - :code:`paddle.fluid.data` checks the shape and the dtype of data fed + :code:`paddle.fluid.data` checks the shape and the dtype of data fed by Executor or ParallelExecutor during run time. 
To feed variable size inputs, users can feed variable size inputs @@ -77,12 +77,12 @@ def data(name, name(str): The name/alias of the variable, see :ref:`api_guide_Name` for more details. shape(list|tuple): Tuple declaring the shape. If :code:`append_batch_size` is - True and there is no -1 inside :code:`shape`, it should be + True and there is no -1 inside :code:`shape`, it should be considered as the shape of the each sample. Otherwise, it should - be considered as the shape of the batched data. + be considered as the shape of the batched data. append_batch_size(bool): 1. If true, it prepends -1 to the shape. - For example if shape=[1], the resulting shape is [-1, 1]. This will + For example if shape=[1], the resulting shape is [-1, 1]. This will be useful to set different batch size at run time. 2. If shape contains -1, such as shape=[1, -1]. append_batch_size will be enforced to be be False (ineffective) @@ -91,11 +91,11 @@ def data(name, dtype(np.dtype|VarType|str): The type of the data. Supported dtype: bool, float16, float32, float64, int8, int16, int32, int64, uint8. type(VarType): The output type. Supported dtype: VarType.LOD_TENSOR, - VarType.SELECTED_ROWS, VarType.NCCL_ID. Default: VarType.LOD_TENSOR. + VarType.SELECTED_ROWS, VarType.NCCL_ID. Default: VarType.LOD_TENSOR. lod_level(int): The LoD Level. 0 means the input data is not a sequence. Default: 0. stop_gradient(bool): A boolean that mentions whether gradient should flow. - Default: True. + Default: True. Returns: The global variable that gives access to the data. @@ -571,24 +571,24 @@ def py_reader(capacity, This operator returns a Reader Variable. The Reader provides :code:`decorate_paddle_reader()` and :code:`decorate_tensor_provider()` to set a Python generator as the data - source and feed the data from the data source to the Reader Variable. - When :code:`Executor::Run()` is invoked in C++ side, the data from the + source and feed the data from the data source to the Reader Variable. + When :code:`Executor::Run()` is invoked in C++ side, the data from the generator would be read automatically. Unlike :code:`DataFeeder.feed()`, - the data reading process and :code:`Executor::Run()` process can run in + the data reading process and :code:`Executor::Run()` process can run in parallel using :code:`py_reader`. The :code:`start()` method of the Reader - should be called when each pass begins, while the :code:`reset()` method + should be called when each pass begins, while the :code:`reset()` method should be called when the pass ends and :code:`fluid.core.EOFException` raises. Note: - :code:`Program.clone()` method cannot clone :code:`py_reader`. You can + :code:`Program.clone()` method cannot clone :code:`py_reader`. You can refer to :ref:`api_fluid_Program` for more details. - + The :code:`read_file` call needs to be in the program block of :code:`py_reader`. You can refer to :ref:`api_fluid_layers_read_file` for more details. Args: capacity(int): The buffer capacity maintained by :code:`py_reader`. - shapes(list|tuple): List of tuples which declaring data shapes. shapes[i] + shapes(list|tuple): List of tuples which declaring data shapes. shapes[i] represents the i-th data shape. dtypes(list|tuple): List of strings which declaring data type. Supported dtype: bool, float16, float32, float64, int8, int16, int32, int64, uint8. @@ -596,8 +596,8 @@ def py_reader(capacity, name(basestring): The default value is None. Normally there is no need for user to set this property. 
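A minimal static-graph sketch of how ``append_batch_size`` affects the declared shape (the shapes used here are illustrative):

.. code-block:: python

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    # append_batch_size=True (the default) prepends -1 as the batch dimension
    x = fluid.layers.data(name='x', shape=[784], dtype='float32')
    print(x.shape)  # [-1, 784]
    # append_batch_size=False keeps the declared shape untouched
    y = fluid.layers.data(name='y', shape=[8, 784], append_batch_size=False, dtype='float32')
    print(y.shape)  # [8, 784]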
For more information, please refer to :ref:`api_guide_Name`. - use_double_buffer(bool): Whether use double buffer or not. The double buffer is - for pre-reading the data of the next batch and copy the data asynchronously + use_double_buffer(bool): Whether use double buffer or not. The double buffer is + for pre-reading the data of the next batch and copy the data asynchronously from CPU to GPU. Default is True. Returns: @@ -608,9 +608,9 @@ def py_reader(capacity, Examples: 1. The basic usage of :code:`py_reader` is as follows: - + .. code-block:: python - + import paddle import paddle.fluid as fluid import paddle.dataset.mnist as mnist @@ -649,7 +649,7 @@ def py_reader(capacity, :code:`py_reader` should be created with different names, e.g.: .. code-block:: python - + import paddle import paddle.fluid as fluid import paddle.dataset.mnist as mnist @@ -843,15 +843,15 @@ def double_buffer(reader, place=None, name=None): Args: reader (Variable): The Reader Variable need to be wrapped. place (Place|str, optional): The place of target data, such as CPU, GPU, and if use GPU, it's necessary to point out which card is involved. Default is the sample place of executor perform. - if ``place`` is string, It can be ``cpu``, ``gpu:x``, where ``x`` is the ndex of the GPUs. - name (str, optional): Variable name. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default is None. + if ``place`` is string, It can be ``cpu``, ``gpu:x``, where ``x`` is the ndex of the GPUs. + name (str, optional): Variable name. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default is None. Returns: Variable(Reader): wrapped reader with double buffer. Examples: .. code-block:: python - + import paddle.fluid as fluid reader = fluid.layers.py_reader(capacity=64, shapes=[(-1, 1, 28, 28), (-1, 1)], @@ -889,7 +889,7 @@ def read_file(reader): Examples: .. code-block:: python - + import paddle.fluid as fluid reader = fluid.layers.py_reader(capacity=64, shapes=[(-1, 1, 28, 28), (-1, 1)], diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 65ce37157c2..fccef292c68 100755 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -393,7 +393,7 @@ def templatedoc(op_type=None): def add_sample_code(func, sample_code): """ - Append sample code for dynamically generated functions. + Append sample code for dynamically generated functions. Args: func: The function of the function to be append sample code to. diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index e1a65633e60..7da1403ef4d 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -56,7 +56,7 @@ def noam_decay(d_model, warmup_steps, learning_rate=1.0): Noam decay method. The numpy implementation of noam decay as follows. .. code-block:: python - + import paddle.fluid as fluid import numpy as np # set hyper parameters @@ -128,11 +128,11 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): >>> decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) Args: - learning_rate(Variable|float): The initial learning rate. It should be a Variable + learning_rate(Variable|float): The initial learning rate. 
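A minimal sketch of the decay rule quoted above, with and without the staircase behaviour described in the Args (numbers are illustrative):

.. code-block:: python

    import math

    def exp_decay(lr, decay_rate, decay_steps, global_step, staircase=False):
        exponent = global_step / float(decay_steps)
        if staircase:
            exponent = math.floor(exponent)  # decay only at discrete intervals
        return lr * decay_rate ** exponent

    print(exp_decay(0.1, 0.5, 100, 150))                  # 0.1 * 0.5 ** 1.5 ~= 0.0354
    print(exp_decay(0.1, 0.5, 100, 150, staircase=True))  # 0.1 * 0.5 ** 1    =  0.05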
It should be a Variable or a float decay_steps(int): The learning rate decay steps. See the decay computation above. decay_rate(float): The learning rate decay rate. See the decay computation above. - staircase(bool): If True, decay the learning rate at discrete intervals, which + staircase(bool): If True, decay the learning rate at discrete intervals, which means the learning rate will be decayed by `decay_rate` every `decay_steps`. If False, learning rate will be decayed continuously and following the formula above. Default: False @@ -189,11 +189,11 @@ Applies natural exponential decay to the initial learning rate. >>> decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps)) Args: - learning_rate(Variable|float): The initial learning rate. It should be a Variable + learning_rate(Variable|float): The initial learning rate. It should be a Variable or a float decay_steps(int): The learning rate decay steps. See the decay computation above. decay_rate(float): The learning rate decay rate. See the decay computation above. - staircase(bool): If True, decay the learning rate at discrete intervals, which + staircase(bool): If True, decay the learning rate at discrete intervals, which means the learning rate will be decayed by natural exponential power `decay_rate` every `decay_steps`. If False, learning rate will be decayed continuously and following the formula above. Default: False @@ -250,13 +250,13 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): >>> decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step) Args: - learning_rate(Variable|float): The initial learning rate. It should be a Variable + learning_rate(Variable|float): The initial learning rate. It should be a Variable or a float decay_steps(int): The learning rate decay steps. See the decay computation above. decay_rate(float): The learning rate decay rate. See the decay computation above. - staircase(bool): If True, decay the learning rate at discrete intervals, which - means the learning rate will be decayed by `decay_rate` times - every `decay_steps`. If False, learning rate will be decayed + staircase(bool): If True, decay the learning rate at discrete intervals, which + means the learning rate will be decayed by `decay_rate` times + every `decay_steps`. If False, learning rate will be decayed continuously and following the formula above. Default: False Returns: @@ -493,40 +493,40 @@ def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr): This operator use the linear learning rate warm up strategy to adjust the learning rate preliminarily before the normal learning rate scheduling. For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks `_ - + When global_step < warmup_steps, learning rate is updated as: - + .. code-block:: text - + linear_step = end_lr - start_lr lr = start_lr + linear_step * (global_step / warmup_steps) - + where start_lr is the initial learning rate, and end_lr is the final learning rate; - + When global_step >= warmup_steps, learning rate is updated as: - + .. code-block:: text - + lr = learning_rate - + where lr is the learning_rate after warm-up. - + Args: learning_rate (Variable|float): Learning_rate after warm-up, it could be 1D-Tensor or single value with the data type of float32. warmup_steps (int): Steps for warm up. start_lr (float): Initial learning rate of warm up. end_lr (float): Final learning rate of warm up. 
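The warm-up rule is easy to check outside the framework; a minimal sketch using the formula quoted above (values are illustrative):

.. code-block:: python

    def warmup_lr(global_step, warmup_steps, start_lr, end_lr, learning_rate):
        # piecewise rule from the description above
        if global_step < warmup_steps:
            linear_step = end_lr - start_lr
            return start_lr + linear_step * (global_step / float(warmup_steps))
        return learning_rate

    # warmup_steps=50, start_lr=0, end_lr=0.1, then a constant 0.1 afterwards
    print(warmup_lr(25, 50, 0., 0.1, 0.1))  # 0.05, halfway through warm-up
    print(warmup_lr(80, 50, 0., 0.1, 0.1))  # 0.1, warm-up finished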
- + Returns: Variable: Warm-up learning rate with the same data type as learning_rate. - - + + Examples: - + .. code-block:: python - + import paddle.fluid as fluid - + boundaries = [100, 200] lr_steps = [0.1, 0.01, 0.001] learning_rate = fluid.layers.piecewise_decay(boundaries, lr_steps) #case1, 1D-Tensor @@ -536,7 +536,7 @@ def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr): end_lr = 0.1 decayed_lr = fluid.layers.linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr) - + place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 20c198388a9..cd62b86b193 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -64,14 +64,14 @@ def center_loss(input, :api_attr: Static Graph **Center loss Cost layer** - + This OP accepts input (deep features,the output of the last hidden layer) - and target label and return the center loss cost. The average of the - distances of each sample in the mini-batch from the center of the + and target label and return the center loss cost. The average of the + distances of each sample in the mini-batch from the center of the corresponding category is calculated as the center loss. - + For deep features, :math:`X`, and target labels, :math:`Y`, the equation is: - + .. math:: Out = \\frac{1}{2}(X - Y)^2 @@ -82,16 +82,16 @@ def center_loss(input, with shape[N x 1],where N is the batch size. Its dtype should be int32. num_classes (int): the number of classification categories. alpha (float|Variable): learning rate of centers. - param_attr (ParamAttr): Attribute initializer of centers. + param_attr (ParamAttr): Attribute initializer of centers. update_center (bool): whether to update value of center. Returns: - Variable: 2-D tensor with shape [N * 1] + Variable: 2-D tensor with shape [N * 1] Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle.fluid as fluid import paddle paddle.enable_static() @@ -326,7 +326,7 @@ def square_error_cost(input, label): label (Tensor): Label tensor, the data type should be float32. Returns: - Tensor, The tensor storing the element-wise squared + Tensor, The tensor storing the element-wise squared error difference between input and label. Examples: @@ -439,28 +439,28 @@ def warpctc(input, Args: input (Variable): The unscaled probabilities of variable-length sequences, which is a 2-D Tensor with LoD information, or a 3-D Tensor without Lod - information. When it is a 2-D LodTensor, its shape is + information. When it is a 2-D LodTensor, its shape is `[Lp, num_classes + 1]`, where `Lp` is the sum of all input sequences' length and `num_classes` is the true number of classes. - (not including the blank label). When it is a 3-D Tensor, its shape + (not including the blank label). When it is a 3-D Tensor, its shape is `[max_logit_length, batch_size, num_classes + 1]`, where `max_logit_length` is the longest length of input logit sequence. The data type should be float32 or float64. label (Variable): The ground truth of variable-length sequence, which must be a 2-D Tensor with LoD information or a 3-D Tensor without - LoD information, needs to be consistent with the coressponding input. - When it is a 2-D LoDTensor, its shape is `[Lg, 1]`, where `Lg` is the sum - of all labels' length. When it is a 3-D Tensor, its shape is + LoD information, needs to be consistent with the coressponding input. 
+ When it is a 2-D LoDTensor, its shape is `[Lg, 1]`, where `Lg` is the sum + of all labels' length. When it is a 3-D Tensor, its shape is `[batch_size, max_label_length]`, where `max_label_length` is the longest length of label sequence. Data type must be int32. blank (int, default 0): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the - half-opened interval `[0, num_classes + 1)`. The data type must be int32. + half-opened interval `[0, num_classes + 1)`. The data type must be int32. norm_by_times(bool, default false): Whether to normalize the gradients by the number of time-step, which is also the sequence's length. There is no need to normalize the gradients if warpctc layer was followed by a mean_op. - input_length(Variable): The length for each input sequence if it is + input_length(Variable): The length for each input sequence if it is of Tensor type, it should have shape `[batch_size]` and dtype int64. label_length(Variable): The length for each label sequence if it is of Tensor type, it should have shape `[batch_size]` and dtype int64. @@ -494,10 +494,10 @@ def warpctc(input, cost = fluid.layers.warpctc(input=logits, label=label) place = fluid.CPUPlace() x = fluid.create_lod_tensor( - np.random.rand(np.sum(seq_lens), class_num+1).astype("float32"), + np.random.rand(np.sum(seq_lens), class_num+1).astype("float32"), [seq_lens], place) y = fluid.create_lod_tensor( - np.random.randint(0, class_num, [np.sum(label_lens), 1]).astype("int32"), + np.random.randint(0, class_num, [np.sum(label_lens), 1]).astype("int32"), [label_lens], place) exe = fluid.Executor(place) output= exe.run(fluid.default_main_program(), @@ -620,7 +620,7 @@ def nce(input, ${comment} Args: - input (Tensor): Input tensor, 2-D tensor with shape [batch_size, dim], + input (Tensor): Input tensor, 2-D tensor with shape [batch_size, dim], and data type is float32 or float64. label (Tensor): Input label, 2-D tensor with shape [batch_size, num_true_class], and data type is int64. @@ -628,14 +628,14 @@ def nce(input, sample_weight (Tensor|None): A Tensor of shape [batch_size, 1] storing a weight for each sample. The default weight for each sample is 1.0. - param_attr (ParamAttr|None): To specify the weight parameter attribute. - Default: None, which means the default weight parameter property is + param_attr (ParamAttr|None): To specify the weight parameter attribute. + Default: None, which means the default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . - bias_attr (ParamAttr|None): To specify the bias parameter attribute. - Default: None, which means the default bias parameter property is + bias_attr (ParamAttr|None): To specify the bias parameter attribute. + Default: None, which means the default bias parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . num_neg_samples (int): ${num_neg_samples_comment}. - name(str|None): For detailed information, please refer to + name(str|None): For detailed information, please refer to :ref:`api_guide_Name` . Usually name is no need to set and None by default. sampler (str, optional): The sampler used to sample class from negative classes. It can be 'uniform', 'log_uniform' or 'custom_dist'. @@ -645,7 +645,7 @@ def nce(input, custom_dist[i] is the probability of i-th class to be sampled. default: None. seed (int, optional): The seed used in sampler. Default 0, means no random seed. 
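For the ``custom_dist`` sampler described above, a minimal sketch of turning raw class counts into the expected probability list (the counts are illustrative):

.. code-block:: python

    # class frequencies collected from the training data (illustrative numbers)
    class_counts = [50, 30, 15, 5]
    total = float(sum(class_counts))
    custom_dist = [c / total for c in class_counts]  # custom_dist[i] = P(class i is sampled)
    print(custom_dist)  # [0.5, 0.3, 0.15, 0.05]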
- is_sparse(bool, optional): The flag indicating whether to use sparse update, + is_sparse(bool, optional): The flag indicating whether to use sparse update, the weight@GRAD and bias@GRAD will be changed to SelectedRows. Default False. Returns: @@ -828,7 +828,7 @@ def hsigmoid(input, is_sparse=False): """ :api_attr: Static Graph - + The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity and speed up the model training, especially the training of language model. Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. @@ -990,50 +990,50 @@ def sampled_softmax_with_cross_entropy(logits, """ **Sampled Softmax With Cross Entropy Operator.** - Cross entropy loss with sampled softmax is used as the output layer for + Cross entropy loss with sampled softmax is used as the output layer for larger output classes extensively. This operator samples a number of samples - for all examples, and computes the softmax normalized values for each - row of the sampled tensor, after which cross-entropy loss is computed. + for all examples, and computes the softmax normalized values for each + row of the sampled tensor, after which cross-entropy loss is computed. Because this operator performs a softmax on logits internally, it expects unscaled logits. This operator should not be used with the output of softmax operator since that would produce incorrect results. - + For examples with T true labels (T >= 1), we assume that each true label has a probability of 1/T. For each sample, S samples are generated using a log uniform distribution. True labels are concatenated with these samples to form T + S samples for each example. So, assume the shape of logits is - [N x K], the shape for samples is [N x (T+S)]. For each sampled label, a - probability is calculated, which corresponds to the Q(y|x) in + [N x K], the shape for samples is [N x (T+S)]. For each sampled label, a + probability is calculated, which corresponds to the Q(y|x) in [Jean et al., 2014](http://arxiv.org/abs/1412.2007). - - Logits are sampled according to the sampled labels. Then if - remove_accidental_hits is True, if a sample[i, j] accidentally hits true - labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to + + Logits are sampled according to the sampled labels. Then if + remove_accidental_hits is True, if a sample[i, j] accidentally hits true + labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to make its softmax result close to zero. Then sampled logits are subtracted by - logQ(y|x), these sampled logits and re-indexed labels are used to compute + logQ(y|x), these sampled logits and re-indexed labels are used to compute a softmax with cross entropy. Args: logits (Variable): The unscaled log probabilities, which is a 2-D tensor with shape [N x K]. N is the batch_size, and K is the class number. - label (Variable): The ground truth which is a 2-D tensor. Label is a - Tensor with shape [N x T], where T is the number of true - labels per example. - num_samples (int): The number for each example, num_samples should be + label (Variable): The ground truth which is a 2-D tensor. Label is a + Tensor with shape [N x T], where T is the number of true + labels per example. + num_samples (int): The number for each example, num_samples should be less than the number of class. num_true(int): The number of target classes per training example. 
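A minimal NumPy sketch of the two corrections described above, masking accidental hits and subtracting ``log Q(y|x)``; this illustrates the idea only, not the operator's exact kernel, and all values are made up:

.. code-block:: python

    import numpy as np

    num_true = 1
    sampled_labels = np.array([[3, 7, 3]])        # columns: [true labels | sampled negatives]
    sampled_logits = np.array([[2.0, 1.0, 0.5]])  # logits gathered at those labels
    log_q = np.array([[0.1, 0.2, 0.3]])           # log Q(y|x) of each sampled label

    # accidental hit: a sampled negative equals a true label -> push its logit towards -inf
    true_labels = sampled_labels[:, :num_true]
    is_hit = np.zeros_like(sampled_logits, dtype=bool)
    is_hit[:, num_true:] = (sampled_labels[:, num_true:] == true_labels)
    sampled_logits = np.where(is_hit, sampled_logits - 1e20, sampled_logits)

    # subtract log Q(y|x) before the softmax-with-cross-entropy step
    sampled_logits = sampled_logits - log_q
    print(sampled_logits)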
- remove_accidental_hits (bool): A flag indicating whether to remove - accidental hits when sampling. If True and if a sample[i, j] - accidentally hits true labels, then the corresponding - sampled_logits[i, j] is minus by 1e20 to make its softmax result + remove_accidental_hits (bool): A flag indicating whether to remove + accidental hits when sampling. If True and if a sample[i, j] + accidentally hits true labels, then the corresponding + sampled_logits[i, j] is minus by 1e20 to make its softmax result close to zero. Default is True. use_customized_samples (bool): Whether to use custom samples and probabities to sample logits. customized_samples (Variable): User defined samples, which is a 2-D tensor - with shape [N, T + S]. S is the num_samples, and T is the number of true - labels per example. - customized_probabilities (Variable): User defined probabilities of samples, + with shape [N, T + S]. S is the num_samples, and T is the number of true + labels per example. + customized_probabilities (Variable): User defined probabilities of samples, a 2-D tensor which has the same shape with customized_samples. seed (int): The random seed for generating random number, which is used in the process of sampling. Default is 0. @@ -1140,17 +1140,17 @@ def softmax_with_cross_entropy(logits, axis=-1): r""" - This operator implements the cross entropy loss function with softmax. This function - combines the calculation of the softmax operation and the cross entropy loss function + This operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function to provide a more numerically stable gradient. Because this operator performs a softmax on logits internally, it expects unscaled logits. This operator should not be used with the output of softmax operator since that would produce incorrect results. - When the attribute :attr:`soft_label` is set :attr:`False`, this operators - expects mutually exclusive hard labels, each sample in a batch is in exactly - one class with a probability of 1.0. Each sample in the batch will have a + When the attribute :attr:`soft_label` is set :attr:`False`, this operators + expects mutually exclusive hard labels, each sample in a batch is in exactly + one class with a probability of 1.0. Each sample in the batch will have a single label. The equation is as follows: @@ -1185,27 +1185,27 @@ def softmax_with_cross_entropy(logits, Args: logits (Tensor): A multi-dimension ``Tensor`` , and the data type is float32 or float64. The input tensor of unscaled log probabilities. label (Tensor): The ground truth ``Tensor`` , data type is the same - as the ``logits`` . If :attr:`soft_label` is set to :attr:`True`, - Label is a ``Tensor`` in the same shape with :attr:`logits`. - If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` + as the ``logits`` . If :attr:`soft_label` is set to :attr:`True`, + Label is a ``Tensor`` in the same shape with :attr:`logits`. + If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` in the same shape with :attr:`logits` expect shape in dimension :attr:`axis` as 1. soft_label (bool, optional): A flag to indicate whether to interpretant the given labels as soft labels. Default False. ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. Only valid - if :attr:`soft_label` is set to :attr:`False`. + if :attr:`soft_label` is set to :attr:`False`. 
Default: kIgnoreIndex(-100). numeric_stable_mode (bool, optional): A flag to indicate whether to use a more numerically stable algorithm. Only valid - when :attr:`soft_label` is :attr:`False` - and GPU is used. When :attr:`soft_label` - is :attr:`True` or CPU is used, the + when :attr:`soft_label` is :attr:`False` + and GPU is used. When :attr:`soft_label` + is :attr:`True` or CPU is used, the algorithm is always numerically stable. Note that the speed may be slower when use stable algorithm. Default: True. return_softmax (bool, optional): A flag indicating whether to return the softmax along with the cross entropy loss. Default: False. - axis (int, optional): The index of dimension to perform softmax calculations. It + axis (int, optional): The index of dimension to perform softmax calculations. It should be in range :math:`[-1, rank - 1]`, while :math:`rank` is the rank of input :attr:`logits`. Default: -1. @@ -1244,7 +1244,7 @@ def identity_loss(x, reduction="none"): it is used as the start of backpropagation. When `reduction` is `none`, return raw `Out`. - + When `reduction` is `mean`, return .. math:: @@ -1297,15 +1297,15 @@ def identity_loss(x, reduction="none"): def rank_loss(label, left, right, name=None): r""" - This operator implements the sort loss layer in the RankNet model. RankNet is a pairwise ranking model - with a training sample consisting of a pair of documents (A and B), The label (P) - indicates whether A is ranked higher than B or not. Please refer to more details: + This operator implements the sort loss layer in the RankNet model. RankNet is a pairwise ranking model + with a training sample consisting of a pair of documents (A and B), The label (P) + indicates whether A is ranked higher than B or not. Please refer to more details: `RankNet `_ Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores - for documents A and B and the value of label P. Rank loss layer takes batch inputs - with size batch_size (batch_size >= 1), P = {0, 1} or {0, 0.5, 1}, + for documents A and B and the value of label P. Rank loss layer takes batch inputs + with size batch_size (batch_size >= 1), P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information about the rank of the input pair. The following equation computes rank loss C_{i,j} from the inputs: @@ -1374,7 +1374,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): left (Variable): Ranking score for left. Data type float32. right (Variable): Ranking score for right. Data type float32. margin (float): Indicates the given margin. - name(str|None): For detailed information, please refer to + name(str|None): For detailed information, please refer to :ref:`api_guide_Name` . Usually name is no need to set and None by default. Returns: @@ -1430,7 +1430,7 @@ def sigmoid_cross_entropy_with_logits(x, as log(p/(1-p)) The data type should be float32 or float64. label (Tensor): a 2-D tensor of the same type and shape as X. This input is a tensor of probabalistic labels for each logit. - ignore_index(int): Specifies a target value that is ignored and + ignore_index(int): Specifies a target value that is ignored and does not contribute to the input gradient. name(str|None): The default value is None. Normally there is no need for user to set this property. 
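The numerical-stability point made for ``softmax_with_cross_entropy`` above is the usual log-sum-exp shift; a minimal NumPy sketch of hard-label cross entropy computed from unscaled logits (a sketch only, not the operator's implementation; names are ours):

.. code-block:: python

    import numpy as np

    def softmax_ce(logits, label):
        # subtract the row max before exponentiating so exp() cannot overflow
        shifted = logits - logits.max(axis=-1, keepdims=True)
        log_softmax = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
        return -np.take_along_axis(log_softmax, label[:, None], axis=-1)

    logits = np.array([[2.0, 1.0, 0.1]])
    label = np.array([0])
    print(softmax_ce(logits, label))  # ~0.417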
For more information, @@ -1449,7 +1449,7 @@ def sigmoid_cross_entropy_with_logits(x, input = paddle.rand(shape=[10], dtype='float32') label = paddle.rand(shape=[10], dtype='float32') - loss = paddle.fluid.layers.sigmoid_cross_entropy_with_logits(input, label, + loss = paddle.fluid.layers.sigmoid_cross_entropy_with_logits(input, label, ignore_index=-1, normalize=True) print(loss) """ @@ -1507,7 +1507,7 @@ def teacher_student_sigmoid_loss(input, Examples: .. code-block:: python - + import paddle.fluid as fluid import paddle paddle.enable_static() @@ -1632,22 +1632,22 @@ def kldiv_loss(x, target, reduction='mean', name=None): import paddle import paddle.fluid as fluid - + x = paddle.rand(shape=[3,4,2,2], dtype='float32') target = paddle.rand(shape=[3,4,2,2], dtype='float32') # 'batchmean' reduction, loss shape will be [1] loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='batchmean') print(loss.shape) # shape=[1] - + # 'mean' reduction, loss shape will be [1] loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='mean') print(loss.shape) # shape=[1] - + # 'sum' reduction, loss shape will be [1] loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='sum') print(loss.shape) # shape=[1] - + # 'none' reduction, loss shape is same with X shape loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='none') print(loss.shape) # shape=[3, 4, 2, 2] @@ -1674,42 +1674,42 @@ from .control_flow import equal def npair_loss(anchor, positive, labels, l2_reg=0.002): - """ - + """ + Npair loss requires paired data. Npair loss has two parts: the first part is L2 regularizer on the embedding vector; the second part is cross entropy loss which takes the similarity matrix of anchor and positive as logits. - + For more information, please refer to: `Improved Deep Metric Learning with Multi class N pair Loss Objective `_ - + Args: - anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], + anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], the data type is float32 or float64. - positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], + positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], the data type is float32 or float64. labels(Tensor): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64. l2_reg(float32): L2 regularization term on embedding vector, default: 0.002. - + Returns: A Tensor representing the npair loss, the data type is the same as anchor, the shape is [1]. - + Examples: .. code-block:: python - + import paddle - + DATATYPE = "float32" - + anchor = paddle.rand(shape=(18, 6), dtype=DATATYPE) positive = paddle.rand(shape=(18, 6), dtype=DATATYPE) labels = paddle.rand(shape=(18,), dtype=DATATYPE) - + npair_loss = paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg = 0.002) print(npair_loss) - + """ return paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg) @@ -1722,10 +1722,10 @@ def mse_loss(input, label): The loss can be described as: .. math:: - + Out = MEAN((input - label)^2) - Parameters: + Parameters: input (Tensor): Input tensor, the data type should be float32. label (Tensor): Label tensor, the data type should be float32. @@ -1733,7 +1733,7 @@ def mse_loss(input, label): Tensor: The tensor storing the mean square error difference of input and label. Return type: Tensor. - + Examples: .. 
code-block:: python diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index 7c66e9736ea..1afea145e4a 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -131,7 +131,7 @@ def monkey_patch_variable(): @static_only def cpu(self): - """ + """ Variable should not have cpu() and cuda() interface. But this interface can greatly facilitate dy2static. We do nothing here. @@ -140,7 +140,7 @@ def monkey_patch_variable(): @static_only def cuda(self): - """ + """ Variable should not have cpu() and cuda() interface. But this interface can greatly facilitate dy2static. We do nothing here. @@ -209,7 +209,7 @@ def monkey_patch_variable(): """ **Notes**: **The type variable must be LoD Tensor Array. - + """ if not isinstance(var, Variable): if in_declarative_mode(): @@ -229,8 +229,8 @@ def monkey_patch_variable(): @static_only def _item(self): - """ - In order to be compatible with the item interface introduced by the dynamic graph, it does nothing but returns self. + """ + In order to be compatible with the item interface introduced by the dynamic graph, it does nothing but returns self. It will check that the shape must be a 1-D tensor """ if len(self.shape) > 1: @@ -243,7 +243,7 @@ def monkey_patch_variable(): def pop(self, *args): """ The type variable must be LoD Tensor Array. - When self is LoDTensorArray, calling pop is similar to Python's pop on list. + When self is LoDTensorArray, calling pop is similar to Python's pop on list. This interface is used to simplify dygraph to static graph operations. Args: diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index 736213340e9..8c1fa12cd15 100755 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -155,7 +155,7 @@ def auc(input, the roc curve. Default 4095. topk(int): only topk number of prediction output will be used for auc. slide_steps: when calc batch auc, we can not only use step currently but the previous steps can be used. slide_steps=1 means use the current step, slide_steps=3 means use current step and the previous second steps, slide_steps=0 use all of the steps. - ins_tag_weight(Tensor): A 2D int Tensor indicating the data's tag weight, 1 means real data, 0 means fake data. Default None, and it will be assigned to a tensor of value 1. + ins_tag_weight(Tensor): A 2D int Tensor indicating the data's tag weight, 1 means real data, 0 means fake data. Default None, and it will be assigned to a tensor of value 1. A Tensor with type float32,float64. Returns: @@ -186,7 +186,7 @@ def auc(input, output= exe.run(feed={"input": x,"label": y}, fetch_list=[result[0]]) print(output) - + #you can learn the usage of ins_tag_weight by the following code. ''' import paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b4330f1c4a7..2898ad64fcc 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -507,7 +507,7 @@ def embedding(input, import numpy as np import paddle paddle.enable_static() - + data = fluid.data(name='x', shape=[None, 1], dtype='int64') # example 1 @@ -1093,7 +1093,7 @@ def dropout(x, Args: x (Variable): The input tensor variable. The data type is float16 or float32 or float64. dropout_prob (float): Probability of setting units to zero. - is_test (bool): A flag indicating whether it is in test phrase or not. + is_test (bool): A flag indicating whether it is in test phrase or not. 
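A minimal NumPy sketch of the train/test behaviour controlled by ``is_test``, assuming the scale-at-inference (``downgrade_in_infer``) formulation; the upscale-in-train variant is not covered here:

.. code-block:: python

    import numpy as np

    def dropout_forward(x, dropout_prob, is_test):
        if is_test:
            # inference: no masking, scale the activations down instead
            return x * (1.0 - dropout_prob)
        # training: zero each unit with probability dropout_prob
        mask = (np.random.rand(*x.shape) >= dropout_prob).astype(x.dtype)
        return x * mask

    x = np.ones((2, 4), dtype='float32')
    print(dropout_forward(x, 0.5, is_test=False))  # roughly half the units zeroed
    print(dropout_forward(x, 0.5, is_test=True))   # everything scaled by 0.5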
Default None, in dynamic graph, it use global tracer mode; in static graph, it means False. seed (int): A Python integer used to create random seeds. If this parameter is set to None, a random seed is used. @@ -1128,7 +1128,7 @@ def dropout(x, import paddle import paddle.fluid as fluid - + paddle.enable_static() x = fluid.data(name="data", shape=[None, 32, 32], dtype="float32") dropped = fluid.layers.dropout(x, dropout_prob=0.5) @@ -1260,7 +1260,7 @@ def chunk_eval(input, Args: input (Tensor): A Tensor representing the predicted labels from the network. Its shape would be `[N, M, 1]`, - where `N` stands for batch size, `M` for sequence length. + where `N` stands for batch size, `M` for sequence length. The data type should be int64. label (Tensor): A Tensor representing the ground-truth labels. It should have the same shape, lod and data type as ``input`` . @@ -1620,7 +1620,7 @@ def conv2d(input, import paddle paddle.enable_static() - + data = paddle.static.data(name='data', shape=[None, 3, 32, 32], dtype='float32') conv2d = paddle.static.nn.conv2d(input=data, num_filters=2, filter_size=3, act="relu") print(conv2d.shape) # [-1, 2, 30, 30] @@ -1920,7 +1920,7 @@ def conv3d(input, import paddle import numpy as np - + paddle.enable_static() data = paddle.static.data(name='data', shape=[None, 3, 12, 32, 32], dtype='float32') param_attr = paddle.framework.ParamAttr(name='conv3d.weight', initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001) @@ -2945,7 +2945,7 @@ def batch_norm(input, .. code-block:: python import paddle - + paddle.enable_static() x = paddle.static.data(name='x', shape=[3, 7, 3, 7], dtype='float32') hidden1 = paddle.static.nn.fc(x=x, size=200) @@ -3806,7 +3806,7 @@ def group_norm(input, import paddle paddle.enable_static() - + data = paddle.static.data(name='data', shape=[2, 8, 32, 32], dtype='float32') x = paddle.static.nn.group_norm(input=data, groups=4) print(x.shape) # [2, 8, 32, 32] @@ -4071,14 +4071,14 @@ def conv2d_transpose(input, stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. If stride is a tuple, it must contain two integers, (stride_height, stride_width). Otherwise, stride_height = stride_width = stride. Default: stride = 1. - padding(str|int|list|tuple, optional): The padding size. It means the number of zero-paddings - on both sides for each dimension. If `padding` is a string, either 'VALID' or + padding(str|int|list|tuple, optional): The padding size. It means the number of zero-paddings + on both sides for each dimension. If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If `padding` is a tuple or list, - it could be in three forms: `[pad_height, pad_width]` or + it could be in three forms: `[pad_height, pad_width]` or `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCHW"`, `padding` can be in the form + and when `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `padding` can be in the form + when `data_format` is `"NHWC"`, `padding` can be in the form `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points. 
@@ -4118,7 +4118,7 @@ def conv2d_transpose(input, Returns: A Tensor representing the conv2d_transpose, whose data type is the same with input and shape is (num_batches, channels, out_h, - out_w) or (num_batches, out_h, out_w, channels). If act is None, the tensor + out_w) or (num_batches, out_h, out_w, channels). If act is None, the tensor storing the transposed convolution result, and if act is not None, the tensor storing transposed convolution and non-linearity activation result. @@ -4472,7 +4472,7 @@ def conv3d_transpose(input, import paddle import numpy as np - + paddle.enable_static() data = paddle.static.data(name='data', shape=[None, 3, 12, 32, 32], dtype='float32') param_attr = paddle.framework.ParamAttr(name='conv3d.weight', initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001) @@ -5147,15 +5147,15 @@ def split(input, num_or_sections, dim=-1, name=None): Args: input (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, int32 or int64. - num_or_sections (int|list|tuple): If ``num_or_sections`` is int, then the ``num_or_sections`` + num_or_sections (int|list|tuple): If ``num_or_sections`` is int, then the ``num_or_sections`` indicates the number of equal sized sub-Tensors that the ``input`` - will be divided into. If ``num_or_sections`` is a list or tuple, the length of it + will be divided into. If ``num_or_sections`` is a list or tuple, the length of it indicates the number of sub-Tensors and the elements in it indicate the sizes of sub-Tensors' dimension orderly. The length of the list mustn't be larger than the ``input`` 's size of specified dim. dim (int|Tensor, optional): The dimension along which to split, it can be a scalar with type ``int`` or a ``Tensor`` with shape [1] and data type ``int32`` or ``int64``. If :math:`dim < 0`, the dimension to split along is :math:`rank(input) + dim`. Default is -1. - name (str, optional): The default value is None. Normally there is no need for user to set this property. + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Returns: @@ -5184,7 +5184,7 @@ def split(input, num_or_sections, dim=-1, name=None): # out0.shape [3, 2, 5] # out1.shape [3, 3, 5] # out2.shape [3, 4, 5] - + # dim is negative, the real dim is (rank(input) + axis) which real # value is 1. out0, out1, out2 = fluid.layers.split(input, num_or_sections=3, dim=-2) @@ -5337,9 +5337,9 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): .. code-block:: python :name: code-example1 - + import paddle - + X = paddle.randn(shape=[3, 5], dtype='float64') out = paddle.fluid.layers.l2_normalize(X, axis=-1) print(out) @@ -6465,11 +6465,11 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): Examples: .. code-block:: python - + import paddle import paddle.fluid as fluid paddle.enable_static() - + # example 1: # attr shape is a list which doesn't contain Tensors. data_1 = fluid.data( @@ -6841,10 +6841,10 @@ def lod_reset(x, y=None, target_lod=None): out.dims = [6, 1] Args: - x (Variable): Input variable which could be a Tensor or LoDTensor. + x (Variable): Input variable which could be a Tensor or LoDTensor. The data type should be int32, int64, float32 or float64. - y (Variable, optional): If provided, output's LoD would be derived from :attr:`y`. - If y's lod level>0, the data type can be any type. + y (Variable, optional): If provided, output's LoD would be derived from :attr:`y`. 
+ If y's lod level>0, the data type can be any type. If y's lod level=0, the data type should be int32. target_lod (list|tuple, optional): One level LoD which should be considered as target LoD when :attr:`y` not provided. @@ -6907,9 +6907,9 @@ def lod_append(x, level): x.dims = [6, 1] Args: - x (Variable): Input variable which could be a tensor or LoDTensor. + x (Variable): Input variable which could be a tensor or LoDTensor. The data type should be int32, int64, float32 or float64. - level (list|tuple|Variable, optional): The LoD level to be appended into LoD of x. + level (list|tuple|Variable, optional): The LoD level to be appended into LoD of x. If level is variable and its lod level>0, the data type can be any type. If level is variable and its lod level=0, the data type should be int32. Returns: @@ -7581,19 +7581,19 @@ def image_resize(input, future and only use :attr:`out_shape` instead. Supporting resample methods: - 'LINEAR' : Linear interpolation + 'LINEAR' : Linear interpolation 'BILINEAR' : Bilinear interpolation 'TRILINEAR' : Trilinear interpolation 'NEAREST' : Nearest neighbor interpolation - + 'BICUBIC' : Bicubic interpolation - - Linear interpolation is the method of using a line connecting two known quantities + + Linear interpolation is the method of using a line connecting two known quantities to determine the value of an unknown quantity between the two known quantities. - + Nearest neighbor interpolation is to perform nearest neighbor interpolation in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. @@ -7608,7 +7608,7 @@ def image_resize(input, interpolating functions of three variables (e.g. D-direction, H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. - + Bicubic interpolation is an extension of cubic interpolation for interpolating data points on a two-dimensional regular grid. The interpolated surface is smoother than corresponding surfaces obtained by bilinear interpolation or @@ -7707,7 +7707,7 @@ def image_resize(input, output: (N,C,D_out,H_out,W_out) where: D_out = D_{in} * scale_{factor} - + Trilinear interpolation: if: align_corners = False , align_mode = 0 @@ -7722,20 +7722,20 @@ def image_resize(input, D_out = D_{in} * scale_{factor} H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - + For details of linear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Linear_interpolation. - + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. - + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation. - + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation. - + For details of bicubic interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bicubic_interpolation @@ -7743,8 +7743,8 @@ def image_resize(input, input (Variable): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. out_shape (list|tuple|Variable|None): Output shape of image resize - layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) - when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
+ layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) + when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. scale(float|Variable|None): The multiplier for the input height or width. At @@ -7772,8 +7772,8 @@ def image_resize(input, input and output tensors are aligned, preserving the values at the corner pixels. Default: True - align_mode(int) : An optional for linear/bilinear/trilinear interpolation. Refer to the fomula in the - the example code above, it can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , + align_mode(int) : An optional for linear/bilinear/trilinear interpolation. Refer to the fomula in the + the example code above, it can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , can be \'1\' for src_idx = scale*dst_index. data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, @@ -8076,10 +8076,10 @@ def resize_linear(input, output shape which specified by actual_shape, out_shape and scale in priority order. - **Warning:** the parameter :attr:`actual_shape` will be deprecated in + **Warning:** the parameter :attr:`actual_shape` will be deprecated in the future and only use :attr:`out_shape` instead. - Align_corners and align_mode are optional parameters,the calculation + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. Example: @@ -8087,23 +8087,23 @@ def resize_linear(input, .. code-block:: text For scale: - + if align_corners = True && out_size > 1 : scale_factor = (in_size-1.0)/(out_size-1.0) - + else: - + scale_factor = float(in_size/out_size) Linear interpolation: if: align_corners = False , align_mode = 0 - + input : (N,C,W_in) output: (N,C,W_out) where: - + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: @@ -8116,12 +8116,12 @@ def resize_linear(input, input(Variable): 3-D Tensor(NCW), its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. out_shape(list|tuple|Variable|None): Output shape of resize linear - layer, the shape is (out_w,). Default: None. If a list, each - element can be an integer or a Tensor Variable with shape: [1]. If a + layer, the shape is (out_w,). Default: None. If a list, each + element can be an integer or a Tensor Variable with shape: [1]. If a Tensor Variable, its dimension size should be 1. scale(float|Variable|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale`. + least one of :attr:`out_shape` or :attr:`scale` must be set. + And :attr:`out_shape` has a higher priority than :attr:`scale`. Default: None. actual_shape(Variable): An optional input to specify output shape dynamically. If provided, image resize @@ -8129,28 +8129,28 @@ def resize_linear(input, :attr:`out_shape` and :attr:`scale` specifying shape. That is to say actual_shape has the highest priority. It is recommended to use - :attr:`out_shape` if you want to specify output - shape dynamically, because :attr:`actual_shape` - will be deprecated. 
When using actual_shape to - specify output shape, one of :attr:`out_shape` - and :attr:`scale` should also be set, otherwise + :attr:`out_shape` if you want to specify output + shape dynamically, because :attr:`actual_shape` + will be deprecated. When using actual_shape to + specify output shape, one of :attr:`out_shape` + and :attr:`scale` should also be set, otherwise errors would be occurred in graph constructing stage. Default: None align_corners(bool): ${align_corners_comment} align_mode(bool): ${align_mode_comment} - data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCW"`, `"NWC"`. The default is `"NCW"`. When it is `"NCW"`, the data is stored in the order of: `[batch_size, input_channels, input_width]`. - name(str, optional): The default value is None. Normally there is no need for user to set this property. + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: Variable: 3-D tensor(NCW or NWC). - + Examples: .. code-block:: python - + #declarative mode import paddle.fluid as fluid import numpy as np @@ -8161,14 +8161,14 @@ def resize_linear(input, place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - + input_data = np.random.rand(1,3,100).astype("float32") output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - + print(output_data[0].shape) # (1, 3, 50) @@ -8764,7 +8764,7 @@ def gather(input, index, overwrite=True): Returns: output (Tensor): The output is a tensor with the same rank as input. - + Examples: .. code-block:: python @@ -9799,7 +9799,7 @@ def pad2d(input, name (str, optional) : The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - Returns: + Returns: Tensor, a 4-D Tensor padded according to paddings and mode and data type is same as input. Examples: @@ -10225,7 +10225,7 @@ def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): element: All elements do not share alpha. Each element has its own alpha. Parameters: - + x (Tensor): The input Tensor or LoDTensor with data type float32. mode (str): The mode for weight sharing. @@ -10236,7 +10236,7 @@ def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): name (str, optional): Name for the operation (optional, default is None). \ For more information, please refer to :ref:`api_guide_Name`. - + data_format(str, optional): Data format that specifies the layout of input. It may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default: "NCHW". @@ -10582,10 +10582,10 @@ def stack(x, axis=0, name=None): Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`. Supported data types: float32, float64, int32, int64. axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``, - where ``R`` is the number of dimensions of the first input tensor ``x[0]``. + where ``R`` is the number of dimensions of the first input tensor ``x[0]``. If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0. name (str, optional): Please refer to :ref:`api_guide_Name`, Default None. 
- + Returns: Variable: The stacked Tensor, has same data type with input Tensors. Output dim is :math:`rank(x[0])+1`. @@ -10920,7 +10920,7 @@ def expand_as(x, target_tensor, name=None): :alias_main: paddle.expand_as :alias: paddle.expand_as,paddle.tensor.expand_as,paddle.tensor.manipulation.expand_as :old_api: paddle.fluid.layers.expand_as - + expand_as operator tiles to the input by given expand tensor. You should set expand tensor for each dimension by providing tensor 'target_tensor'. The rank of X should be in [1, 6]. Please note that size of 'target_tensor' must be the same @@ -11162,21 +11162,21 @@ def gaussian_random(shape, # result_3 is: # [[-0.12310527, 0.8187662, 1.923219 ] # [ 0.70721835, 0.5210541, -0.03214082]] - + .. code-block:: python - + # declarative mode # required: skiptest import numpy as np from paddle import fluid - + x = fluid.layers.gaussian_random((2, 3), std=2., seed=10) - + place = fluid.CPUPlace() exe = fluid.Executor(place) start = fluid.default_startup_program() main = fluid.default_main_program() - + exe.run(start) x_np, = exe.run(main, feed={}, fetch_list=[x]) @@ -11190,11 +11190,11 @@ def gaussian_random(shape, import numpy as np from paddle import fluid import paddle.fluid.dygraph as dg - + place = fluid.CPUPlace() with dg.guard(place) as g: x = fluid.layers.gaussian_random((2, 4), mean=2., dtype="float32", seed=10) - x_np = x.numpy() + x_np = x.numpy() x_np # array([[2.3060477 , 2.676496 , 3.9911983 , 0.9990833 ], # [2.8675377 , 2.2279181 , 0.79029655, 2.8447366 ]], dtype=float32) @@ -11453,7 +11453,7 @@ def slice(input, axes, starts, ends): ends = [-1, 1000] # -1 denotes the reverse 0th position of dimension 0. Then: result = [ [2, 3, 4], ] # result = data[0:1, 1:4] - + Args: input (Tensor): A ``Tensor`` . The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``. axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to . @@ -11987,7 +11987,7 @@ def size(input): Raises: TypeError: ``input`` must be a Tensor and the data type of ``input`` must be one of bool, float16, float32, float64, int32, int64. - + Examples: .. code-block:: python @@ -12050,7 +12050,7 @@ def _elementwise_op(helper): def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): """ - + Putting scale and bias to the input Tensor as following: ``bias_after_scale`` is True: @@ -12075,9 +12075,9 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): Tensor: Output tensor of scale operator, with shape and data type same as input. Examples: - + .. code-block:: python - + # scale as a float32 number import paddle @@ -12927,7 +12927,7 @@ def logical_or(x, y, out=None, name=None): .. note:: ``paddle.logical_or`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. - + Args: x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. @@ -13366,7 +13366,7 @@ def space_to_depth(x, blocksize, name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. 
- Returns: + Returns: Tensor, The output, which should be 4 dims Tensor or LodTensor, with the shape \ [batch, channel * blocksize * blocksize, height/blocksize, width/blocksize] @@ -15087,7 +15087,7 @@ def deformable_conv(input, import paddle.fluid as fluid import paddle paddle.enable_static() - + C_in, H_in, W_in = 3, 32, 32 filter_size, deformable_groups = 3, 1 data = fluid.data(name='data', shape=[None, C_in, H_in, W_in], dtype='float32') @@ -15458,7 +15458,7 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): For each value `v` in `input`, we reset it to a new value according to the following formula: :: - + v = v - shard_id * shard_size if shard_id * shard_size <= v < (shard_id+1) * shard_size else ignore_value That is, the value `v` is set to the new offset within the range represented by the shard `shard_id` @@ -15848,7 +15848,7 @@ def unbind(input, axis=0): Removes a tensor dimension, then split the input tensor into multiple sub-Tensors. Args: input (Variable): The input variable which is an N-D Tensor, data type being float32, float64, int32 or int64. - + axis (int32|int64, optional): A scalar with type ``int32|int64`` shape [1]. The dimension along which to unbind. If :math:`axis < 0`, the dimension to unbind along is :math:`rank(input) + axis`. Default is 0. Returns: diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 51b72267329..9d17b25d95f 100755 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -211,7 +211,7 @@ Examples: import paddle.nn.functional as F x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.tanhshrink(x) + out = F.tanhshrink(x) print(out) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] @@ -510,7 +510,7 @@ Examples: import paddle.nn.functional as F x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.softplus(x) + out = F.softplus(x) print(out) # [0.513015, 0.598139, 0.744397, 0.854355] """ @@ -524,7 +524,7 @@ Examples: import paddle.nn.functional as F x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.softsign(x) + out = F.softsign(x) print(out) # [-0.285714, -0.166667, 0.0909091, 0.230769] @@ -566,13 +566,13 @@ softshrink.__doc__ = r""" Args: x: Input of Softshrink operator, an N-D Tensor, with data type float32, float64 or float16. alpha (float): non-negative offset - + Returns: Output of Softshrink operator with the same type of input. Examples: .. code-block:: python - + import paddle.fluid as fluid data = fluid.data(name="input", shape=[None, 784]) result = fluid.layers.softshrink(x=data, alpha=0.3) @@ -626,17 +626,17 @@ cumsum.__doc__ = """ The cumulative sum of the elements along a given axis. By default, the first element of the result is the same of the first element of the input. If exlusive is true, the first element of the result is 0. Args: - x (Variable): Input of cumsum operator, the Tensor/LoDTensor needed to be cumsumed. + x (Variable): Input of cumsum operator, the Tensor/LoDTensor needed to be cumsumed. axis (int, optional): The dimension to accumulate along. -1 means the last dimension. Default is -1. exclusive (bool, optional): Whether to perform exclusive cumsum. Default is False. reverse (bool, optional): If true, the cumsum is performed in the reversed direction. Default is False. Returns: - Variable(Tensor/LoDTensor): The result of cumsum operator, output of cumsum operator. + Variable(Tensor/LoDTensor): The result of cumsum operator, output of cumsum operator. Examples: .. 
code-block:: python - + import paddle.fluid as fluid data = fluid.layers.data(name="input", shape=[32, 784]) result = fluid.layers.cumsum(data, axis=0) @@ -674,7 +674,7 @@ Equation: Args: x(Variable): The input of Thresholded ReLU op, Tensor or LoDTensor, dtype: float32 or float64. - + threshold(float, optional): The threshold value. Note that if the arg `threshold` is not set, the threshold in the equation is 1.0. Returns: @@ -682,26 +682,26 @@ Returns: Variable: The output of Thresholded ReLU op, Tensor or LoDTensor, dtype: float32 or float64, the same as the input, shape: the same as the input. Examples: - + .. code-block:: python - + # declarative mode import numpy as np from paddle import fluid - + x = fluid.data(name="x", shape=(-1, 3), dtype="float32") y = fluid.layers.thresholded_relu(x, threshold=0.1) - + place = fluid.CPUPlace() exe = fluid.Executor(place) start = fluid.default_startup_program() main = fluid.default_main_program() - + data = np.random.randn(2, 3).astype("float32") exe.run(start) - + y_np, = exe.run(main, feed={"x": data}, fetch_list=[y]) - + data # array([[ 0.21134382, -1.1805999 , 0.32876605], # [-1.2210793 , -0.7365624 , 1.0013918 ]], dtype=float32) @@ -710,12 +710,12 @@ Examples: # [-0. , -0. , 1.0013918 ]], dtype=float32) .. code-block:: python - + # imperative mode import numpy as np from paddle import fluid import paddle.fluid.dygraph as dg - + data = np.random.randn(2, 3).astype("float32") place = fluid.CPUPlace() with dg.guard(place) as g: @@ -765,26 +765,26 @@ Returns: Variable: The output of GeLU op, Tensor or LoDTensor, dtype: float32 or float64, the same as the input, shape: the same as the input. Examples: - + .. code-block:: python - + # declarative mode import numpy as np from paddle import fluid - + x = fluid.data(name="x", shape=(-1, 3), dtype="float32") y = fluid.layers.gelu(x) - + place = fluid.CPUPlace() exe = fluid.Executor(place) start = fluid.default_startup_program() main = fluid.default_main_program() - + data = np.random.randn(2, 3).astype("float32") exe.run(start) - + y_np, = exe.run(main, feed={"x": data}, fetch_list=[y]) - + data # array([[ 0.87165993, -1.0541513 , -0.37214822], # [ 0.15647964, 0.32496083, 0.33045998]], dtype=float32) @@ -793,12 +793,12 @@ Examples: # [ 0.08796856, 0.20387867, 0.2080159 ]], dtype=float32) .. code-block:: python - + # imperative mode import numpy as np from paddle import fluid import paddle.fluid.dygraph as dg - + data = np.random.randn(2, 3).astype("float32") place = fluid.CPUPlace() with dg.guard(place) as g: @@ -845,9 +845,9 @@ Returns: Tensor: The output of Erf op, dtype: float32 or float64, the same as the input, shape: the same as the input. Examples: - + .. code-block:: python - + import paddle x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.erf(x) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 6b51721aafc..1dff069033a 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -84,8 +84,8 @@ class RNNCell(object): Parameters: inputs: A (possibly nested structure of) tensor variable[s]. states: A (possibly nested structure of) tensor variable[s]. - **kwargs: Additional keyword arguments, provided by the caller. - + **kwargs: Additional keyword arguments, provided by the caller. + Returns: tuple: outputs and new_states pair. outputs and new_states both \ can be nested structure of tensor variables. new_states must \ @@ -123,7 +123,7 @@ class RNNCell(object): init_value: A float value used to initialize states. 
batch_dim_idx: An integer indicating which dimension of the tensor in inputs represents batch size. The default value is 0. - + Returns: Variable: tensor variable[s] packed in the same structure provided \ by shape, representing the initialized states. @@ -209,7 +209,7 @@ class RNNCell(object): Used to initialize states. A (possibly nested structure of) shape[s], where a shape is represented as a list/tuple of integers (-1 for batch size would be automatically - inserted into a shape if shape is not started with it). + inserted into a shape if shape is not started with it). Not necessary to be implemented if states are not initialized by `get_initial_states` or the `shape` argument is provided when using `get_initial_states`. @@ -237,7 +237,7 @@ class GRUCell(RNNCell): r""" :api_attr: Static Graph - Gated Recurrent Unit cell. It is a wrapper for + Gated Recurrent Unit cell. It is a wrapper for `fluid.contrib.layers.rnn_impl.BasicGRUUnit` to make it adapt to RNNCell. The formula used is as follow: @@ -311,7 +311,7 @@ class GRUCell(RNNCell): tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \ `new_states` is the same tensor shaped `[batch_size, hidden_size]`, \ corresponding to :math:`h_t` in the formula. The data type of the \ - tensor is same as that of `states`. + tensor is same as that of `states`. """ check_variable_and_dtype(inputs, 'inputs', ['float32', 'float64'], @@ -335,7 +335,7 @@ class LSTMCell(RNNCell): r""" :api_attr: Static Graph - Long-Short Term Memory cell. It is a wrapper for + Long-Short Term Memory cell. It is a wrapper for `fluid.contrib.layers.rnn_impl.BasicLSTMUnit` to make it adapt to RNNCell. The formula used is as follow: @@ -351,7 +351,7 @@ class LSTMCell(RNNCell): o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) h_{t} & = o_{t} act_c (c_{t}) - + For more details, please refer to `RECURRENT NEURAL NETWORK REGULARIZATION `_ Examples: @@ -450,42 +450,42 @@ def rnn(cell, **kwargs): """ rnn creates a recurrent neural network specified by RNNCell `cell`, - which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`) + which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`) repeatedly until reaches to the maximum length of `inputs`. Arguments: cell(RNNCellBase): An instance of `RNNCellBase`. - inputs(Tensor): the input sequences. - If time_major is True, the shape is + inputs(Tensor): the input sequences. + If time_major is True, the shape is `[time_steps, batch_size, input_size]` else the shape is `[batch_size, time_steps, input_size]`. - initial_states(Tensor|tuple|list, optional): the initial state of the - rnn cell. Tensor or a possibly nested structure of tensors. If not + initial_states(Tensor|tuple|list, optional): the initial state of the + rnn cell. Tensor or a possibly nested structure of tensors. If not provided, `cell.get_initial_states` would be called to produce the initial state. Defaults to None. - sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. - If `sequence_length` is not None, the inputs are treated as - padded sequences. In each input sequence, elements whose time step + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings. 
time_major (bool): Whether the first dimension of the input means the time steps. Defaults to False. is_reverse (bool, optional): Indicate whether to calculate in the reverse order of input sequences. Defaults to False. - **kwargs: Additional keyword arguments to pass to `forward` of the cell. + **kwargs: Additional keyword arguments to pass to `forward` of the cell. Returns: (outputs, final_states) - outputs (Tensor|list|tuple): the output sequence. Tensor or nested + outputs (Tensor|list|tuple): the output sequence. Tensor or nested structure of Tensors. - If `time_major` is True, the shape of each tensor in outpus is - `[time_steps, batch_size, hidden_size]`, else + If `time_major` is True, the shape of each tensor in outpus is + `[time_steps, batch_size, hidden_size]`, else `[batch_size, time_steps, hidden_size]`. final_states (Tensor|list|tuple): final states. A (possibly nested structure of) - tensor[s], representing the final state for RNN. It has the same + tensor[s], representing the final state for RNN. It has the same structure of intial state. Each tensor in final states has the same shape and dtype as the corresponding tensor in initial states. - + Examples: @@ -498,7 +498,7 @@ def rnn(cell, inputs = paddle.rand((4, 23, 16)) prev_h = paddle.randn((4, 32)) - outputs, final_states = paddle.fluid.layers.rnn(cell, inputs, prev_h) + outputs, final_states = paddle.fluid.layers.rnn(cell, inputs, prev_h) """ if _non_static_mode(): @@ -671,47 +671,47 @@ def birnn(cell_fw, time_major=False, **kwargs): """ - birnn creates a bidirectional recurrent neural network specified by - RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()` - (for dygraph mode :code:`cell.forward`) repeatedly until reaches to + birnn creates a bidirectional recurrent neural network specified by + RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()` + (for dygraph mode :code:`cell.forward`) repeatedly until reaches to the maximum length of `inputs` and then concat the outputs for both RNNs along the last axis. Arguments: cell_fw(RNNCellBase): An instance of `RNNCellBase`. cell_bw(RNNCellBase): An instance of `RNNCellBase`. - inputs(Tensor): the input sequences. - If time_major is True, the shape is + inputs(Tensor): the input sequences. + If time_major is True, the shape is `[time_steps, batch_size, input_size]` else the shape is `[batch_size, time_steps, input_size]`. - initial_states(tuple, optional): A tuple of initial states of + initial_states(tuple, optional): A tuple of initial states of `cell_fw` and `cell_bw`. - If not provided, `cell.get_initial_states` would be called to + If not provided, `cell.get_initial_states` would be called to produce initial state for each cell. Defaults to None. - sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. - If `sequence_length` is not None, the inputs are treated as - padded sequences. In each input sequence, elements whose time step + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings. time_major (bool): Whether the first dimension of the input means the time steps. Defaults to False. - **kwargs: Additional keyword arguments to pass to `forward` of each cell. + **kwargs: Additional keyword arguments to pass to `forward` of each cell. 
Returns: (outputs, final_states) - outputs (Tensor): the outputs of the bidirectional RNN. It is the - concatenation of the outputs from the forward RNN and backward - RNN along the last axis. + outputs (Tensor): the outputs of the bidirectional RNN. It is the + concatenation of the outputs from the forward RNN and backward + RNN along the last axis. If time major is True, the shape is `[time_steps, batch_size, size]`, else the shape is `[batch_size, time_steps, size]`, where size is `cell_fw.hidden_size + cell_bw.hidden_size`. - final_states (tuple): A tuple of the final states of the forward - cell and backward cell. + final_states (tuple): A tuple of the final states of the forward + cell and backward cell. Examples: .. code-block:: python - + import paddle paddle.disable_static() @@ -724,7 +724,7 @@ def birnn(cell_fw, initial_states = ((hf, cf), (hb, cb)) outputs, final_states = paddle.fluid.layers.birnn( cell_fw, cell_bw, inputs, initial_states) - + """ if initial_states is None: states_fw = cell_fw.get_initial_states( @@ -761,7 +761,7 @@ class Decoder(object): Decoder is the base class for any decoder instance used in `dynamic_decode`. It provides interface for output generation for one time step, which can be - used to generate sequences. + used to generate sequences. The key abstraction provided by Decoder is: @@ -771,7 +771,7 @@ class Decoder(object): It would be called once before the decoding iterations. 2. :code:`(output, next_state, next_input, finished) = step(time, input, state)` , - which transforms the input and state to the output and new state, generates + which transforms the input and state to the output and new state, generates input for the next decoding step, and emits the flag indicating finished status. It is the main part for each decoding iteration. @@ -805,7 +805,7 @@ class Decoder(object): def step(self, time, inputs, states, **kwargs): r""" - Called per step of decoding. + Called per step of decoding. Parameters: time(Variable): A Tensor with shape :math:`[1]` provided by the caller. @@ -813,7 +813,7 @@ class Decoder(object): inputs(Variable): A (possibly nested structure of) tensor variable[s]. states(Variable): A (possibly nested structure of) tensor variable[s]. **kwargs: Additional keyword arguments, provided by the caller. - + Returns: tuple: A tuple( :code:(outputs, next_states, next_inputs, finished)` ). \ `next_inputs` and `next_states` both are a (possibly nested \ @@ -831,8 +831,8 @@ class Decoder(object): Parameters: outputs(Variable): A (possibly nested structure of) tensor variable[s]. The structure and data type is same as `output_dtype`. - The tensor stacks all time steps' output thus has shape - :math:`[time\_step, batch\_size, ...]` , which is done by the caller. + The tensor stacks all time steps' output thus has shape + :math:`[time\_step, batch\_size, ...]` , which is done by the caller. final_states(Variable): A (possibly nested structure of) tensor variable[s]. It is the `next_states` returned by `decoder.step` at last decoding step, thus has the same structure, shape and data type with states at any time @@ -887,12 +887,12 @@ class BeamSearchDecoder(Decoder): Returns: BeamSearchDecoder: An instance of decoder which can be used in \ - `paddle.nn.dynamic_decode` to implement decoding. + `paddle.nn.dynamic_decode` to implement decoding. Examples: .. 
code-block:: python - + import numpy as np import paddle from paddle.nn import BeamSearchDecoder, dynamic_decode @@ -924,7 +924,7 @@ class BeamSearchDecoder(Decoder): start_token(int): The start token id. end_token(int): The end token id. beam_size(int): The beam width used in beam search. - embedding_fn(optional): A callable to apply to selected candidate ids. + embedding_fn(optional): A callable to apply to selected candidate ids. Mostly it is an embedding layer to transform ids to embeddings, and the returned value acts as the `input` argument for `cell.call`. If not provided, the id to embedding transformation must be built into @@ -943,7 +943,7 @@ class BeamSearchDecoder(Decoder): def tile_beam_merge_with_batch(x, beam_size): r""" Tile the batch dimension of a tensor. Specifically, this function takes - a tensor t shaped `[batch_size, s0, s1, ...]` composed of minibatch + a tensor t shaped `[batch_size, s0, s1, ...]` composed of minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape `[batch_size * beam_size, s0, s1, ...]` composed of minibatch entries `t[0], t[0], ..., t[1], t[1], ...` where each minibatch entry is repeated @@ -979,7 +979,7 @@ class BeamSearchDecoder(Decoder): def _split_batch_beams(self, x): r""" Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new - tensor with shape `[batch_size, beam_size, ...]`. + tensor with shape `[batch_size, beam_size, ...]`. Parameters: x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The @@ -987,7 +987,7 @@ class BeamSearchDecoder(Decoder): Returns: Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \ - data type is same as `x`. + data type is same as `x`. """ check_type(x, 'x', (Variable), 'BeamSearchDecoder._split_batch_beams') # TODO: avoid fake shape in compile-time like tile_beam_merge_with_batch @@ -996,7 +996,7 @@ class BeamSearchDecoder(Decoder): def _merge_batch_beams(self, x): r""" Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new - tensor with shape `[batch_size * beam_size, ...]`. + tensor with shape `[batch_size * beam_size, ...]`. Parameters: x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The @@ -1004,7 +1004,7 @@ class BeamSearchDecoder(Decoder): Returns: Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \ - data type is same as `x`. + data type is same as `x`. """ check_type(x, 'x', (Variable), 'BeamSearchDecoder._merge_batch_beams') # TODO: avoid fake shape in compile-time like tile_beam_merge_with_batch @@ -1178,14 +1178,14 @@ class BeamSearchDecoder(Decoder): logits(Variable): A tensor with shape `[batch_size, beam_size, vocab_size]`, representing the logits at the current time step. Its data type is float32. next_cell_states(Variable): A (possibly nested structure of) tensor variable[s]. - It has the same structure, shape and data type as the `cell_states` of - `initial_states` returned by `initialize()`. It represents the next state + It has the same structure, shape and data type as the `cell_states` of + `initial_states` returned by `initialize()`. It represents the next state from the cell. beam_state(Variable): A structure of tensor variables. It is same as the `initial_states` returned by `initialize()` for the first decoding step and `beam_search_state` returned by `step()` for the others. - + Returns: tuple: A tuple( :code:`(beam_search_output, beam_search_state)` ). 
\ `beam_search_output` is a namedtuple(including scores, predicted_ids, \ @@ -1259,8 +1259,8 @@ class BeamSearchDecoder(Decoder): It is same as the `initial_states` returned by `initialize()` for the first decoding step and `beam_search_state` returned by `step()` for the others. - **kwargs: Additional keyword arguments, provided by the caller. - + **kwargs: Additional keyword arguments, provided by the caller. + Returns: tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \ `beam_search_state` and `next_inputs` have the same structure, \ @@ -1303,8 +1303,8 @@ class BeamSearchDecoder(Decoder): Parameters: outputs(Variable): A structure(namedtuple) of tensor variables, The structure and data type is same as `output_dtype`. - The tensor stacks all time steps' output thus has shape - `[time_step, batch_size, ...]`, which is done by the caller. + The tensor stacks all time steps' output thus has shape + `[time_step, batch_size, ...]`, which is done by the caller. final_states(Variable): A structure(namedtuple) of tensor variables. It is the `next_states` returned by `decoder.step` at last decoding step, thus has the same structure, shape and data type @@ -1614,7 +1614,7 @@ def dynamic_decode(decoder, Parameters: decoder(Decoder): An instance of `Decoder`. - inits(object, optional): Argument passed to `decoder.initialize`. + inits(object, optional): Argument passed to `decoder.initialize`. Default `None`. max_step_num(int, optional): The maximum number of steps. If not provided, decode until the decoder is fully done, or in other words, the returned @@ -1637,7 +1637,7 @@ def dynamic_decode(decoder, return_length(bool, optional): A flag indicating whether to return an extra Tensor variable in the output tuple, which stores the actual lengths of all decoded sequences. Default `False`. - **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. + **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. Returns: tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \ @@ -1653,12 +1653,12 @@ def dynamic_decode(decoder, is an `int64` tensor with the same shape as `finished` returned \ by :code:`decoder.initialize()` , and it stores the actual lengths of \ all decoded sequences. - + Examples: .. code-block:: python - + import numpy as np import paddle from paddle.nn import BeamSearchDecoder, dynamic_decode @@ -1767,7 +1767,7 @@ class TrainingHelper(DecodeHelper): Examples: .. code-block:: python - + import paddle.fluid as fluid import paddle.fluid.layers as layers trg_emb = fluid.data(name="trg_emb", @@ -1790,7 +1790,7 @@ class TrainingHelper(DecodeHelper): Constructor of TrainingHelper. Parameters: - inputs(Variable): A (possibly nested structure of) tensor variable[s]. + inputs(Variable): A (possibly nested structure of) tensor variable[s]. The shape of tensor should be `[batch_size, sequence_length, ...]` for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. It represents the inputs to be sliced @@ -1852,7 +1852,7 @@ class TrainingHelper(DecodeHelper): outputs(Variable): A tensor variable. Usually it's data type is float32 or float64, and it's shape is `[batch_size, vocabulary_size]`, representing the predicted logits of current step. It is same as - `outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`. + `outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`. states(Variable): A (possibly nested structure of) tensor variable[s]. 
It is same as `new_states` returned by `BasicDecoder.cell.call()`. @@ -1918,13 +1918,13 @@ class GreedyEmbeddingHelper(DecodeHelper): Examples: .. code-block:: python - + import paddle.fluid as fluid import paddle.fluid.layers as layers trg_emb = fluid.data(name="trg_emb", shape=[None, None, 128], dtype="float32") - + trg_embeder = lambda x: fluid.embedding( x, size=[10000, 128], param_attr=fluid.ParamAttr(name="trg_embedding")) output_layer = lambda x: layers.fc(x, @@ -1945,7 +1945,7 @@ class GreedyEmbeddingHelper(DecodeHelper): Constructor of GreedyEmbeddingHelper. Parameters: - embedding_fn(callable): A functor to apply on the argmax results. + embedding_fn(callable): A functor to apply on the argmax results. Mostly it is an embedding layer to transform ids to embeddings. **Note that fluid.embedding should be used here rather than fluid.layers.embedding, since shape of ids is [batch_size]. @@ -1970,7 +1970,7 @@ class GreedyEmbeddingHelper(DecodeHelper): r""" GreedyEmbeddingHelper initialization produces inputs for the first decoding step by using `start_tokens` of the constructor, and gives initial - status telling whether each sequence in the batch is finished. + status telling whether each sequence in the batch is finished. It is the partial of the initialization of `BasicDecoder`. Returns: @@ -1999,7 +1999,7 @@ class GreedyEmbeddingHelper(DecodeHelper): outputs(Variable): A tensor variable. Usually it's data type is float32 or float64, and it's shape is `[batch_size, vocabulary_size]`, representing the predicted logits of current step. It is same as - `outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`. + `outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`. states(Variable): A (possibly nested structure of) tensor variable[s]. It is same as `new_states` returned by `BasicDecoder.cell.call()`. @@ -2051,13 +2051,13 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper): Examples: .. code-block:: python - + import paddle.fluid as fluid import paddle.fluid.layers as layers trg_emb = fluid.data(name="trg_emb", shape=[None, None, 128], dtype="float32") - + trg_embeder = lambda x: fluid.embedding( x, size=[10000, 128], param_attr=fluid.ParamAttr(name="trg_embedding")) output_layer = lambda x: layers.fc(x, @@ -2083,7 +2083,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper): Constructor of SampleEmbeddingHelper. Parameters: - embedding_fn(callable): A functor to apply on the argmax results. + embedding_fn(callable): A functor to apply on the argmax results. Mostly it is an embedding layer to transform ids to embeddings. **Note that fluid.embedding should be used here rather than fluid.layers.embedding, since shape of ids is [batch_size]. @@ -2123,7 +2123,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper): outputs(Variable): A tensor variable. Usually it's data type is float32 or float64, and it's shape is `[batch_size, vocabulary_size]`, representing the predicted logits of current step. It is same as - `outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`. + `outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`. states(Variable): A (possibly nested structure of) tensor variable[s]. It is same as `new_states` returned by `BasicDecoder.cell.call()`. @@ -2162,13 +2162,13 @@ class BasicDecoder(Decoder): Examples: .. 
code-block:: python - + import paddle.fluid as fluid import paddle.fluid.layers as layers trg_emb = fluid.data(name="trg_emb", shape=[None, None, 128], dtype="float32") - + trg_embeder = lambda x: fluid.embedding( x, size=[10000, 128], param_attr=fluid.ParamAttr(name="trg_embedding")) output_layer = lambda x: layers.fc(x, @@ -2253,8 +2253,8 @@ class BasicDecoder(Decoder): for the first decoding step and `next_states` returned by `step()` for the others. **kwargs: Additional keyword arguments, provided by the caller - `dynamic_decode`. - + `dynamic_decode`. + Returns: tuple: A tuple( :code:`(outputs, next_states, next_inputs, finished)` ). \ `outputs` is a namedtuple(including cell_outputs, sample_ids, \ @@ -2356,7 +2356,7 @@ def dynamic_lstm(input, - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ W_{fc}, W_{oc}`}. - The shape is [1, 7*hidden_size]. - + use_peepholes (bool, optional): Whether to use peephole connection or not. Default: True. is_reverse (bool, optional): Whether to calculate reverse LSTM. Default: False. gate_activation (str, optional): The activation for input gate, forget gate and output gate. Default: "sigmoid". @@ -2375,12 +2375,12 @@ def dynamic_lstm(input, Examples: .. code-block:: python - + import paddle.fluid as fluid emb_dim = 256 vocab_size = 10000 hidden_dim = 512 - + data = fluid.data(name='x', shape=[None], dtype='int64', lod_level=1) emb = fluid.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True) @@ -2547,7 +2547,7 @@ def lstm(input, Examples: .. code-block:: python - + import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers @@ -2910,7 +2910,7 @@ def dynamic_gru(input, This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` , **Note** thus a fully-connect layer whose size is 3 times of ``size`` should be used before this operator, and the output should be used as ``input`` here. - :math:`h_{t-1}` is the hidden state from previous time step. + :math:`h_{t-1}` is the hidden state from previous time step. :math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for update gate, reset gate, candidate hidden and hidden output separately. :math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for @@ -3070,7 +3070,7 @@ def gru_unit(input, This operator does not include the calculations :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` , **Note** thus a fully-connect layer whose size is 3 times of GRU hidden size should be used before this operator, and the output should be used as ``input`` here. - :math:`h_{t-1}` is the hidden state from previous time step. + :math:`h_{t-1}` is the hidden state from previous time step. :math:`u_t` , :math:`r_t` , :math:`\\tilde{h_t}` and :math:`h_t` stand for update gate, reset gate, candidate hidden and hidden output separately. :math:`W_{uh}, b_u` , :math:`W_{rh}, b_r` and :math:`W_{ch}, b_c` stand for @@ -3250,8 +3250,8 @@ def beam_search(pre_ids, Default 0, which shouldn't be changed currently. is_accumulated(bool): Whether the input ``score`` is accumulated scores. Default True. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. 
return_parent_idx(bool, optional): Whether to return an extra Tensor variable in output, which stores the selected ids' parent index in @@ -3375,8 +3375,8 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): counterpart in ``ids`` , and has a float32 data type. beam_size(int): The beam width used in beam search. end_id(int): The id of end token. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: @@ -3440,7 +3440,7 @@ def lstm_unit(x_t, We add forget_bias to the biases of the forget gate in order to reduce the scale of forgetting. The formula is as follows: - + .. math:: i_{t} & = \sigma(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) @@ -3478,8 +3478,8 @@ def lstm_unit(x_t, bias_attr (ParamAttr, optional): To specify the bias parameter property. Default: None, which means the default bias parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py index c0ebe1adb61..58c65b69d28 100644 --- a/python/paddle/fluid/layers/sequence_lod.py +++ b/python/paddle/fluid/layers/sequence_lod.py @@ -55,7 +55,7 @@ def sequence_conv(input, name=None): r""" - Note: + Note: Only receives LoDTensor as input. If your input is Tensor, please use conv2d Op.(fluid.layers.** :ref:`api_fluid_layers_conv2d` ). This operator receives input sequences with variable length and other convolutional @@ -63,7 +63,7 @@ def sequence_conv(input, It fills all-zero padding data on both sides of the sequence by default to ensure that the output is the same length as the input. You can customize the padding behavior by configuring the parameter :attr:`padding\_start` . - + **Warning:** the parameter :attr:`padding` take no effect and will be deprecated in the future. .. code-block:: text @@ -181,7 +181,7 @@ def sequence_softmax(input, use_cudnn=False, name=None): r""" Note: - The input type of the OP must be LoDTensor. For Tensor, use:** :ref:`api_fluid_layers_softmax` + The input type of the OP must be LoDTensor. For Tensor, use:** :ref:`api_fluid_layers_softmax` A LoD-tensor can be regarded as several sequences, and this op apply softmax algo on each sequence. The shape of input Tensor can be :math:`[N, 1]` or :math:`[N]`, where :math:`N` @@ -193,7 +193,7 @@ def sequence_softmax(input, use_cudnn=False, name=None): Out(X[lod[i]:lod[i+1]], :) = \\frac{\exp(X[lod[i]:lod[i+1], :])}{\sum(\exp(X[lod[i]:lod[i+1], :]))} - For example, for a LoD-Tensor with 6 sequences ([3, 2, 4, 1, 2, 3] - sequence length list in order), + For example, for a LoD-Tensor with 6 sequences ([3, 2, 4, 1, 2, 3] - sequence length list in order), the lod in the runtime is [[0, 3, 5, 9, 10, 12, 15]], then softmax will be computed among :math:`X[0:3,:],X[3:5,:],X[5:9,:],X[9:10,:],X[10:12,:],X[12:15,:]`, and :math:`N` turns out to be 15. 
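(Editorial aside, not part of the patch: the per-sequence softmax described in the hunk above can be sketched in plain NumPy. This is a minimal illustration only, assuming the sequence-length list ``[3, 2, 4, 1, 2, 3]`` and the resulting lod ``[[0, 3, 5, 9, 10, 12, 15]]`` quoted in the docstring; it does not use any Paddle API.)

.. code-block:: python

    # Minimal NumPy sketch of the segment-wise softmax described above.
    # The lod [0, 3, 5, 9, 10, 12, 15] comes from sequence lengths
    # [3, 2, 4, 1, 2, 3]; softmax is applied to each slice
    # x[lod[i]:lod[i+1]] independently.
    import numpy as np

    np.random.seed(0)
    x = np.random.rand(15).astype("float32")   # N = 15
    lod = [0, 3, 5, 9, 10, 12, 15]

    out = np.empty_like(x)
    for start, end in zip(lod[:-1], lod[1:]):
        seg = np.exp(x[start:end])
        out[start:end] = seg / seg.sum()

    print(out)  # each segment of `out` sums to 1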
@@ -213,19 +213,19 @@ def sequence_softmax(input, use_cudnn=False, name=None): then: output.data = [0.30724832, 0.41474187, 0.2780098, 0.59868765, 0.40131235, - 0.2544242, 0.09359743, 0.13963096, 0.5123474, + 0.2544242, 0.09359743, 0.13963096, 0.5123474, 1., 0.84553474, 0.15446526, 0.06995796, 0.69777346, 0.23226859] - output.lod = [[0, 3, 5, 9, 10, 12, 15]] - + output.lod = [[0, 3, 5, 9, 10, 12, 15]] + Args: - input (Variable):A LoDTensor with shape of :math:`[N, 1]` or :math:`[N]`, Recommended usage: :math:`[N]`. - Supported data types: float32, float64. - use_cudnn (bool, optional): Use cudnn kernel or not. Effective only when the cudnn version of the paddle + input (Variable):A LoDTensor with shape of :math:`[N, 1]` or :math:`[N]`, Recommended usage: :math:`[N]`. + Supported data types: float32, float64. + use_cudnn (bool, optional): Use cudnn kernel or not. Effective only when the cudnn version of the paddle library is installed and GPU is used for training or reasoning. Default: False. - name (str, optional): The default value is None. Normally there is no need for user to set this property. + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: @@ -234,17 +234,17 @@ def sequence_softmax(input, use_cudnn=False, name=None): Examples: .. code-block:: python - + import paddle paddle.enable_static() - + x = paddle.static.data(name='x', shape=[7, 1], dtype='float32', lod_level=1) - x_sequence_softmax_1 = paddle.static.nn.sequence_softmax(input=x) + x_sequence_softmax_1 = paddle.static.nn.sequence_softmax(input=x) y = paddle.static.data(name='y', shape=[7], dtype='float32', lod_level=1) - x_sequence_softmax_2 = paddle.static.nn.sequence_softmax(input=y) + x_sequence_softmax_2 = paddle.static.nn.sequence_softmax(input=y) """ assert not _non_static_mode(), ( "sequence layer is not supported in dygraph mode yet.") @@ -263,7 +263,7 @@ def sequence_softmax(input, use_cudnn=False, name=None): def sequence_pool(input, pool_type, is_test=False, pad_value=0.0): r""" - Note: + Note: Only receives LoDTensor as input. If your input is Tensor, please use pool2d Op.(fluid.layers.** :ref:`api_fluid_layers_pool2d` ). This operator only supports LoDTensor as input. It will apply specified pooling @@ -380,7 +380,7 @@ def sequence_pool(input, pool_type, is_test=False, pad_value=0.0): def sequence_concat(input, name=None): """ - Note: + Note: Only receives LoDTensor as input. If your input is Tensor, please use concat Op.(fluid.layers.** :ref:`api_fluid_layers_concat` ). This operator only supports LoDTensor as input. It concatenates the multiple LoDTensor from input by the LoD information, @@ -547,7 +547,7 @@ def sequence_last_step(input): import paddle paddle.enable_static() - + x = paddle.static.data(name='x', shape=[None, 10], dtype='float32', lod_level=1) x_last_step = paddle.static.nn.sequence_last_step(input=x) """ @@ -608,7 +608,7 @@ def sequence_slice(input, offset, length, name=None): import paddle paddle.enable_static() - + import numpy as np seqs = paddle.static.data(name='x', shape=[10, 5], dtype='float32', lod_level=1) @@ -656,7 +656,7 @@ def sequence_expand(x, y, ref_level=-1, name=None): of ``y``. If the lod level of ``x`` is 0, then the first dim of ``x`` should \ be equal to the size of ``ref_level`` of ``y``. The rank of **x** is at least 2. \ When rank of ``x`` is greater than 2, then it would be viewed as a 2-D tensor. 
- + Note: Please note that the input ``x`` should be LodTensor or Tensor, \ @@ -716,16 +716,16 @@ def sequence_expand(x, y, ref_level=-1, name=None): refer the last level of lod. name(str, optional): For detailed information, please refer \ to :ref:`api_guide_Name`. Usually name is no need to set and \ - None by default. + None by default. - Returns: + Returns: Tensor, The expanded variable which is a LoDTensor, with dims ``[N, K]``. \ ``N`` depends on the lod info of ``x`` and ``y``. \ The data type is same as input. Examples: .. code-block:: python - + import paddle from paddle import fluid paddle.enable_static() @@ -806,7 +806,7 @@ def sequence_expand_as(x, y, name=None): Consider 4 sequences [a], [b], [c], [d], now we want to expand them to [a][a][a], [b][b][b], [c] and [d]. It's obvious that the lod info of expanded sequences is [0, 3, 6, 7, 8]. - Given a 1-level LodTensor ``x``: + Given a 1-level LodTensor ``x``: x.data = [[a], [b], [c], [d]] x.dims = [4, 1] and input ``y`` @@ -840,7 +840,7 @@ def sequence_expand_as(x, y, name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. - Returns: + Returns: Tensor, The expanded variable which is a LoDTensor with the dims ``[N, K]``. \ ``N`` depends on the lod of ``y``, and the lod level must be 1. \ The data type is same as input. @@ -908,10 +908,10 @@ def sequence_expand_as(x, y, name=None): def sequence_pad(x, pad_value, maxlen=None, name=None): r""" - This layer padding the sequences in a same batch to a common length (according - to ``maxlen``). The padding value is defined by ``pad_value``, and will be - appended to the tail of sequences. The result is a Python tuple ``(Out, Length)``: - the LodTensor ``Out`` is the padded sequences, and LodTensor ``Length`` is + This layer padding the sequences in a same batch to a common length (according + to ``maxlen``). The padding value is defined by ``pad_value``, and will be + appended to the tail of sequences. The result is a Python tuple ``(Out, Length)``: + the LodTensor ``Out`` is the padded sequences, and LodTensor ``Length`` is the length information of input sequences. For removing padding data (unpadding operation), See :ref:`api_fluid_layers_sequence_unpad`. Note: @@ -972,7 +972,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. - Returns: + Returns: tuple, A Python tuple (Out, Length): the 1st is a 0 level LodTensor \ ``Out``, with the shape ``[batch_size, maxlen, K]``; the second is the original \ sequences length infor ``Length``, which should be a 0-level 1D LodTensor. \ @@ -1026,8 +1026,8 @@ def sequence_unpad(x, length, name=None): """ Note: - The input of the OP is Tensor and the output is LoDTensor. For padding operation, See:** :ref:`api_fluid_layers_sequence_pad` - + The input of the OP is Tensor and the output is LoDTensor. For padding operation, See:** :ref:`api_fluid_layers_sequence_pad` + Remove the padding data from the input based on the length information and returns a LoDTensor. .. code-block:: text @@ -1052,9 +1052,9 @@ def sequence_unpad(x, length, name=None): Args: x(Variable): A Tensor which contains padding data, and its shape size can not be less than 2. Supported data types: float32, float64, int32, int64. 
- length(Variable): A 1D Tensor that stores the actual length of each sample, and the Tensor + length(Variable): A 1D Tensor that stores the actual length of each sample, and the Tensor has the same shape with the 0th dimension of the X . Supported data types: int64. - name(str|None): The default value is None. Normally there is no need for user to set this property. + name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: @@ -1072,7 +1072,7 @@ def sequence_unpad(x, length, name=None): x = paddle.static.data(name='x', shape=[10, 5], dtype='float32', lod_level=1) pad_value = paddle.assign(numpy.array([0.0], dtype=numpy.float32)) pad_data, len = paddle.static.nn.sequence_pad(x=x, pad_value=pad_value) - + # unpad data unpad_data = paddle.static.nn.sequence_unpad(x=pad_data, length=len) """ @@ -1101,7 +1101,7 @@ def sequence_unpad(x, length, name=None): def sequence_reshape(input, new_dim): """ - Note: + Note: Only receives LoDTensor as input. If your input is Tensor, please use reshape Op.(fluid.layers.** :ref:`api_fluid_layers_reshape` ). Only supports LoDTensor as input. Given :attr:`new_dim` , @@ -1166,22 +1166,22 @@ def sequence_scatter(input, index, updates, name=None): Note: The index and updates parameters of the OP must be LoDTensor. - + Plus the updates data to the corresponding input according to the index. - - The updated algorithm is as follows: output[instance_index][index [pos]] = input[instance_index][index [pos]] + updates[pos], + + The updated algorithm is as follows: output[instance_index][index [pos]] = input[instance_index][index [pos]] + updates[pos], where instance_idx is the K sample corresponding to pos in batch. - The value of output[i][j] depends on whether j can be found in the i+1th interval of the index. If found, + The value of output[i][j] depends on whether j can be found in the i+1th interval of the index. If found, out[i][j] = input[i][j] + update[m] [n], otherwise, out[i][j] = input[i][j]. - For example, in the following example, the lod information for index is divided into three sequences. Among - them, because the element 0 can be found in the first interval of the index, it is updated with the value of - the corresponding position of the updates, out[0][0] = input[0][0]+updates[0][0] . Because element 1 cannot + For example, in the following example, the lod information for index is divided into three sequences. Among + them, because the element 0 can be found in the first interval of the index, it is updated with the value of + the corresponding position of the updates, out[0][0] = input[0][0]+updates[0][0] . Because element 1 cannot be found in the third interval of index, out[2][1] = input[2][1]. .. code-block:: text - + *Case 1: Given: @@ -1205,9 +1205,9 @@ def sequence_scatter(input, index, updates, name=None): Args: input (Variable): A Tensor with shape of :math:`[N, k_1... k_n]`. Supported data types: float32, float64, int32, int64. index (Variable): A LoDTensor contains index information. Its LoD level must be 1 and its data type can be int32 or int64. - updates (Variable): A LodTensor contains updates information. It has the same LoD level with the index and has the + updates (Variable): A LodTensor contains updates information. It has the same LoD level with the index and has the same data type with the input. Supported data types: float32, float64, int32, int64. - name (str, optional): The default value is None. 
Normally there is no need for user to set this property. For more information, + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: @@ -1216,7 +1216,7 @@ def sequence_scatter(input, index, updates, name=None): Examples: .. code-block:: python - + import paddle paddle.enable_static() @@ -1288,7 +1288,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. - Returns: + Returns: Tensor, The enumerate sequence variable which is a LoDTensor with \ shape ``[d_1, win_size]`` and 1-level lod info. \ The data type is same as ``input``. @@ -1298,7 +1298,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): import paddle paddle.enable_static() - + x = paddle.static.data(name='x', shape=[-1, 1], dtype='int32', lod_level=1) out = paddle.static.nn.sequence_enumerate(input=x, win_size=3, pad_value=0) """ @@ -1358,7 +1358,7 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. - Returns: + Returns: Tensor, The output sequence mask. Tensor with shape [d_1, d_2, ..., d_n, maxlen] and data type of :code:`dtype`. The data type should be bool, float32, float64, int8, int32 or int64. @@ -1384,7 +1384,7 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): @templatedoc() def sequence_reverse(x, name=None): """ - Note: + Note: Only receives LoDTensor as input. If your input is Tensor, please use reverse Op.(fluid.layers.** :ref:`api_fluid_layers_reverse` ). Only supports LoDTensor as input. It will reverse each sequence for input LoDTensor. diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 2910f4187a7..81640339bdc 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -71,7 +71,7 @@ def create_tensor(dtype, name=None, persistable=False): Args: dtype(string|numpy.dtype): the data type of Tensor to be created, the data type is bool, float16, float32, float64, int8, int16, int32 and int64. - name(string, optional): The default value is None. Normally there is no need for + name(string, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` persistable(bool): Set the persistable flag of the create tensor. default value is False. @@ -281,7 +281,7 @@ def concat(input, axis=0, name=None): Args: input(list|tuple|Tensor): ``input`` can be Tensor, Tensor list or Tensor tuple which is with data type - bool, float16, float32, float64, int32, int64. All the Tensors in ``input`` must have the same data type. + bool, float16, float32, float64, int32, int64. All the Tensors in ``input`` must have the same data type. axis(int|Tensor, optional): Specify the axis to operate on the input Tensors. It's a scalar with data type int or a Tensor with shape [1] and data type int32 or int64. The effective range is [-R, R), where R is Rank(x). When ``axis < 0``, it works the same way @@ -731,10 +731,10 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): If ``shape`` is an Tensor, it should be an 1-D Tensor with date type int32 or int64. dtype(np.dtype|str): Data type of the output Tensor which can be float16, float32, float64, uint8, int16, int32, int64. 
- value(bool|float|int|Tensor): The constant value used to initialize + value(bool|float|int|Tensor): The constant value used to initialize the Tensor to be created. If ``value`` is an Tensor, it should be an 1-D Tensor. force_cpu(bool, optional): data should be on CPU if it's true, default value is False. - out(Tensor, optional): Optional output which can be any created + out(Tensor, optional): Optional output which can be any created Tensor that meets the requirements to store the result of operation. if ``out`` is None, a new Tensor will be create to store the result. name(str, optional): The default value is None. Normally there is no need for user to set this @@ -759,7 +759,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): # attr shape is a Tensor. shape = fluid.layers.fill_constant([2], "int32", 2) # shape=[2,2] data4 = fluid.layers.fill_constant(shape=shape, dtype='bool', value=True) # data4=[[True,True],[True,True]] - + # attr value is a Tensor. val = fluid.layers.fill_constant([1], "float32", 2.0) # val=[2.0] data5 = fluid.layers.fill_constant(shape=[2,1], value=val, dtype='float32') #data5=[[2.0],[2.0]] @@ -877,7 +877,7 @@ def fill_constant_batch_size_like(input, according the input. dtype(np.dtype|core.VarDesc.VarType|str): The data type of created Tensor which can be float32, float64, int32, int64. - value(float|int): The constant value used to initialize the Tensor to be created. + value(float|int): The constant value used to initialize the Tensor to be created. input_dim_idx(int): When the value is 0 and the input is LoDTensor, the output_dim_idx dimension of the created Tensor is set to the batch_size value of input. The default value is 0. @@ -1176,7 +1176,7 @@ def ones(shape, dtype, force_cpu=False): import paddle.fluid as fluid data0 = fluid.layers.ones(shape=[2, 4], dtype='float32') # [[1., 1., 1., 1.], [1., 1., 1., 1.]] - + # shape is a Tensor shape = fluid.layers.fill_constant(shape=[2], dtype='int32', value=2) data1 = fluid.layers.ones(shape=shape, dtype='int32') #[[1, 1], [1, 1]] @@ -1207,7 +1207,7 @@ def zeros(shape, dtype, force_cpu=False, name=None): import paddle.fluid as fluid data = fluid.layers.zeros(shape=[3, 2], dtype='float32') # [[0., 0.], [0., 0.], [0., 0.]] - + # shape is a Tensor shape = fluid.layers.fill_constant(shape=[2], dtype='int32', value=2) data1 = fluid.layers.zeros(shape=shape, dtype='int32') #[[0, 0], [0, 0]] @@ -1377,10 +1377,10 @@ def has_inf(x): Returns: Tensor: The tensor storing the output, only a bool value, indicating that whether there is infinity number in x or not. - + Examples: .. code-block:: python - + import paddle data = paddle.randn(shape=[4, 32, 32], dtype="float32") res = paddle.fluid.layers.has_inf(data) @@ -1406,10 +1406,10 @@ def has_nan(x): Returns: Tensor: The tensor variable storing the output, only a bool value, indicating that whether there is NAN in x or not. - + Examples: .. code-block:: python - + import paddle data = paddle.randn(shape=[2,3], dtype="float32") res = paddle.fluid.layers.has_nan(data) @@ -1485,7 +1485,7 @@ def range(start, end, step, dtype, name=None): need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Returns: + Returns: Tensor: A 1-D Tensor with values from the interval [``start``, ``end``) taken with common difference ``step`` beginning from ``start``. Its data type is set by ``dtype``. @@ -1571,13 +1571,13 @@ def linspace(start, stop, num, dtype=None, name=None): or a Tensor of shape [1] with data type int32. 
dtype(np.dtype|str, optional): The data type of output tensor, it could be int32, int64, float32 and float64. Default: if None, the data type is float32. - name(str, optional): Normally there is no need for user to set this property. + name(str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.Default: None. Returns: Tensor: the output data type will be float32, float64. The 1-D tensor with fixed number of evenly spaced values, \ the data shape of this tensor is :math:`[num]` . If the :attr:`num` is set 1, the output tensor just has \ - the value with input :attr:`start`. + the value with input :attr:`start`. Examples: .. code-block:: python @@ -1656,7 +1656,7 @@ def linspace(start, stop, num, dtype=None, name=None): def zeros_like(x, out=None): """ - This OP creates a zeros tensor which has identical shape and dtype + This OP creates a zeros tensor which has identical shape and dtype with `x`. Args: @@ -1723,7 +1723,7 @@ def diag(diagonal): # [[3, 0, 0] # [0, 4, 0] - # [0, 0, 5] + # [0, 0, 5] import paddle.fluid as fluid import numpy as np @@ -1756,7 +1756,7 @@ def eye(num_rows, dtype='float32', name=None): """ - This function constructs a or a batch of 2-D tensor with ones on the diagonal and zeros elsewhere. + This function constructs a or a batch of 2-D tensor with ones on the diagonal and zeros elsewhere. Args: num_rows(int): the number of rows in each batch tensor. @@ -1853,7 +1853,7 @@ def ones_like(x, out=None): """ **ones_like** - This function creates a ones tensor which has identical shape and dtype + This function creates a ones tensor which has identical shape and dtype with `x`. Args: diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index ad68366e1ed..d6c22bb8c2a 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -426,17 +426,17 @@ def check_shape(shape): def try_set_static_shape_tensor(tensor, shape): """Try to set static shape of tensor from a shape tensor. - + For example, import paddle paddle.enable_static() data = paddle.static.data(name="x", shape=[-1, 2], dtype='float32') shape = paddle.shape(data) # shape should be [-1, 2] instead of [-1, -1] - x = paddle.uniform(shape) - print(x.shape) + x = paddle.uniform(shape) + print(x.shape) # (-1, 2) - + """ if not _non_static_mode(): # static mode, and shape is not all inferred (contains -1) @@ -451,15 +451,15 @@ def try_get_constant_shape_from_tensor(shape_tensor): """Try to get shape from a tensor with constant value. For example, - + import paddle paddle.enable_static() data = paddle.static.data(name="x", shape=[-1, 2], dtype='float32') shape = paddle.shape(data) # shape should be [-1, 2] instead of [-1, -1] - x = paddle.uniform(shape) - print(x.shape) + x = paddle.uniform(shape) + print(x.shape) # (-1, 2) - + """ if not _non_static_mode(): try: diff --git a/python/paddle/fluid/lazy_init.py b/python/paddle/fluid/lazy_init.py index 8d98b1287e3..c44ed7f1189 100644 --- a/python/paddle/fluid/lazy_init.py +++ b/python/paddle/fluid/lazy_init.py @@ -97,12 +97,12 @@ class LazyGuard(object): Construct instance from class_obj by Lazy Initializing parameters. Examples: - + .. 
code-block:: python from paddle import LazyGuard from paddle.nn import Linear - + with LazyGuard(): fc = LazyInit(Linear)(10, 10) diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 4bc6be7e093..b0d54b33bfc 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -57,15 +57,15 @@ def _is_number_or_matrix_(var): class MetricBase(object): """ - In many cases, we usually have to split the test data into mini-batches for evaluating - deep neural networks, therefore we need to collect the evaluation results of each - mini-batch and aggregate them into the final result. The paddle.fluid.metrics is - designed for a convenient way of deep neural network evaluation. + In many cases, we usually have to split the test data into mini-batches for evaluating + deep neural networks, therefore we need to collect the evaluation results of each + mini-batch and aggregate them into the final result. The paddle.fluid.metrics is + designed for a convenient way of deep neural network evaluation. - The paddle.fluid.metrics contains serval different evaluation metrics + The paddle.fluid.metrics contains serval different evaluation metrics like precision and recall, and most of them have the following functions: - 1. take the prediction result and the corresponding labels of a mini-batch as input, + 1. take the prediction result and the corresponding labels of a mini-batch as input, then compute the evaluation result for the input mini-batch. 2. aggregate the existing evaluation results as the overall performance. @@ -106,8 +106,8 @@ class MetricBase(object): def reset(self): """ - reset function empties the evaluation memory for previous mini-batches. - + reset function empties the evaluation memory for previous mini-batches. + Args: None @@ -159,9 +159,9 @@ class MetricBase(object): def update(self, preds, labels): """ Given the prediction results (preds) and the labels (labels) - of some mini-batch, compute the evaluation result of that mini-batch, + of some mini-batch, compute the evaluation result of that mini-batch, and memorize the evaluation result. Please notice that the update function only - memorizes the evaluation result but would not return the score. If you want to + memorizes the evaluation result but would not return the score. If you want to get the evaluation result, please call eval() function. Args: @@ -198,11 +198,11 @@ class MetricBase(object): class CompositeMetric(MetricBase): """ - This op creates a container that contains the union of all the added metrics. + This op creates a container that contains the union of all the added metrics. After the metrics added in, calling eval() method will compute all the contained metrics automatically. CAUTION: only metrics with the SAME argument list can be added in a CompositeMetric instance. - Inherit from: `MetricBase `_ + Inherit from: `MetricBase `_ Args: name (str, optional): Metric name. For details, please refer to :ref:`api_guide_Name`. Default is None. @@ -234,8 +234,8 @@ class CompositeMetric(MetricBase): def add_metric(self, metric): """ - Add a new metric to container. Noted that the argument list - of the added one should be consistent with existed ones. + Add a new metric to container. Noted that the argument list + of the added one should be consistent with existed ones. 
Args: metric(MetricBase): a instance of MetricBase @@ -250,7 +250,7 @@ class CompositeMetric(MetricBase): Args: preds(numpy.array): predicted results of current mini-batch, the shape and dtype of which should meet the requirements of the corresponded metric. - labels(numpy.array): ground truth of current mini-batch, the shape and dtype of which should meet the requirements of the corresponded metric. + labels(numpy.array): ground truth of current mini-batch, the shape and dtype of which should meet the requirements of the corresponded metric. """ for m in self._metrics: m.update(preds, labels) @@ -260,7 +260,7 @@ class CompositeMetric(MetricBase): Calculate the results of all metrics sequentially. Returns: - list: results of all added metrics. + list: results of all added metrics. The shape and dtype of each result depend on the definition of its metric. """ ans = [] @@ -315,11 +315,11 @@ class Precision(MetricBase): Update the precision based on the current mini-batch prediction results . Args: - preds(numpy.ndarray): prediction results of current mini-batch, - the output of two-class sigmoid function. + preds(numpy.ndarray): prediction results of current mini-batch, + the output of two-class sigmoid function. Shape: [batch_size, 1]. Dtype: 'float64' or 'float32'. - labels(numpy.ndarray): ground truth (labels) of current mini-batch, - the shape should keep the same as preds. + labels(numpy.ndarray): ground truth (labels) of current mini-batch, + the shape should keep the same as preds. Shape: [batch_size, 1], Dtype: 'int32' or 'int64'. """ if not _is_numpy_(preds): @@ -398,11 +398,11 @@ class Recall(MetricBase): Update the recall based on the current mini-batch prediction results. Args: - preds(numpy.array): prediction results of current mini-batch, - the output of two-class sigmoid function. + preds(numpy.array): prediction results of current mini-batch, + the output of two-class sigmoid function. Shape: [batch_size, 1]. Dtype: 'float64' or 'float32'. - labels(numpy.array): ground truth (labels) of current mini-batch, - the shape should keep the same as preds. + labels(numpy.array): ground truth (labels) of current mini-batch, + the shape should keep the same as preds. Shape: [batch_size, 1], Dtype: 'int32' or 'int64'. """ if not _is_numpy_(preds): @@ -435,7 +435,7 @@ class Recall(MetricBase): class Accuracy(MetricBase): """ This interface is used to calculate the mean accuracy over multiple batches. - Accuracy object has two state: value and weight. The definition of Accuracy is available at + Accuracy object has two state: value and weight. The definition of Accuracy is available at https://en.wikipedia.org/wiki/Accuracy_and_precision Args: @@ -500,7 +500,7 @@ class Accuracy(MetricBase): """ This function returns the mean accuracy (float or numpy.array) for all accumulated minibatches. - Returns: + Returns: float or numpy.array: mean accuracy for all accumulated minibatches. """ @@ -515,9 +515,9 @@ class ChunkEvaluator(MetricBase): Accumulate counter numbers output by chunk_eval from mini-batches and compute the precision recall and F1-score using the accumulated counter numbers. - ChunkEvaluator has three states: num_infer_chunks, num_label_chunks and num_correct_chunks, + ChunkEvaluator has three states: num_infer_chunks, num_label_chunks and num_correct_chunks, which correspond to the number of chunks, the number of labeled chunks, and the number of correctly identified chunks. 
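Before the chunking details continue, a minimal sketch of combining the streaming metrics above through ``CompositeMetric`` (illustrative only, not part of this patch; it assumes the ``preds``/``labels`` shapes and dtypes documented for ``Precision`` and ``Recall``):

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    # Precision and Recall share the same update() argument list,
    # so they can live in one CompositeMetric container.
    comp = fluid.metrics.CompositeMetric()
    comp.add_metric(fluid.metrics.Precision())
    comp.add_metric(fluid.metrics.Recall())

    # one mini-batch of two-class sigmoid outputs and integer labels
    preds = np.array([[0.9], [0.2], [0.8], [0.1]], dtype='float32')
    labels = np.array([[1], [0], [0], [1]], dtype='int64')

    comp.update(preds=preds, labels=labels)
    print(comp.eval())   # [precision, recall] over all accumulated mini-batches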
- For some basics of chunking, please refer to + For some basics of chunking, please refer to `Chunking with Support Vector Machines `_ . ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection, and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. @@ -534,7 +534,7 @@ class ChunkEvaluator(MetricBase): # suppose the model predict 10 chucks, while 8 ones are correct and the ground truth has 9 chucks. num_infer_chunks = 10 - num_label_chunks = 9 + num_label_chunks = 9 num_correct_chunks = 8 metric.update(num_infer_chunks, num_label_chunks, num_correct_chunks) @@ -564,8 +564,8 @@ class ChunkEvaluator(MetricBase): r""" This function takes (num_infer_chunks, num_label_chunks, num_correct_chunks) as input, to accumulate and update the corresponding status of the ChunkEvaluator object. The update method is as follows: - - .. math:: + + .. math:: \\\\ \\begin{array}{l}{\\text { self. num_infer_chunks }+=\\text { num_infer_chunks }} \\\\ {\\text { self. num_Label_chunks }+=\\text { num_label_chunks }} \\\\ {\\text { self. num_correct_chunks }+=\\text { num_correct_chunks }}\\end{array} \\\\ Args: @@ -594,7 +594,7 @@ class ChunkEvaluator(MetricBase): """ This function returns the mean precision, recall and f1 score for all accumulated minibatches. - Returns: + Returns: float: mean precision, recall and f1 score. """ @@ -611,9 +611,9 @@ class ChunkEvaluator(MetricBase): class EditDistance(MetricBase): """ This API is for the management of edit distances. - Editing distance is a method to quantify the degree of dissimilarity - between two strings, such as words, by calculating the minimum editing - operand (add, delete or replace) required to convert one string into another. + Editing distance is a method to quantify the degree of dissimilarity + between two strings, such as words, by calculating the minimum editing + operand (add, delete or replace) required to convert one string into another. Refer to https://en.wikipedia.org/wiki/Edit_distance. Args: diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index abafb48d866..f1cbea25480 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -201,7 +201,7 @@ def img_conv_group(input, import paddle.fluid as fluid import paddle paddle.enable_static() - + img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32') conv_pool = fluid.nets.img_conv_group(input=img, conv_padding=1, @@ -265,22 +265,22 @@ def sequence_conv_pool(input, """ :api_attr: Static Graph - **This api takes input as an LoDTensor. If input is a Tensor, please use** + **This api takes input as an LoDTensor. If input is a Tensor, please use** :ref:`api_fluid_nets_simple_img_conv_pool` **instead** - The sequence_conv_pool is composed of :ref:`api_fluid_layers_sequence_conv` + The sequence_conv_pool is composed of :ref:`api_fluid_layers_sequence_conv` and :ref:`api_fluid_layers_sequence_pool` . Args: - input (Variable): 2-D LoDTensor, the input of sequence_conv, - which supports variable-time length input sequence. + input (Variable): 2-D LoDTensor, the input of sequence_conv, + which supports variable-time length input sequence. The underlying of input is a matrix with shape (T, N), where T is the total time steps in this mini-batch and N is the input_hidden_size. The data type is float32 or float64. num_filters(int): The number of filter. filter_size (int): The filter size. param_attr (ParamAttr): The parameters of the sequence_conv Layer. Default: None. 
- act (str|None): Activation type for Sequence_conv Layer. + act (str|None): Activation type for Sequence_conv Layer. If set to None, no activation will be applied. Default: "sigmoid". pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling. @@ -292,7 +292,7 @@ def sequence_conv_pool(input, is not set, the bias is initialized zero. Default: None. Returns: - The final result after sequence_conv and sequence_pool. + The final result after sequence_conv and sequence_pool. It is a 2-D Tensor, with the same data type as :attr:`input` Return Type: @@ -333,8 +333,8 @@ def glu(input, dim=-1): r""" :api_attr: Static Graph - The Gated Linear Units(GLU) composed by :ref:`api_fluid_layers_split` , - :ref:`api_fluid_layers_sigmoid` and :ref:`api_fluid_layers_elementwise_mul` . + The Gated Linear Units(GLU) composed by :ref:`api_fluid_layers_split` , + :ref:`api_fluid_layers_sigmoid` and :ref:`api_fluid_layers_elementwise_mul` . Specifically, GLU will plit the input into two equal-sized parts, :math:`a` and :math:`b`, along the given dimension and then compute as following: @@ -347,8 +347,8 @@ def glu(input, dim=-1): `_. Args: - input (Variable): The input variable which is a Tensor or LoDTensor. - The supported data types include float32, float64 + input (Variable): The input variable which is a Tensor or LoDTensor. + The supported data types include float32, float64 and float16 (only for GPU). dim (int, optional): The dimension along which to split. If :math:`dim < 0`, the dimension to split along is :math:`rank(input) + dim`. Default -1. @@ -362,7 +362,7 @@ def glu(input, dim=-1): import paddle.fluid as fluid import paddle paddle.enable_static() - + data = fluid.data( name="words", shape=[-1, 6, 3, 9], dtype="float32") # shape of output: [-1, 3, 3, 9] @@ -444,7 +444,7 @@ def scaled_dot_product_attention(queries, import paddle.fluid as fluid import paddle paddle.enable_static() - + queries = fluid.data(name="queries", shape=[3, 5, 9], dtype="float32") keys = fluid.data(name="keys", shape=[3, 6, 9], dtype="float32") values = fluid.data(name="values", shape=[3, 6, 10], dtype="float32") diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index c53872c0e54..f1e4e621774 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -76,8 +76,8 @@ class Optimizer(object): name=None): """ Args: - flatten_param_grads (bool, optional): Whether to flatten all the parameters and grads. - If true, the parameters and gradients will be coalesce to contiguous mempry, + flatten_param_grads (bool, optional): Whether to flatten all the parameters and grads. + If true, the parameters and gradients will be coalesce to contiguous mempry, and the grad_clip ops / optimizer ops will be fuse to one operator. """ # Because of the loop import, so place it in the function body @@ -158,7 +158,7 @@ class Optimizer(object): Args: None Return: state_dict(dict) : dict contains all the variable used by optimizer - + Examples: .. code-block:: python @@ -204,11 +204,11 @@ class Optimizer(object): ''' Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. - Args: + Args: state_dict(dict) : Dict contains all the Variable needed by optimizer Return: None - + Examples: .. 
code-block:: python @@ -222,7 +222,7 @@ class Optimizer(object): state_dict = emb.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") - scheduler = paddle.optimizer.lr.NoamDecay( + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, @@ -385,7 +385,7 @@ class Optimizer(object): def set_lr(self, value): """ :api_attr: imperative - + Set the value of the learning rate manually in the optimizer. If the optimizer use LearningRateDecay, this API cannot be invoked, because it will lead to conflict. @@ -394,12 +394,12 @@ class Optimizer(object): Returns: None - + Examples: .. code-block:: python import paddle.fluid as fluid - + with fluid.dygraph.guard(): linear = fluid.dygraph.nn.Linear(10, 10) @@ -473,7 +473,7 @@ class Optimizer(object): def current_step_lr(self): """ :api_attr: imperative - + Get current step learning rate. The return value is all the same When LearningRateDecay is not used, otherwise return the step learning rate. @@ -500,7 +500,7 @@ class Optimizer(object): inp = fluid.dygraph.to_variable(inp) out = linear(inp) loss = fluid.layers.reduce_mean(out) - + bd = [2, 4, 6, 8] value = [0.2, 0.4, 0.6, 0.8, 1.0] adam = fluid.optimizer.Adam(fluid.dygraph.PiecewiseDecay(bd, value, 0), @@ -930,7 +930,7 @@ class Optimizer(object): def _create_regularization_of_grad(self, param, grad, regularization=None): """ Create and add backward regularization Operators - + Function helper of append_regularization_ops. """ # If no gradient or no regularization is specified, then we don't need to do anything @@ -974,22 +974,22 @@ class Optimizer(object): parameters_and_grads, regularization=None): r"""Create and add backward regularization Operators - + Creates and adds backward regularization operators in the BlockDesc. This will add gradients of the regularizer function to the gradients of the parameters and return these modified gradients. This is the same as implementing weight decay in optimizers for regularization. - + Args: parameters_and_grads: A list of (parameters, gradients) pairs that need to be regularized. regularization: A global regularizer. If the parameter is not set. It will be applied with regularizer. - + Returns: list[(Variable, Variable)]: list of (parameters, gradients) \ pair with the regularized gradient - + Raises: Exception: Unknown regularization type """ @@ -1177,10 +1177,10 @@ class Optimizer(object): Clear the gradients of all optimized parameters for model. If not, new gradient will accumulat on previous gradient. - + Returns: None - + Examples: .. code-block:: python @@ -1192,7 +1192,7 @@ class Optimizer(object): a = fluid.dygraph.to_variable(value) linear = fluid.Linear(13, 5, dtype="float32") # This can be any optimizer supported by dygraph. - adam = fluid.optimizer.Adam(learning_rate = 0.01, + adam = fluid.optimizer.Adam(learning_rate = 0.01, parameter_list = linear.parameters()) out = linear(a) out.backward() @@ -1228,8 +1228,8 @@ class Optimizer(object): tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) variable pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to - indicate program pruning. If so, the program will be pruned by ``feed`` and + The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. 
If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. Examples: @@ -1271,9 +1271,9 @@ class SGDOptimizer(Optimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): This parameter is used by developers to print debugging information. \ For details, please refer to :ref:`api_guide_Name`. Default is None. @@ -1443,9 +1443,9 @@ class MomentumOptimizer(Optimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): This parameter is used by developers to print debugging information. \ For details, please refer to :ref:`api_guide_Name`. Default is None. @@ -1589,8 +1589,8 @@ class DGCMomentumOptimizer(Optimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipByNorm, optional): Gradient cliping strategy. ``DGCMomentumOptimizer`` only support - :ref:`api_fluid_clip_GradientClipByNorm` , and if not, it will raise TypeError. Default None, + grad_clip (GradientClipByNorm, optional): Gradient cliping strategy. ``DGCMomentumOptimizer`` only support + :ref:`api_fluid_clip_GradientClipByNorm` , and if not, it will raise TypeError. Default None, meaning there is no gradient clipping. name (str, optional): This parameter is used by developers to print debugging information. \ For details, please refer to :ref:`api_guide_Name`. Default is None. @@ -2000,9 +2000,9 @@ class LarsMomentumOptimizer(Optimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. 
\ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): This parameter is used by developers to print debugging information. \ For details, please refer to :ref:`api_guide_Name`. Default is None. @@ -2011,7 +2011,7 @@ class LarsMomentumOptimizer(Optimizer): multi_precision (bool, optional): Whether to use multi-precision during weight updating. rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \ before updating. Often choose to be `1.0/batch_size`. - + Examples: .. code-block:: python @@ -2221,9 +2221,9 @@ class AdagradOptimizer(Optimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -2322,7 +2322,7 @@ class AdamOptimizer(Optimizer): of section 2 of `Adam paper `_ , it can dynamically adjusts the learning rate of each parameter using the 1st moment estimates and the 2nd moment estimates of the gradient. - + The parameter ``param_out`` update rule with gradient ``grad``: .. math:: @@ -2360,9 +2360,9 @@ class AdamOptimizer(Optimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. 
name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -2374,11 +2374,11 @@ class AdamOptimizer(Optimizer): gradient in current mini-batch, so it will be much more faster. But this mode has different semantics with the original Adam algorithm and may lead to different result. The default value is False. - use_global_beta_pow (bool, optional): Whether to use global beta_pow. If true, Adam will use global beta_pow + use_global_beta_pow (bool, optional): Whether to use global beta_pow. If true, Adam will use global beta_pow for whole model instead of creating beta_pow for each parameter. Default is false. flatten_param_grads (bool, optional): Whether to flatten all parameters and gradients. Default is false. align_size (int, optional): The alignment size when flatten parameters and gradients. Default is -1, which means - use same align_size as allocator. + use same align_size as allocator. Examples: .. code-block:: python @@ -2686,7 +2686,7 @@ class AdamOptimizer(Optimizer): class AdamaxOptimizer(Optimizer): r""" - The Adamax optimizer is implemented based on the Adamax Optimization + The Adamax optimizer is implemented based on the Adamax Optimization in Section 7 of `Adam paper `_. The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm, which makes the learning rate update algorithm more stable and simple. @@ -2727,9 +2727,9 @@ class AdamaxOptimizer(Optimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -3013,9 +3013,9 @@ class DecayedAdagradOptimizer(Optimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. 
name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -3130,9 +3130,9 @@ class AdadeltaOptimizer(Optimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to @@ -3305,9 +3305,9 @@ class RMSPropOptimizer(Optimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): This parameter is used by developers to print debugging information. \ For details, please refer to :ref:`api_guide_Name`. Default is None. @@ -3495,9 +3495,9 @@ class FtrlOptimizer(Optimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): This parameter is used by developers to print debugging information. \ For details, please refer to :ref:`api_guide_Name`. Default is None. 
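The ``grad_clip`` argument re-wrapped in the hunks above is shared by these optimizers; a minimal sketch of passing one of the three documented clipping strategies (illustrative, not part of this patch):

.. code-block:: python

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    # clip gradients by their global L2 norm before the parameter update
    clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
    sgd = fluid.optimizer.SGD(learning_rate=0.01, grad_clip=clip)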
@@ -3619,16 +3619,16 @@ class LambOptimizer(AdamOptimizer): r""" LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer. - LAMB Optimizer is designed to scale up the batch size of training without losing - accuracy, which supports adaptive element-wise updating and accurate layer-wise - correction. For more information, please refer to `Large Batch Optimization for + LAMB Optimizer is designed to scale up the batch size of training without losing + accuracy, which supports adaptive element-wise updating and accurate layer-wise + correction. For more information, please refer to `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes `_ . The updating of parameters follows: .. math:: - m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t + m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t v_t &= \\beta_2 v_{t - 1} + (1 - \\beta_2)g_t^2 @@ -3641,7 +3641,7 @@ class LambOptimizer(AdamOptimizer): w_t &= w_{t-1} -\\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1}) - where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the + where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the learning rate, :math:`\\lambda` the LAMB weight decay rate. Args: @@ -3661,21 +3661,21 @@ class LambOptimizer(AdamOptimizer): regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` , :ref:`api_paddle_fluid_clip_ClipGradByNorm` , :ref:`api_paddle_fluid_clip_ClipGradByValue` ). If you want better convergence, it is recommended to use :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` . Default None, meaning there is no gradient clipping. - exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight - decay when **exclude_from_weight_decay_fn(parameter)** returns true. + exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight + decay when **exclude_from_weight_decay_fn(parameter)** returns true. Default None. - name(str|None): For detailed information, please refer to + name(str|None): For detailed information, please refer to :ref:`api_guide_Name` . Usually name is no need to set and None by default. Examples: .. code-block:: python - - import paddle.fluid as fluid + + import paddle.fluid as fluid data = fluid.data(name='x', shape=[-1, 5], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) @@ -4068,7 +4068,7 @@ class ModelAverage(Optimizer): def restore(self, executor): """ Restore ``Parameter`` values of current model. - + Args: executor(fluid.Executor): The current network executor. 
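A short sketch of the ``exclude_from_weight_decay_fn`` hook documented in the LAMB hunk above (illustrative only; the bias-name rule below is a hypothetical filter):

.. code-block:: python

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    def skip_weight_decay(param):
        # hypothetical rule: keep LAMB weight decay off bias parameters
        return param.name.endswith('.b_0')

    lamb = fluid.optimizer.LambOptimizer(
        learning_rate=0.002,
        exclude_from_weight_decay_fn=skip_weight_decay)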
@@ -4132,28 +4132,28 @@ class ExponentialMovingAverage(object): \\text{EMA}_t & = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t - The average results calculated by **update()** method will be saved in - temporary variables which are created and maintained by the object, and can - be applied to parameters of current model by calling **apply()** method. And + The average results calculated by **update()** method will be saved in + temporary variables which are created and maintained by the object, and can + be applied to parameters of current model by calling **apply()** method. And the **restore()** method is used to restore the parameters. - **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be - zero biased, which can be corrected by divided by a factor - :math:`(1 - \\text{decay}^t)` , i.e., the actual EMAs applied to parameters - when calling **apply()** method would be + **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be + zero biased, which can be corrected by divided by a factor + :math:`(1 - \\text{decay}^t)` , i.e., the actual EMAs applied to parameters + when calling **apply()** method would be .. math:: - + \\widehat{\\text{EMA}}_t = \\frac{\\text{EMA}_t}{1 - \\text{decay}^t} - **Decay rate scheduling**. A large decay rate very close to 1 would result - in that the averages move very slowly. And a better strategy is to set a + **Decay rate scheduling**. A large decay rate very close to 1 would result + in that the averages move very slowly. And a better strategy is to set a relative smaller decay rate in the very beginning. The argument **thres_steps** - allows users to pass a Variable to schedule the decay rate, in this case, + allows users to pass a Variable to schedule the decay rate, in this case, the actual decay rate becomes - + .. math:: - + \\min(\\text{decay}, \\frac{1 + \\text{thres_steps}}{10 + \\text{thres_steps}}) Usually **thres_steps** can be the global training steps. @@ -4195,21 +4195,21 @@ class ExponentialMovingAverage(object): for batch_id in range(6): data = numpy.random.random(size=(10, 5)).astype('float32') exe.run(program=static.default_main_program(), - feed={'x': data}, + feed={'x': data}, fetch_list=[cost.name]) # usage 1 with ema.apply(exe): data = numpy.random.random(size=(10, 5)).astype('float32') exe.run(program=test_program, - feed={'x': data}, + feed={'x': data}, fetch_list=[hidden.name]) # usage 2 with ema.apply(exe, need_restore=False): data = numpy.random.random(size=(10, 5)).astype('float32') exe.run(program=test_program, - feed={'x': data}, + feed={'x': data}, fetch_list=[hidden.name]) ema.restore(exe) @@ -4308,8 +4308,8 @@ class ExponentialMovingAverage(object): return param_ema def update(self): - """ - Update Exponential Moving Average. Should only call this method in + """ + Update Exponential Moving Average. Should only call this method in train program. """ global_step = layers.autoincreased_step_counter( @@ -4342,10 +4342,10 @@ class ExponentialMovingAverage(object): def apply(self, executor, need_restore=True): """ Apply moving average to parameters for evaluation. - + Args: executor (Executor): The Executor to execute applying. - need_restore (bool, optional): Whether to restore parameters after + need_restore (bool, optional): Whether to restore parameters after applying. Default True. """ executor.run(self.apply_program) @@ -4357,7 +4357,7 @@ class ExponentialMovingAverage(object): def restore(self, executor): """Restore parameters. 
- + Args: executor (Executor): The Executor to execute restoring. """ @@ -4377,7 +4377,7 @@ class PipelineOptimizer(object): optimizer (Optimizer): The optimizer to use, such as SGD. num_microbatches (int): Number of microbatches. [Optional. Default:1]. start_cpu_core_id (int): The first cpu core id to use. [Optional. Default:0]. - + Examples: .. code-block:: python @@ -4890,7 +4890,7 @@ class PipelineOptimizer(object): def _add_op_device_attr(self, block): """ - Add op_device attrribute for ops in block that have + Add op_device attrribute for ops in block that have not that attribute set. """ for idx, op in enumerate(list(block.ops)): @@ -4909,7 +4909,7 @@ class PipelineOptimizer(object): def _check_validation(self, block): """ - Check whether ops in a block have both the op_device and the + Check whether ops in a block have both the op_device and the op_role attributes set. Then, return all devices in order. """ @@ -6154,18 +6154,18 @@ class RecomputeOptimizer(Optimizer): Recompute Optimizer Wrapper Normally, a training step contains three sub-steps: first, run forward - Operators to calculate the loss; second, run backward Operators to + Operators to calculate the loss; second, run backward Operators to calculate gradient of the parameters; third, apply optimization method to update the value of the parameters. - In the forward computation process, all variables that are needed by + In the forward computation process, all variables that are needed by backward computation process will be kept in memory, which occupy a great amount of memory when the network becomes very deep. - Recompute split the network to k segments. In each segment, It will + Recompute split the network to k segments. In each segment, It will recompute the forward Operators, before running backward operators. It is very helpful for saving memory. - + The Variables that separate a network to segments are called as checkpoints, and users should set it manually. The usage is very simple: @@ -6222,7 +6222,7 @@ class RecomputeOptimizer(Optimizer): def _set_checkpoints(self, checkpoints): """ Args: - checkpoints (list): List of Variable or string + checkpoints (list): List of Variable or string """ assert isinstance( checkpoints, list @@ -6254,19 +6254,19 @@ class RecomputeOptimizer(Optimizer): import paddle.fluid as fluid import paddle.compat as cpt - + def mlp(input_x, input_y, hid_dim=128, label_dim=2): fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=input_y) sum_cost = fluid.layers.reduce_mean(cost) return sum_cost, fc_1, prediction - + input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') cost, fc_1, pred = mlp(input_x, input_y) print("Finished FF") - + sgd = fluid.optimizer.Adam(learning_rate=0.01) sgd = fluid.optimizer.RecomputeOptimizer(sgd) sgd._set_checkpoints([fc_1, pred]) @@ -6351,9 +6351,9 @@ class RecomputeOptimizer(Optimizer): add fill_constant_ops to the end of the prog we should fill the pinned vars before runing the main_prog - to instantiate their tensor hold_, which could tell us whether - the host memory could hold all the checkpoints from all the - GPU devices in this node. + to instantiate their tensor hold_, which could tell us whether + the host memory could hold all the checkpoints from all the + GPU devices in this node. 
""" op_role = 0 block = startup_program.global_block() @@ -6640,7 +6640,7 @@ class RecomputeOptimizer(Optimizer): def _offload(self, loss, startup_program=None): """ core steps for recompute offload - 1. create pinned vars and temp vars + 1. create pinned vars and temp vars 2. parse & update Forward pass: offload, sync 3. parse & update Backward pass: rename, fetch, sync 4. verify the correctness @@ -6702,20 +6702,20 @@ class RecomputeOptimizer(Optimizer): .. code-block:: python import paddle.fluid as fluid - + def mlp(input_x, input_y, hid_dim=128, label_dim=2): fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=input_y) sum_cost = fluid.layers.reduce_mean(cost) return sum_cost, fc_1, prediction - - + + input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') cost, fc_1, pred = mlp(input_x, input_y) print("Finished FF") - + sgd = fluid.optimizer.Adam(learning_rate=0.01) sgd = fluid.optimizer.RecomputeOptimizer(sgd) sgd._set_checkpoints([fc_1, pred]) @@ -6773,19 +6773,19 @@ class RecomputeOptimizer(Optimizer): Examples: .. code-block:: python import paddle.fluid as fluid - + def mlp(input_x, input_y, hid_dim=128, label_dim=2): fc_1 = fluid.layers.fc(input=input_x, size=hid_dim) prediction = fluid.layers.fc(input=[fc_1], size=label_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=input_y) sum_cost = fluid.layers.reduce_mean(cost) - return sum_cost, fc_1, prediction - + return sum_cost, fc_1, prediction + input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') cost, fc_1, pred = mlp(input_x, input_y) print("Finished FF") - + sgd = fluid.optimizer.Adam(learning_rate=0.01) sgd = fluid.optimizer.RecomputeOptimizer(sgd) sgd._set_checkpoints([fc_1, pred]) @@ -6794,10 +6794,10 @@ class RecomputeOptimizer(Optimizer): startup_program=None, parameter_list=None, no_grad_set=None) - + optimize_ops = sgd.apply_optimize( cost, startup_program=None, params_grads=params_grads) - + print("Finished apply_optimize") """ @@ -6839,18 +6839,18 @@ class LookaheadOptimizer(object): paper : https://arxiv.org/abs/1907.08610. Lookahead keeps two sets of params: the fast_params and - the slow_params. inner_optimizer update fast_params every - training step. Lookahead updates the slow_params and fast_params + the slow_params. inner_optimizer update fast_params every + training step. Lookahead updates the slow_params and fast_params every k training steps as follows: .. math:: - + slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1}) - + fast\_param_t &= slow\_param_t Args: - inner_optimizer (Optimizer): The optimizer that update fast params step by step. + inner_optimizer (Optimizer): The optimizer that update fast params step by step. alpha (float): The learning rate of Lookahead. k (int): The slow params is updated every k steps. 
@@ -6863,7 +6863,7 @@ class LookaheadOptimizer(object): import numpy.random as random paddle.enable_static() - + x = fluid.layers.data(name='x', shape=[2], dtype='float32') label = fluid.layers.data(name="label", shape=[1], dtype="int64") y = fluid.layers.fc(input=[x], size=2, act="softmax") @@ -6882,10 +6882,10 @@ class LookaheadOptimizer(object): def train_reader(limit=5): for i in range(limit): yield random.random([2]).astype('float32'), random.random([1]).astype('int64') - + feeder = fluid.DataFeeder(feed_list=[x, label], place=place) reader = paddle.batch(paddle.reader.shuffle(train_reader, buf_size=50000),batch_size=1) - + for batch_data in reader(): exe.run(fluid.default_main_program(), feed=feeder.feed(batch_data)) diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 6580c82536a..1061252408e 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -32,9 +32,9 @@ class ParamAttr(object): """ Note: - ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0. + ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0. Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. - There are three clipping strategies: :ref:`api_paddle_nn_ClipGradByGlobalNorm` , + There are three clipping strategies: :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , :ref:`api_paddle_nn_ClipGradByValue` . Create a object to represent the attribute of parameter. The attributes are: @@ -50,10 +50,10 @@ class ParamAttr(object): learning_rate (float, optional): The parameter's learning rate. The learning rate when optimize is the global learning rates times the parameter's learning rate times the factor of learning rate scheduler. Default 1.0. - regularizer (WeightDecayRegularizer, optional): Regularization strategy. There are two method: - :ref:`api_paddle_regularizer_L1Decay` , :ref:`api_paddle_regularizer_L2Decay` . If - regularizer is also set in ``optimizer`` (such as :ref:`api_paddle_optimizer_SGD` ), - that regularizer setting in optimizer will be ignored. Default None, meaning there is + regularizer (WeightDecayRegularizer, optional): Regularization strategy. There are two method: + :ref:`api_paddle_regularizer_L1Decay` , :ref:`api_paddle_regularizer_L2Decay` . If + regularizer is also set in ``optimizer`` (such as :ref:`api_paddle_optimizer_SGD` ), + that regularizer setting in optimizer will be ignored. Default None, meaning there is no regularization. trainable (bool, optional): Whether this parameter is trainable. Default True. do_model_average (bool, optional): Whether this parameter should do model average @@ -64,7 +64,7 @@ class ParamAttr(object): ParamAttr Object. Examples: - + .. code-block:: python import paddle @@ -218,13 +218,13 @@ class WeightNormParamAttr(ParamAttr): Note: Please use 'paddle.nn.utils.weight_norm' in dygraph mode. - + Note: - ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0. + ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0. Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. - There are three clipping strategies: :ref:`api_paddle_nn_ClipGradByGlobalNorm` , + There are three clipping strategies: :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , :ref:`api_paddle_nn_ClipGradByValue` . - + Parameter of weight Norm. Weight Norm is a reparameterization of the weight vectors in a neural network that decouples the magnitude of those weight vectors from their direction. 
Weight Norm has been implemented as discussed in this @@ -258,9 +258,9 @@ class WeightNormParamAttr(ParamAttr): need_clip (bool, optional): Whether the parameter gradient need to be cliped in optimizer. Default is True. Examples: - + .. code-block:: python - + import paddle paddle.enable_static() diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index c590d69a621..0eed36fb12e 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -129,7 +129,7 @@ class DataLoaderBase(object): def next(self): ''' - Get the next item in the DataLoader object. This method + Get the next item in the DataLoader object. This method should not be called by users directly. It is used for implementing iterator protocol of Python 2.x inside PaddlePaddle framework. @@ -312,7 +312,7 @@ class DataLoader(object): do nothing to data from dataset. - Args: + Args: dataset(Dataset): the dataset to load data from, should be an instance of subclass of :code:`paddle.io.Dataset` or :code:`paddle.io.IterableDataset`. @@ -321,15 +321,15 @@ class DataLoader(object): :attr:`feed_list` must be set if :attr:`return_list` is False. Default None. places(list(Place)|tuple(Place)|list(str), optional): a list of Place, - to put data onto, :attr:`places` can be None, if + to put data onto, :attr:`places` can be None, if :attr:`places` is None, default place(CPUPlace or CUDAPlace(0)) will be used. Default None. If ``places`` is list of string, the string in the list can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. - return_list (bool, optional): whether the return value on each device is + return_list (bool, optional): whether the return value on each device is presented as a list. If :attr:`return_list=False`, the return value on each device would be a dict of str -> Tensor, where - the key of the dict is the name of each fed Tensors. If + the key of the dict is the name of each fed Tensors. If :attr:`return_list=True`, the return value on each device would be a list(Tensor). :attr:`return_list` can only be True in dynamic graph mode. Default True. @@ -352,9 +352,9 @@ class DataLoader(object): 0(same as :attr::`np.stack(..., axis=0)`). Default None num_workers(int, optional): the number of subprocess to load data, 0 for no subprocess used and loading data in main process. Default 0 - use_buffer_reader (bool, optional): whether to use bufferred reader. + use_buffer_reader (bool, optional): whether to use bufferred reader. If use_buffer_reader=True, the DataLoader would prefetch - batch data asynchronously, so it would speed up data feeding + batch data asynchronously, so it would speed up data feeding and occupies a little more CPU or GPU memory, i.e., the memory of one batch input data. Default True. prefetch_factor (int, optional): Number of batch data the DataLoader would prefetch @@ -375,7 +375,7 @@ class DataLoader(object): DataLoader: an iterable object for data iterating, each elemnet of the generated data is a Tensor. Examples: - + .. code-block:: python import numpy as np @@ -585,60 +585,60 @@ class DataLoader(object): .. note:: **The framework ensures that the data loading order of DataLoader is exactly the same as the user-defined data source.** - Create a DataLoader object for loading data from Python generator. + Create a DataLoader object for loading data from Python generator. Data would be prefetched using Python thread and be pushed into a queue asynchronously. 
The created DataLoader object provides 3 methods to set the data source - :code:`set_sample_generator` , :code:`set_sample_list_generator` and + :code:`set_sample_generator` , :code:`set_sample_list_generator` and :code:`set_batch_generator` . Please see the following example codes to know their usages. - + If iterable = True, the created DataLoader object is a Python generator object, which is iterable using for-range loop. - If iterable = False, the created DataLoader object provides + If iterable = False, the created DataLoader object provides :code:`start()` and :code:`reset()` method to control the data reading process. - Args: + Args: feed_list (list(Tensor)|tuple(Tensor)): feed Tensor list. The Tensors should be created by :code:`fluid.data()`. capacity (int): capacity of the queue maintained in DataLoader. - The unit is batch number. Set larger capacity if your reader - is fast. - use_double_buffer (bool): whether to use double_buffer_reader. - If use_double_buffer=True, the DataLoader would prefetch next - batch data asynchronously, so it would speed up data feeding + The unit is batch number. Set larger capacity if your reader + is fast. + use_double_buffer (bool): whether to use double_buffer_reader. + If use_double_buffer=True, the DataLoader would prefetch next + batch data asynchronously, so it would speed up data feeding and occupies a little more CPU or GPU memory, i.e., the memory - of one batch input data. - iterable (bool): whether the created DataLoader is iterable. - return_list (bool): whether the return value on each device is - presented as a list. It is only valid when iterable=True. - If return_list=False, the return value on each device would - be a dict of str -> LoDTensor, where the key of the dict is - the name of each fed Tensors. If return_list=True, the + of one batch input data. + iterable (bool): whether the created DataLoader is iterable. + return_list (bool): whether the return value on each device is + presented as a list. It is only valid when iterable=True. + If return_list=False, the return value on each device would + be a dict of str -> LoDTensor, where the key of the dict is + the name of each fed Tensors. If return_list=True, the return value on each device would be a list(LoDTensor). It is recommended to use return_list=False in static graph mode and - use return_list=True in dygraph mode. + use return_list=True in dygraph mode. use_multiprocess (bool): whether to use multi-process to speed up the data loading process in dygraph. Note: this parameter only can be used in the dygraph mode. In the static graph mode, whether this parameter is set or not has no effect. The Default value is False. drop_last (bool): whether to drop the last batches whose number is - less than the CPU core/GPU card number. The default value is + less than the CPU core/GPU card number. The default value is True. In training phase, users should not set drop_last=False, - because all CPU cores/GPU cards must read data from DataLoader. + because all CPU cores/GPU cards must read data from DataLoader. In inference phase, users can set drop_last=False, so that the last batches whose number is less than the CPU core/GPU card - number can be tested. + number can be tested. Returns: loader (DataLoader): the created DataLoader object. Examples 1: - + .. 
code-block:: python ''' @@ -651,7 +651,7 @@ class DataLoader(object): import paddle.nn.functional as F - BATCH_NUM = 10 + BATCH_NUM = 10 BATCH_SIZE = 16 EPOCH_NUM = 4 @@ -660,7 +660,7 @@ class DataLoader(object): ITERABLE = True # whether the created DataLoader object is iterable USE_GPU = False # whether to use GPU - DATA_FORMAT = 'batch_generator' # data format of data source user provides + DATA_FORMAT = 'batch_generator' # data format of data source user provides paddle.enable_static() @@ -679,7 +679,7 @@ class DataLoader(object): # If the data generator yields one sample each time, # use DataLoader.set_sample_generator to set the data source. - def sample_generator_creator(): + def sample_generator_creator(): def __reader__(): for _ in range(BATCH_NUM * BATCH_SIZE): image, label = get_random_images_and_labels([784], [1]) @@ -691,7 +691,7 @@ class DataLoader(object): # use DataLoader.set_sample_list_generator to set the data source. def sample_list_generator_creator(): def __reader__(): - for _ in range(BATCH_NUM): + for _ in range(BATCH_NUM): sample_list = [] for _ in range(BATCH_SIZE): image, label = get_random_images_and_labels([784], [1]) @@ -699,25 +699,25 @@ class DataLoader(object): yield sample_list - return __reader__ + return __reader__ - # If the data generator yields a batch each time, + # If the data generator yields a batch each time, # use DataLoader.set_batch_generator to set the data source. def batch_generator_creator(): def __reader__(): for _ in range(BATCH_NUM): - batch_image, batch_label = get_random_images_and_labels([BATCH_SIZE, 784], [BATCH_SIZE, 1]) + batch_image, batch_label = get_random_images_and_labels([BATCH_SIZE, 784], [BATCH_SIZE, 1]) yield batch_image, batch_label return __reader__ - # If DataLoader is iterable, use for loop to train the network + # If DataLoader is iterable, use for loop to train the network def train_iterable(exe, prog, loss, loader): for _ in range(EPOCH_NUM): for data in loader(): exe.run(prog, feed=data, fetch_list=[loss]) - # If DataLoader is not iterable, use start() and reset() method to control the process + # If DataLoader is not iterable, use start() and reset() method to control the process def train_non_iterable(exe, prog, loss, loader): for _ in range(EPOCH_NUM): loader.start() # call DataLoader.start() before each epoch starts @@ -725,7 +725,7 @@ class DataLoader(object): while True: exe.run(prog, fetch_list=[loss]) except paddle.core.EOFException: - loader.reset() # call DataLoader.reset() after catching EOFException + loader.reset() # call DataLoader.reset() after catching EOFException def set_data_source(loader, places): if DATA_FORMAT == 'sample_generator': @@ -740,7 +740,7 @@ class DataLoader(object): image = static.data(name='image', shape=[None, 784], dtype='float32') label = static.data(name='label', shape=[None, 1], dtype='int64') - # Define DataLoader + # Define DataLoader loader = paddle.io.DataLoader.from_generator(feed_list=[image, label], capacity=16, iterable=ITERABLE) # Define network @@ -748,10 +748,10 @@ class DataLoader(object): # Set data source of DataLoader # - # If DataLoader is iterable, places must be given and the number of places must be the same with device number. - # - If you are using GPU, call `paddle.static.cuda_places()` to get all GPU places. - # - If you are using CPU, call `paddle.static.cpu_places()` to get all CPU places. - # + # If DataLoader is iterable, places must be given and the number of places must be the same with device number. 
+ # - If you are using GPU, call `paddle.static.cuda_places()` to get all GPU places. + # - If you are using CPU, call `paddle.static.cpu_places()` to get all CPU places. + # # If DataLoader is not iterable, places can be None. places = static.cuda_places() if USE_GPU else static.cpu_places() set_data_source(loader, places) @@ -772,7 +772,7 @@ class DataLoader(object): .. code-block:: python ''' - Example in dynamic graph mode. + Example in dynamic graph mode. ''' import numpy as np @@ -850,21 +850,21 @@ class DataLoader(object): import numpy as np import os - # We use 2 CPU cores to run inference network + # We use 2 CPU cores to run inference network os.environ['CPU_NUM'] = '2' paddle.enable_static() # The data source has only 3 batches, which can not be # divided evenly to each CPU core - def batch_generator(): + def batch_generator(): for i in range(3): - yield np.array([i+1]).astype('float32'), + yield np.array([i+1]).astype('float32'), - x = static.data(name='x', shape=[None], dtype='float32') + x = static.data(name='x', shape=[None], dtype='float32') y = x * x - def run_inference(drop_last): + def run_inference(drop_last): loader = paddle.io.DataLoader.from_generator(feed_list=[x], capacity=8, drop_last=drop_last) loader.set_batch_generator(batch_generator, static.cpu_places()) @@ -902,21 +902,21 @@ class DataLoader(object): This API will be deprecated in the future, it is recommended to use :code:`paddle.io.DataLoader` which supports multi-processes acceleration. - Create an iterable DataLoader object for loading data from Dataset. + Create an iterable DataLoader object for loading data from Dataset. Dataset is only supported in Linux system currently. Args: dataset (InMemoryDataset|QueueDataset): the dataset object. - places (list(CUDAPlace)|list(CPUPlace)|list(str)): places where the result - data should be converted. If places is list of string, the string in the list - can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where x is the index of the GPUs. - drop_last (bool): whether to drop the last batch whose sample + places (list(CUDAPlace)|list(CPUPlace)|list(str)): places where the result + data should be converted. If places is list of string, the string in the list + can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where x is the index of the GPUs. + drop_last (bool): whether to drop the last batch whose sample number is less than batch size. If drop_last = True, they - would be dropped. If drop_last = False, they would be kept. + would be dropped. If drop_last = False, they would be kept. Returns: - loader (DataLoader): the created DataLoader object, which can be - treated as a Python generator. + loader (DataLoader): the created DataLoader object, which can be + treated as a Python generator. Examples: @@ -946,7 +946,7 @@ class DygraphGeneratorLoader(DataLoaderBase): """ The GeneratorLoader of dygraph - The multiprocess dygraph GeneratorLoader's most functions are different from + The multiprocess dygraph GeneratorLoader's most functions are different from static graph GeneratorLoader, Separate implementation to keep code readable. """ @@ -1527,31 +1527,31 @@ class GeneratorLoader(DataLoaderBase): class PyReader(DataLoaderBase): r""" - Create a reader object for data feeding in Python. + Create a reader object for data feeding in Python. Data would be prefetched using Python thread and be pushed - into a queue asynchronously. Data in the queue would be extracted + into a queue asynchronously. Data in the queue would be extracted automatically when `Executor.run(...)` is called. 
- Args: + Args: feed_list (list(Variable)|tuple(Variable)): feed variable list. The variables should be created by :code:`fluid.layers.data()`. capacity (int): capacity of the queue maintained in PyReader. - The unit is batch number. Set larger capacity if your reader - is fast. - use_double_buffer (bool): whether to use double_buffer_reader. - If use_double_buffer=True, PyReader would prefetch next - batch data asynchronously, so it would speed up data feeding + The unit is batch number. Set larger capacity if your reader + is fast. + use_double_buffer (bool): whether to use double_buffer_reader. + If use_double_buffer=True, PyReader would prefetch next + batch data asynchronously, so it would speed up data feeding and occupies a little more CPU or GPU memory, i.e., the memory - of one batch input data. - iterable (bool): whether the created PyReader is iterable. - return_list (bool): whether the return value on each device is - presented as a list. It is only valid when iterable=True. - If return_list=False, the return value on each device would - be a dict of str -> LoDTensor, where the key of the dict is - the name of each fed variables. If return_list=True, the + of one batch input data. + iterable (bool): whether the created PyReader is iterable. + return_list (bool): whether the return value on each device is + presented as a list. It is only valid when iterable=True. + If return_list=False, the return value on each device would + be a dict of str -> LoDTensor, where the key of the dict is + the name of each fed variables. If return_list=True, the return value on each device would be a list(LoDTensor). It is recommended to use return_list=False in static graph mode and - use return_list=True in dygraph mode. + use return_list=True in dygraph mode. Returns: the created reader object. @@ -1561,11 +1561,11 @@ class PyReader(DataLoaderBase): Examples: 1. If iterable = False, the created PyReader object is almost the - same as :code:`fluid.layers.py_reader()`. Operators would be - inserted into the program. User should call :code:`start()` + same as :code:`fluid.layers.py_reader()`. Operators would be + inserted into the program. User should call :code:`start()` before each epoch and catch :code:`fluid.core.EOFException` - thrown by :code:`Executor.run()` when epoch ends. Once the - exception is caught, user should call :code:`reset()` to reset + thrown by :code:`Executor.run()` when epoch ends. Once the + exception is caught, user should call :code:`reset()` to reset the reader manually. .. code-block:: python @@ -1577,10 +1577,10 @@ class PyReader(DataLoaderBase): EPOCH_NUM = 3 ITER_NUM = 5 BATCH_SIZE = 3 - + def network(image, label): # User-defined network, here is an example of softmax regression. - predict = fluid.layers.fc(input=image, size=10, act='softmax') + predict = fluid.layers.fc(input=image, size=10, act='softmax') return fluid.layers.cross_entropy(input=predict, label=label) def reader_creator_random_image_and_label(height, width): @@ -1615,12 +1615,12 @@ class PyReader(DataLoaderBase): reader.reset() break - + 2. If iterable=True, the created PyReader object is decoupled with - the program. No operator would be inserted into the program. - In this case, the created reader is a Python generator, which - is iterable. User should feed the data yielded from PyReader - object into :code:`Executor.run(feed=...)`. + the program. No operator would be inserted into the program. + In this case, the created reader is a Python generator, which + is iterable. 
User should feed the data yielded from PyReader + object into :code:`Executor.run(feed=...)`. .. code-block:: python @@ -1634,7 +1634,7 @@ class PyReader(DataLoaderBase): def network(image, label): # User-defined network, here is an example of softmax regression. - predict = fluid.layers.fc(input=image, size=10, act='softmax') + predict = fluid.layers.fc(input=image, size=10, act='softmax') return fluid.layers.cross_entropy(input=predict, label=label) def reader_creator_random_image(height, width): @@ -1642,7 +1642,7 @@ class PyReader(DataLoaderBase): for i in range(ITER_NUM): fake_image = np.random.uniform(low=0, high=255, size=[height, width]) fake_label = np.ones([1]) - yield fake_image, fake_label + yield fake_image, fake_label return reader image = fluid.data(name='image', shape=[None, 784, 784], dtype='float32') @@ -1653,17 +1653,17 @@ class PyReader(DataLoaderBase): reader.decorate_sample_list_generator( paddle.batch(user_defined_reader, batch_size=BATCH_SIZE), fluid.core.CPUPlace()) - + loss = network(image, label) executor = fluid.Executor(fluid.CPUPlace()) executor.run(fluid.default_startup_program()) - + for _ in range(EPOCH_NUM): for data in reader(): executor.run(feed=data, fetch_list=[loss]) - 3. If return_list=True, the return values would be presented as list instead of dict. + 3. If return_list=True, the return values would be presented as list instead of dict. This is usually used in dygraph mode. .. code-block:: python @@ -1719,12 +1719,12 @@ class PyReader(DataLoaderBase): def start(self): ''' - Start the data feeding thread. - Can only call when the reader object is not iterable. - + Start the data feeding thread. + Can only call when the reader object is not iterable. + Example: .. code-block:: python - + import paddle import paddle.fluid as fluid import numpy as np @@ -1756,9 +1756,9 @@ class PyReader(DataLoaderBase): def reset(self): ''' - Reset the reader object when :code:`fluid.core.EOFException` raises. + Reset the reader object when :code:`fluid.core.EOFException` raises. Can only call when the reader object is not iterable. - + Example: .. code-block:: python @@ -1786,7 +1786,7 @@ class PyReader(DataLoaderBase): executor.run(feed=None) except fluid.core.EOFException: reader.reset() - break + break ''' self._loader.reset() @@ -1798,13 +1798,13 @@ class PyReader(DataLoaderBase): places=None): ''' Set the data source of the PyReader object. - + The provided :code:`sample_generator` should be a Python generator, which yields list(numpy.ndarray)-typed data of each sample. :code:`places` must be set when the PyReader object is iterable. - If all inputs have no lods, this method is faster than + If all inputs have no lods, this method is faster than :code:`decorate_sample_list_generator(paddle.batch(sample_generator, ...))` . Args: @@ -1812,7 +1812,7 @@ class PyReader(DataLoaderBase): list(numpy.ndarray)-typed sample data. batch_size (int): batch size. Must be larger than 0. drop_last (bool): Whether to drop the last batch when sample number - is less than batch_size. + is less than batch_size. places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must be provided when PyReader is iterable. @@ -1825,10 +1825,10 @@ class PyReader(DataLoaderBase): EPOCH_NUM = 3 ITER_NUM = 15 BATCH_SIZE = 3 - + def network(image, label): # User-defined network, here is an example of softmax regression. 
- predict = fluid.layers.fc(input=image, size=10, act='softmax') + predict = fluid.layers.fc(input=image, size=10, act='softmax') return fluid.layers.cross_entropy(input=predict, label=label) def random_image_and_label_generator(height, width): @@ -1856,26 +1856,26 @@ class PyReader(DataLoaderBase): for _ in range(EPOCH_NUM): for data in reader(): executor.run(feed=data, fetch_list=[loss]) - + ''' self._loader.set_sample_generator(sample_generator, batch_size, drop_last, places) def decorate_sample_list_generator(self, reader, places=None): ''' - Set the data source of the PyReader object. + Set the data source of the PyReader object. The provided :code:`reader` should be a Python generator, - which yields list(numpy.ndarray) typed batched data. - + which yields list(numpy.ndarray) typed batched data. + :code:`places` must be set when the PyReader object is iterable. Args: - reader (generator): Python generator that yields - list(numpy.ndarray)-typed batched data. + reader (generator): Python generator that yields + list(numpy.ndarray)-typed batched data. places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must be provided when PyReader is iterable. - + Example: .. code-block:: python @@ -1889,7 +1889,7 @@ class PyReader(DataLoaderBase): def network(image, label): # User-defined network, here is an example of softmax regression. - predict = fluid.layers.fc(input=image, size=10, act='softmax') + predict = fluid.layers.fc(input=image, size=10, act='softmax') return fluid.layers.cross_entropy(input=predict, label=label) def random_image_and_label_generator(height, width): @@ -1910,7 +1910,7 @@ class PyReader(DataLoaderBase): reader.decorate_sample_list_generator( paddle.batch(user_defined_generator, batch_size=BATCH_SIZE), fluid.core.CPUPlace()) - + loss = network(image, label) executor = fluid.Executor(fluid.core.CPUPlace()) executor.run(fluid.default_startup_program()) @@ -1918,7 +1918,7 @@ class PyReader(DataLoaderBase): for _ in range(EPOCH_NUM): for data in reader(): executor.run(feed=data, fetch_list=[loss]) - + ''' self._loader.set_sample_list_generator(reader, places) @@ -1946,10 +1946,10 @@ class PyReader(DataLoaderBase): EPOCH_NUM = 3 ITER_NUM = 15 BATCH_SIZE = 3 - + def network(image, label): # User-defined network, here is an example of softmax regression. - predict = fluid.layers.fc(input=image, size=10, act='softmax') + predict = fluid.layers.fc(input=image, size=10, act='softmax') return fluid.layers.cross_entropy(input=predict, label=label) def random_image_and_label_generator(height, width): @@ -1970,7 +1970,7 @@ class PyReader(DataLoaderBase): user_defined_generator = random_image_and_label_generator(784, 784) reader.decorate_batch_generator(user_defined_generator, fluid.CPUPlace()) - + loss = network(image, label) executor = fluid.Executor(fluid.CPUPlace()) executor.run(fluid.default_startup_program()) diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 1c7b3558753..3f965320ecc 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -49,14 +49,14 @@ class WeightDecayRegularizer(object): class L2DecayRegularizer(WeightDecayRegularizer): - r""" + r""" Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting. - It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_fluid_optimizer_SGDOptimizer` ). - When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. 
When set in - ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has + It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_fluid_optimizer_SGDOptimizer` ). + When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in + ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has higher priority than ``optimizer`` . - + In the implementation, the formula of L2 Weight Decay Regularization is as follows: .. math:: @@ -94,7 +94,7 @@ class L2DecayRegularizer(WeightDecayRegularizer): l1 = fluid.regularizer.L1Decay(regularization_coeff=0.1) l2 = fluid.regularizer.L2Decay(regularization_coeff=0.1) x = fluid.layers.uniform_random([3,4]) - + # set L1 regularization in fluid.ParamAttr w_param = fluid.ParamAttr(regularizer=l1) hidden1 = fluid.layers.fc(x, 8, param_attr=w_param) # fc_0.w_0(L1), fc_0.b_0 @@ -105,9 +105,9 @@ class L2DecayRegularizer(WeightDecayRegularizer): # set L2 regularization in optimizer optimizer = fluid.optimizer.SGD(learning_rate=1e-4, regularization=l2) optimizer.minimize(avg_loss) - + # it will Print Message: - # Regularization of [fc_0.w_0, fc_1.w_0] have been set by ParamAttr or WeightNormParamAttr already. + # Regularization of [fc_0.w_0, fc_1.w_0] have been set by ParamAttr or WeightNormParamAttr already. # So, the Regularization of Optimizer will not take effect for these parameters! """ @@ -160,21 +160,21 @@ class L2DecayRegularizer(WeightDecayRegularizer): class L1DecayRegularizer(WeightDecayRegularizer): r""" Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse. - - It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_fluid_optimizer_SGDOptimizer` ). - When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in - ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has + + It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_fluid_optimizer_SGDOptimizer` ). + When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in + ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has higher priority than ``optimizer`` . - + In the implementation, the formula of L1 Weight Decay Regularization is as follows: - + .. math:: L1WeightDecay = reg\_coeff * sign(parameter) Args: regularization_coeff(float, optional): regularization coeff. Default:0.0. - + Examples: .. 
code-block:: python @@ -195,7 +195,7 @@ class L1DecayRegularizer(WeightDecayRegularizer): regularization=fluid.regularizer.L1DecayRegularizer( regularization_coeff=0.1)) optimizer.minimize(avg_loss) - + # Example2: set Regularizer both in ParamAttr and optimizer import paddle.fluid as fluid @@ -203,7 +203,7 @@ class L1DecayRegularizer(WeightDecayRegularizer): l1 = fluid.regularizer.L1Decay(regularization_coeff=0.1) l2 = fluid.regularizer.L2Decay(regularization_coeff=0.1) x = fluid.layers.uniform_random([3,4]) - + # set L1 regularization in fluid.ParamAttr w_param = fluid.ParamAttr(regularizer=l1) hidden1 = fluid.layers.fc(x, 8, param_attr=w_param) # fc_0.w_0(L1), fc_0.b_0 @@ -214,9 +214,9 @@ class L1DecayRegularizer(WeightDecayRegularizer): # set L2 regularization in optimizer optimizer = fluid.optimizer.SGD(learning_rate=1e-4, regularization=l2) optimizer.minimize(avg_loss) - + # it will Print Message: - # Regularization of [fc_0.w_0, fc_1.w_0] have been set by ParamAttr or WeightNormParamAttr already. + # Regularization of [fc_0.w_0, fc_1.w_0] have been set by ParamAttr or WeightNormParamAttr already. # So, the Regularization of Optimizer will not take effect for these parameters! """ diff --git a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py index 0a0a2e8e6e3..a3d939df124 100644 --- a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py @@ -35,7 +35,7 @@ multi_out_module = load( name='multi_out_jit', sources=['multi_out_test_op.cc'], extra_include_paths=paddle_includes, # add for Coverage CI - extra_cxx_cflags=extra_cc_args, # test for cflags + extra_cxx_cflags=extra_cc_args, # test for cflags verbose=True) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py index f893088782d..781b5e6b712 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py @@ -81,12 +81,12 @@ cluster_json = """ mapping_josn = """ [ { - "hostname": "machine1", - "addr": "127.0.0.1", - "port": "768", - "ranks": + "hostname": "machine1", + "addr": "127.0.0.1", + "port": "768", + "ranks": { - "0": [1], + "0": [1], "1": [0] } } diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py index 2fa01bdfa6a..debff51a2ca 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py @@ -22,7 +22,7 @@ from paddle.distributed.auto_parallel.cluster import Cluster from paddle.distributed.auto_parallel.cluster import get_default_cluster cluster_json = """ -{ +{ "alpha_latency": {"inter": {"ring": "NET", "tree": "NET"}, "intra": {"ring": "NVL", "tree": "PHB"}, "base": {"ring": 8.4, "tree": 0}, diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py index 9b2098d37b8..df637036529 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py @@ -115,11 +115,11 @@ class TestVJPException(unittest.TestCase): def approx_jacobian(f, xs, dtype, 
eps=1e-5, batch=False): - r"""Computes an approximate Jacobian matrix of a multi-valued function + r"""Computes an approximate Jacobian matrix of a multi-valued function using finite differences. - The function input is required to be an np array or a list of list of np - arrays. + The function input is required to be an np array or a list of list of np + arrays. """ def flatten(x): diff --git a/python/paddle/fluid/tests/unittests/autograd/utils.py b/python/paddle/fluid/tests/unittests/autograd/utils.py index 6afd0ff3922..d410dd32ddb 100644 --- a/python/paddle/fluid/tests/unittests/autograd/utils.py +++ b/python/paddle/fluid/tests/unittests/autograd/utils.py @@ -280,7 +280,7 @@ TEST_CASE_NAME = 'suffix' def place(devices, key='place'): - """A Decorator for a class which will make the class running on different + """A Decorator for a class which will make the class running on different devices . Args: @@ -308,7 +308,7 @@ def place(devices, key='place'): def parameterize(fields, values=None): - """Decorator for a unittest class which make the class running on different + """Decorator for a unittest class which make the class running on different test cases. Args: @@ -365,7 +365,7 @@ def _np_transpose_matrix_format(src, src_format, des_format): def _np_concat_matrix_sequence(src, src_format=MatrixFormat.NM): - """Convert a sequence of sequence of Jacobian/Hessian matrix into one huge + """Convert a sequence of sequence of Jacobian/Hessian matrix into one huge matrix.""" def concat_col(xs): diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py index 922c424e178..6df6de57121 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py @@ -24,7 +24,7 @@ from paddle.optimizer.lr import NoamDecay from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase """ -Note(chenweihang): To compare loss of single-card and multi-card +Note(chenweihang): To compare loss of single-card and multi-card in our dist test framework, two parameters need to be adjusted: 1. set the dropout rate to 0. 2. set the weights for Transformer.forward to constant. 
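For readers skimming these test changes, the finite-difference idea behind ``approx_jacobian`` (documented above) can be sketched in plain NumPy. The snippet below is a hypothetical, simplified illustration for a single 1-D input using central differences; it is not the helper defined in test_autograd_functional_static.py, whose actual signature also accepts nested lists of arrays and a ``batch`` flag.

.. code-block:: python

    import numpy as np

    def finite_difference_jacobian(f, x, eps=1e-5):
        # f: callable mapping a 1-D array to a 1-D array; x: evaluation point.
        # Simplified sketch, not the approx_jacobian used by the Paddle tests.
        x = np.asarray(x, dtype='float64')
        y = np.asarray(f(x), dtype='float64')
        jac = np.zeros((y.size, x.size))
        for j in range(x.size):
            step = np.zeros_like(x)
            step[j] = eps
            # central difference along the j-th input coordinate
            jac[:, j] = (np.asarray(f(x + step), dtype='float64')
                         - np.asarray(f(x - step), dtype='float64')) / (2 * eps)
        return jac

    # Example: Jacobian of f(x) = [x0 * x1, sin(x0)] at x = [1.0, 2.0]
    f = lambda x: np.array([x[0] * x[1], np.sin(x[0])])
    print(finite_difference_jacobian(f, np.array([1.0, 2.0])))
    # approximately [[2.0, 1.0], [cos(1.0), 0.0]]

The tolerances used when comparing such approximations against autograd results are whatever the individual tests choose; the sketch only conveys the numerical scheme.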
diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py index e08e069692a..b6a85e94d85 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py @@ -1303,7 +1303,7 @@ class TestLayerNormFp16(unittest.TestCase): "skip bf16 test if cuda is in use but bf16 is not supported by gpu arch.") class TestBf16(unittest.TestCase): ''' - test amp for BF16 + test amp for BF16 ''' def train(self, enable_amp=True, amp_level='O1'): diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py index 541ca91f996..39982264736 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py @@ -1290,7 +1290,7 @@ class TestLayerNormFp16(unittest.TestCase): class TestBf16(unittest.TestCase): ''' - test amp for BF16 + test amp for BF16 ''' def train(self, enable_amp=True, amp_level='O1'): diff --git a/python/paddle/fluid/tests/unittests/distribution/mock_data.py b/python/paddle/fluid/tests/unittests/distribution/mock_data.py index a5a6b5542cd..60299505f21 100644 --- a/python/paddle/fluid/tests/unittests/distribution/mock_data.py +++ b/python/paddle/fluid/tests/unittests/distribution/mock_data.py @@ -16,7 +16,7 @@ import paddle class Exponential(paddle.distribution.ExponentialFamily): - """mock exponential distribution, which support computing entropy and + """mock exponential distribution, which support computing entropy and kl use bregman divergence """ _mean_carrier_measure = 0 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py index 5d58ee3481d..2772cff8459 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py @@ -40,7 +40,7 @@ class PredictorTools(object): def _load_model_and_set_config(self): ''' - load model from file and set analysis config + load model from file and set analysis config ''' if os.path.exists(os.path.join(self.model_path, self.params_file)): config = AnalysisConfig( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index a2ec446c728..2f1343563d9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -269,7 +269,7 @@ class FC(Layer): out.data = [[[0.182996 -0.474117]]] out.shape = (1, 1, 2) Parameters: - + size(int): The number of output units in this layer. num_flatten_dims (int, optional): The fc layer can accept an input tensor with more than two dimensions. If this happens, the multi-dimension tensor will first be flattened @@ -293,7 +293,7 @@ class FC(Layer): **bias** (Parameter or None): the learnable bias of this layer. Returns: None - + Examples: .. 
code-block:: python from paddle.fluid.dygraph.base import to_variable diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py index 4d919383013..b82a7b663bd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py @@ -266,7 +266,7 @@ class FC(paddle.nn.Layer): out.data = [[[0.182996 -0.474117]]] out.shape = (1, 1, 2) Parameters: - + size(int): The number of output units in this layer. num_flatten_dims (int, optional): The fc layer can accept an input tensor with more than two dimensions. If this happens, the multi-dimension tensor will first be flattened @@ -290,7 +290,7 @@ class FC(paddle.nn.Layer): **bias** (Parameter or None): the learnable bias of this layer. Returns: None - + """ def __init__(self, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py index 227191a68fe..d34cd5104e5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py @@ -132,7 +132,7 @@ def test_push_pop_2(x, *args, **kargs): def test_push_pop_3(x, *args, **kargs): """ push_pop_vars in main_function is : `k` NOTE: One may expect `k` and `l` because l - is nonlocal. Name bind analysis is + is nonlocal. Name bind analysis is not implemented yet. """ l = [] diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_multi_forward.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_multi_forward.py index 15a4e690a2c..843f5ca4e5e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_multi_forward.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_multi_forward.py @@ -35,8 +35,8 @@ class MyLayer(paddle.nn.Layer): class TestBackward(unittest.TestCase): def test_order_0(self): - """ - loss = 1 * w * 1 + 2 * w * 2 + """ + loss = 1 * w * 1 + 2 * w * 2 delta_w = 5 """ model = MyLayer() @@ -49,8 +49,8 @@ class TestBackward(unittest.TestCase): self.assertEqual(model.linear.weight.grad, 5) def test_order_1(self): - """ - loss = 2 * w * 2 + 1 * w * 1 + """ + loss = 2 * w * 2 + 1 * w * 1 delta_w = 5 """ model = MyLayer() diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py index e261fd8165d..e7a8b8580e9 100644 --- a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -642,10 +642,10 @@ def get_eager_double_grad(func, place (fluid.CPUPlace or fluid.CUDAPlace): the device. return_mid_result (bool): A flag that controls the return content. Returns: - If 'return_mid_result' set True. + If 'return_mid_result' set True. the second order derivative and the inputs of second order derivative's calculation will be returned for higher order derivative's calculation. - If 'return_mid_result' set False. + If 'return_mid_result' set False. A list of numpy array that stores second derivative result calulated by dygraph. """ if isinstance(place, fluid.CPUPlace): @@ -709,8 +709,8 @@ def double_grad_check_for_dygraph(func, rtol=1e-3, raise_exception=True): """ - Check second order gradients of dygraph. 
This function will compare the - second order gradients of dygraph and second order gradients of static graph + Check second order gradients of dygraph. This function will compare the + second order gradients of dygraph and second order gradients of static graph to validate dygraph's correctness Args: @@ -841,7 +841,7 @@ def get_eager_triple_grad(func, x_init (numpy.array|list[numpy.array]|None): the init value for input x. dy_init (numpy.array|list[numpy.array]|None): the init value for gradient of output. place (fluid.CPUPlace or fluid.CUDAPlace): the device. - return_mid_result (list[Tensor], list[Tensor]): If set True, the + return_mid_result (list[Tensor], list[Tensor]): If set True, the Returns: A list of numpy array that stores second derivative result calulated by dygraph """ @@ -876,8 +876,8 @@ def triple_grad_check_for_dygraph(func, rtol=1e-3, raise_exception=True): """ - Check third order gradients of dygraph. This function will compare the - third order gradients of dygraph and third order gradients of static graph + Check third order gradients of dygraph. This function will compare the + third order gradients of dygraph and third order gradients of static graph to validate dygraph's correctness Args: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py index 715a370f53d..ed191d55603 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py @@ -505,7 +505,7 @@ class TrtLayerAutoScanTest(AutoScanTest): class TensorRTParam: ''' - TensorRT subgraph engine parameters. + TensorRT subgraph engine parameters. ''' def __init__(self, workspace_size, max_batch_size, min_subgraph_size, @@ -519,7 +519,7 @@ class TrtLayerAutoScanTest(AutoScanTest): class DynamicShapeParam: ''' - Prepare TensorRT subgraph engine dynamic shape parameters. + Prepare TensorRT subgraph engine dynamic shape parameters. ''' def __init__(self, min_input_shape, max_input_shape, opt_input_shape, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py index 044451695d4..60f2a93bc3a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py @@ -66,7 +66,7 @@ class InferencePassTest(unittest.TestCase): def _get_paddle_outs(self, executor, program, scope): ''' - Return PaddlePaddle outputs. + Return PaddlePaddle outputs. ''' with fluid.scope_guard(scope): outs = executor.run(program=program, @@ -77,7 +77,7 @@ class InferencePassTest(unittest.TestCase): def _get_inference_outs(self, config): ''' - Return AnalysisPredictor outputs. + Return AnalysisPredictor outputs. ''' predictor = create_paddle_predictor(config) tensor_shapes = predictor.get_input_tensor_shape() @@ -106,7 +106,7 @@ class InferencePassTest(unittest.TestCase): use_trt=False, use_mkldnn=False): ''' - Return a new object of AnalysisConfig. + Return a new object of AnalysisConfig. ''' config = AnalysisConfig(self.path) config.disable_gpu() @@ -147,9 +147,9 @@ class InferencePassTest(unittest.TestCase): def check_output(self, atol=1e-5): ''' - Check whether calculating on CPU and GPU, enable TensorRT - or disable TensorRT, enable MKLDNN or disable MKLDNN - are all the same. 
+ Check whether calculating on CPU and GPU, enable TensorRT + or disable TensorRT, enable MKLDNN or disable MKLDNN + are all the same. ''' self.assertFalse(self.feeds is None, "The inputs of the model is None. ") @@ -164,9 +164,9 @@ class InferencePassTest(unittest.TestCase): quant=False, rtol=1e-5): ''' - Check whether calculating on CPU and GPU, enable TensorRT - or disable TensorRT, enable MKLDNN or disable MKLDNN - are all the same. + Check whether calculating on CPU and GPU, enable TensorRT + or disable TensorRT, enable MKLDNN or disable MKLDNN + are all the same. ''' place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() executor = fluid.Executor(place) @@ -253,7 +253,7 @@ class InferencePassTest(unittest.TestCase): class TensorRTParam: ''' - Prepare TensorRT subgraph engine parameters. + Prepare TensorRT subgraph engine parameters. ''' def __init__(self, @@ -274,7 +274,7 @@ class InferencePassTest(unittest.TestCase): class DynamicShapeParam: ''' - Prepare TensorRT subgraph engine dynamic shape parameters. + Prepare TensorRT subgraph engine dynamic shape parameters. ''' def __init__(self, min_input_shape, max_input_shape, optim_input_shape, @@ -286,7 +286,7 @@ class InferencePassTest(unittest.TestCase): class LiteParam: ''' - Prepare Lite subgraph engine parameters. + Prepare Lite subgraph engine parameters. ''' def __init__(self, precision, passes_filter, ops_filter): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py index d51a4acca0e..b01dd68143d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py @@ -40,7 +40,7 @@ class TensorConfig: ''' shape: The shape of the tensor. dtype: The data type of the tensor. - data: The value of WeightVar. for input, it should be None + data: The value of WeightVar. for input, it should be None ''' self.lod = lod if data_gen is not None: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py b/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py index 6516206350b..4725a1dae8b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py @@ -121,7 +121,7 @@ class QuantDequantTest(unittest.TestCase): def _get_paddle_outs(self, feed, fetch_list, executor, program, scope): ''' - Return PaddlePaddle outputs. + Return PaddlePaddle outputs. ''' with fluid.scope_guard(scope): outs = executor.run(program=program, @@ -132,7 +132,7 @@ class QuantDequantTest(unittest.TestCase): def _get_inference_outs(self, config): ''' - Return AnalysisPredictor outputs. + Return AnalysisPredictor outputs. ''' predictor = create_paddle_predictor(config) tensor_shapes = predictor.get_input_tensor_shape() @@ -160,7 +160,7 @@ class QuantDequantTest(unittest.TestCase): use_trt=False, use_mkldnn=False): ''' - Return a new object of AnalysisConfig. + Return a new object of AnalysisConfig. ''' config = AnalysisConfig(self.path) config.disable_gpu() @@ -201,9 +201,9 @@ class QuantDequantTest(unittest.TestCase): quant=False, rtol=1e-5): ''' - Check whether calculating on CPU and GPU, enable TensorRT - or disable TensorRT, enable MKLDNN or disable MKLDNN - are all the same. + Check whether calculating on CPU and GPU, enable TensorRT + or disable TensorRT, enable MKLDNN or disable MKLDNN + are all the same. 
''' place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() executor = fluid.Executor(place) @@ -352,7 +352,7 @@ class QuantDequantTest(unittest.TestCase): class TensorRTParam: ''' - Prepare TensorRT subgraph engine parameters. + Prepare TensorRT subgraph engine parameters. ''' def __init__(self, workspace_size, max_batch_size, min_subgraph_size, @@ -366,7 +366,7 @@ class QuantDequantTest(unittest.TestCase): class DynamicShapeParam: ''' - Prepare TensorRT subgraph engine dynamic shape parameters. + Prepare TensorRT subgraph engine dynamic shape parameters. ''' def __init__(self, min_input_shape, max_input_shape, optim_input_shape, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py index 2a3e349a18a..1696908f4e7 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py @@ -29,9 +29,9 @@ class TestConvActMkldnnFusePass(PassAutoScanTest): """ x_var f_var(persistable) \ / - conv2d + conv2d | - conv2d_var + conv2d_var | act | diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py index 098cec71159..4b73a9f896e 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py @@ -29,7 +29,7 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest): """ x_var f_var(persistable) \ / - conv2d + conv2d | conv2d_var bias_var(persistable) \ / diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py index 56ce8f3ea3b..90848f58a54 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py @@ -27,9 +27,9 @@ import hypothesis.strategies as st class TestConvElementwiseAdd2ActPass(PassAutoScanTest): """ - x_var f_var(persistable) + x_var f_var(persistable) \ / - conv2d + conv2d | conv2d_var y_var(persistable) \ / diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py index f1d2192a4c7..72e15c04d99 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py @@ -29,7 +29,7 @@ class TestConvElementwiseAddActPass(PassAutoScanTest): """ x_var f_var(persistable) \ / - conv2d + conv2d | conv2d_var y_var(persistable) \ / diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py index 4463f954371..363ba4d765e 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py @@ -29,13 +29,13 @@ class TestConvEltwiseaddBnFusePass(PassAutoScanTest): """ x_var f_var(persistable) \ / - conv2d + conv2d | 
conv2d_var bias_var(persistable) \ / elementwise_add | - elementwise_add_var Scale(persistable) Bias(persistable) Mean(persistable) Variance(persistable) + elementwise_add_var Scale(persistable) Bias(persistable) Mean(persistable) Variance(persistable) | batch_norm | diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_bn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_bn_fuse_pass.py index 6ecfa50d653..008d19356f5 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_bn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_bn_fuse_pass.py @@ -30,7 +30,7 @@ class TestConvTransposeBnFusePass(PassAutoScanTest): ''' conv_input conv_weight_var(persistable) \ / - conv_op + conv_op | conv_out_var (bn_scale_var, bn_bias_var, bn_mean_var,bn_variance_var) | / diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py index 29099b9a7a5..cba1ce4a85f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py @@ -30,9 +30,9 @@ class TestConvTransposeEltwiseaddBnFusePass(PassAutoScanTest): ''' conv_input conv_weight_var(persistable) \ / - conv_op + conv_op | - conv_out_var elementwise_add_y + conv_out_var elementwise_add_y | / elementwise_add | diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py index c6be25f9ff0..a2d8213de72 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py @@ -29,12 +29,12 @@ class TestFCElementwiseLayerNormFusePass(PassAutoScanTest): """ x_var w(persistable) bias_var(persistable) \ | / - fc + fc | fc_out_var bias_var(persistable) \ / elementwise_add bias_var(persistable) scale_var(persistable) - \ | / + \ | / layer_norm / | \ Y mean_var variance_var diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_flatten2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_flatten2_matmul_fuse_pass.py index 181ed89c65e..17401e61f10 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_flatten2_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_flatten2_matmul_fuse_pass.py @@ -27,15 +27,15 @@ import hypothesis.strategies as st class TestFlatten2MatmulFusePass(PassAutoScanTest): """ - x_var - | - flatten2 + x_var + | + flatten2 \ flatten2_out_var y_var \ / matmul bias_var \ / - elementwise_add + elementwise_add """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_layernorm_shift_partition_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_layernorm_shift_partition_pass.py index a4d74611fee..d74e90fa5f2 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_layernorm_shift_partition_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_layernorm_shift_partition_pass.py @@ -32,8 +32,8 @@ class TestLayernormShiftPartitionPass(PassAutoScanTest): | reshape2 | - reshape2 - | + reshape2 + | transpose2 | 
reshape2 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_to_mul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_to_mul_pass.py index 2dc0556e9e2..ab1a61bcc51 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_to_mul_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_to_mul_pass.py @@ -29,7 +29,7 @@ class TestMapMatmulToMulPass(PassAutoScanTest): """ x_var y_var(persistable) \ / - matmul + matmul """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py index 2f0de50610f..c16e1408387 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py @@ -29,7 +29,7 @@ class TestMapMatmulToMulPass(PassAutoScanTest): """ x_var y_var(persistable) \ / - matmul_v2 + matmul_v2 """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py index d8dd7a0eac9..77422428ca5 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py @@ -29,7 +29,7 @@ class TestMapMatmulToMulPass(PassAutoScanTest): """ x_var y_var(persistable) \ / - matmul_v2 + matmul_v2 """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_scale_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_scale_fuse_pass.py index ef837e1892b..2dd25c687fe 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_scale_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_scale_fuse_pass.py @@ -31,7 +31,7 @@ class TestMatmulScaleFusePass(PassAutoScanTest): \ / matmul | - scale + scale """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py index 52da377599d..a02f460280c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_matmul_v2_scale_fuse_pass.py @@ -33,7 +33,7 @@ class TestMatmulV2ScaleFusePass(PassAutoScanTest): | => | scale scale_out | - scale_out + scale_out """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_depthwise_conv_pass.py index 312b77acaa4..bfe2f7d0350 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_depthwise_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_depthwise_conv_pass.py @@ -30,9 +30,9 @@ class DepthwiseConvMKLDNNPass(PassAutoScanTest): ''' conv_input conv_weight_var(persistable) \ / - conv_op + conv_op | - conv_out_var + conv_out_var ''' def test(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_reshape2_matmul_fuse_pass.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_reshape2_matmul_fuse_pass.py index 79652c53e12..18cc733476e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_reshape2_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_reshape2_matmul_fuse_pass.py @@ -27,15 +27,15 @@ import hypothesis.strategies as st class TestReshape2MatmulFusePass(PassAutoScanTest): """ - x_var - | - reshape2 + x_var + | + reshape2 \ reshape2_out_var y_var \ / matmul bias_var \ / - elementwise_add + elementwise_add """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_squeeze2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_squeeze2_matmul_fuse_pass.py index e06a242395f..466bb5b7100 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_squeeze2_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_squeeze2_matmul_fuse_pass.py @@ -27,15 +27,15 @@ import hypothesis.strategies as st class TestSqueeze2MatmulFusePass(PassAutoScanTest): """ - x_var - | - squeeze2 + x_var + | + squeeze2 \ squeeze2_out_var y_var \ / matmul bias_var \ / - elementwise_add + elementwise_add """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py index 198c4e5c742..5121cbda8d8 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py @@ -28,12 +28,12 @@ class TestTransposeFlattenConcatFusePass(PassAutoScanTest): x_1_var x_2_var | | transpose2 transpose2 - | | + | | flatten2 flatten2 \ / flatten2_out_var flatten2_out_var \ / - concat + concat """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten2_matmul_fuse_pass.py index 730babf2aab..dee1102e9d6 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten2_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten2_matmul_fuse_pass.py @@ -27,15 +27,15 @@ import hypothesis.strategies as st class TestFlatten2MatmulFusePass(PassAutoScanTest): """ - x_var - | - flatten2 + x_var + | + flatten2 \ flatten2_out_var y_var \ / matmul bias_var \ / - elementwise_add + elementwise_add """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py index 1911155ca70..91ca32ef899 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py @@ -86,9 +86,9 @@ def multiclass_nms(bboxes, step. -1 means keeping all bboxes after NMS step. normalized (bool): Whether detections are normalized. Default: True return_index(bool): Whether return selected index. Default: False - rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. + rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. The shape is [B] and data type is int32. B is the number of images. 
- If it is not None then return a list of 1-D Tensor. Each element + If it is not None then return a list of 1-D Tensor. Each element is the output RoIs' number of each image on the corresponding level and the shape is [B]. None by default. name(str): Name of the multiclass nms op. Default: None. diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape2_matmul_fuse_pass.py index d2dca92345a..5bbd31bd58f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape2_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape2_matmul_fuse_pass.py @@ -27,15 +27,15 @@ import hypothesis.strategies as st class TestReshape2MatmulFusePass(PassAutoScanTest): """ - x_var - | - reshape2 + x_var + | + reshape2 \ reshape2_out_var y_var \ / matmul bias_var \ / - elementwise_add + elementwise_add """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_squeeze2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_squeeze2_matmul_fuse_pass.py index a52dd0aed84..9fd1dd2bf42 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_squeeze2_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_squeeze2_matmul_fuse_pass.py @@ -27,15 +27,15 @@ import hypothesis.strategies as st class TestSqueeze2MatmulFusePass(PassAutoScanTest): """ - x_var - | - squeeze2 + x_var + | + squeeze2 \ squeeze2_out_var y_var \ / matmul bias_var \ / - elementwise_add + elementwise_add """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py index e69091ed855..a850e9602d0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py @@ -27,13 +27,13 @@ import hypothesis.strategies as st class TestUnsqueezeEltwiseFusePass(PassAutoScanTest): """ - y_var - | - unsqueeze2 + y_var + | + unsqueeze2 \ unsqueeze2_out_var x_var \ / - elementwise_mul + elementwise_mul """ def sample_predictor_configs(self, program_config): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py index 04d6be13001..24ec7cc39fa 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py @@ -86,7 +86,7 @@ class TestMKLDNNMulOpS8S8(OpTest): ''' - test case for s8 * u8 + test case for s8 * u8 ''' @@ -98,7 +98,7 @@ class TestMKLDNNMulOpS8U8(TestMKLDNNMulOpS8S8): ''' - test case for s8 * s8 + test case for s8 * s8 ''' @@ -154,7 +154,7 @@ class TestMKLDNNMulOpS8S8WithFlatten(TestMKLDNNMulOpS8S8): ''' - test case for s8 * u8 + test case for s8 * u8 ''' diff --git a/python/paddle/fluid/tests/unittests/mlu/test_tril_triu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_tril_triu_op_mlu.py index da6557beb68..a908655c7e7 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_tril_triu_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_tril_triu_op_mlu.py @@ -132,7 +132,7 @@ for _op_type in ['tril', 'triu']: class TestTrilTriuOpAPI(unittest.TestCase): - """ 
test case by using API and has -1 dimension + """ test case by using API and has -1 dimension """ def test_api(self): diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index bed1fe8841b..d672dccec9f 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -744,7 +744,7 @@ class OpTest(unittest.TestCase): def prepare_python_api_arguments(api, op_proto_ins, op_proto_attrs, kernel_sig): """ map from `op proto inputs and attrs` to `api input list and api attrs dict` - + NOTE: the op_proto_attrs and op_proto_ins is a default dict. default value is [] """ @@ -916,7 +916,7 @@ class OpTest(unittest.TestCase): args = prepare_python_api_arguments(self.python_api, eager_tensor_inputs, attrs_outputs, kernel_sig) - """ we directly return the cal_python_api value because the value is already tensor. + """ we directly return the cal_python_api value because the value is already tensor. """ return cal_python_api(self.python_api, args, kernel_sig) diff --git a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py index 929d37da0af..32f24cd2605 100755 --- a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py +++ b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py @@ -486,11 +486,11 @@ class DnnTrainer(object): else: pass - ''' + ''' print("entering run_the_one_ps -- old") fleet_obj = fleet.distributed_optimizer( - inner_optimizer, user_defined_strategy) - fleet_obj.minimize(loss) + inner_optimizer, user_defined_strategy) + fleet_obj.minimize(loss) if fleet.is_worker(): worker_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=False, is_sync=False) server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py index 176f4193416..e6e509942ab 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py @@ -82,7 +82,7 @@ cluster_json = """ "source_global_id": 1, "target_global_id": 2, "type": "PHB", - "bandwidth": 12 + "bandwidth": 12 }, { "source_global_id": 0, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 36923212fdf..97855c8a8f1 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -136,7 +136,7 @@ cluster_json = """ "source_global_id": 0, "target_global_id": 4, "type": "PHB", - "bandwidth": 12 + "bandwidth": 12 }, { "source_global_id": 1, @@ -160,7 +160,7 @@ cluster_json = """ "source_global_id": 1, "target_global_id": 4, "type": "PHB", - "bandwidth": 12 + "bandwidth": 12 }, { "source_global_id": 2, @@ -208,13 +208,13 @@ cluster_json = """ "source_global_id": 3, "target_global_id": 4, "type": "PHB", - "bandwidth": 12 + "bandwidth": 12 }, { "source_global_id": 4, "target_global_id": 9, "type": "NET", - "bandwidth": 1 + "bandwidth": 1 } ] }, @@ -288,7 +288,7 @@ cluster_json = """ "source_global_id": 5, "target_global_id": 9, "type": "PHB", - "bandwidth": 12 + "bandwidth": 12 }, { "source_global_id": 6, @@ -312,7 +312,7 @@ cluster_json = """ "source_global_id": 6, "target_global_id": 9, "type": "PHB", - "bandwidth": 12 + "bandwidth": 12 }, { 
"source_global_id": 7, @@ -336,7 +336,7 @@ cluster_json = """ "source_global_id": 7, "target_global_id": 9, "type": "PHB", - "bandwidth": 12 + "bandwidth": 12 }, { "source_global_id": 8, @@ -360,16 +360,16 @@ cluster_json = """ "source_global_id": 8, "target_global_id": 9, "type": "PHB", - "bandwidth": 12 + "bandwidth": 12 }, { "source_global_id": 9, "target_global_id": 4, "type": "NET", - "bandwidth": 1 + "bandwidth": 1 } ] - } + } ] } """ diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py index 86031db0f96..687def36071 100644 --- a/python/paddle/fluid/tests/unittests/test_cond.py +++ b/python/paddle/fluid/tests/unittests/test_cond.py @@ -277,7 +277,7 @@ class TestCondNestedControlFlow(unittest.TestCase): a = 2 * i if i < 5: if i >= 3: - return a + a + return a + a else: return a - a else: diff --git a/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py b/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py index 87c4f6cee5b..099f19bd03e 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py @@ -112,8 +112,8 @@ class TestCUDAEvent(unittest.TestCase): class TestStreamGuard(unittest.TestCase): ''' - Note: - The asynchronous execution property of CUDA Stream can only be tested offline. + Note: + The asynchronous execution property of CUDA Stream can only be tested offline. ''' def test_stream_guard_normal(self): diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py index 202fa349090..4e8bae051ef 100755 --- a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py +++ b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py @@ -42,7 +42,7 @@ paddle.disable_static() def get_warning_index(api): """ - Given an paddle API, return the index of the Warinng information in its doc string if exists; + Given an paddle API, return the index of the Warinng information in its doc string if exists; If Warinng information doesn't exist, return the default ERROR_WARNING_POSTION, sys.maxsize. Args: @@ -72,7 +72,7 @@ class TestDeprecatedDocorator(unittest.TestCase): def test_fluid_data(self): """ - test old fluid elementwise_mul api, it should fire Warinng function, + test old fluid elementwise_mul api, it should fire Warinng function, which insert the Warinng info on top of API's doc string. """ paddle.enable_static() @@ -91,7 +91,7 @@ class TestDeprecatedDocorator(unittest.TestCase): def test_fluid_elementwise_mul(self): """ - test old fluid elementwise_mul api, it should trigger Warinng function, + test old fluid elementwise_mul api, it should trigger Warinng function, which insert the Warinng info on top of API's doc string. """ @@ -133,7 +133,7 @@ class TestDeprecatedDocorator(unittest.TestCase): def test_ops_elementwise_mul(self): """ - Test for new C++ elementwise_op, expected result should be True, + Test for new C++ elementwise_op, expected result should be True, because not matter what fluid.layers.elementwise_mul is deprecated. 
""" diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py index ac1bf486182..b616299b946 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py @@ -139,7 +139,7 @@ class FleetDistRunnerBase(object): learning_rate=LEARNING_RATE, decay_steps=500, decay_rate=0.969, - staircase=True)) + staircase=True)) """ else: optimizer = fluid.optimizer.SGD(LEARNING_RATE, grad_clip=grad_clip) diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adagrad.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adagrad.py index 9f372fea81f..725df99f719 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adagrad.py +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adagrad.py @@ -28,7 +28,7 @@ from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram class TestSparseLoadProgramAdagrad(TestSparseLoadProgram): - """ + """ Test Sparse load operator. """ diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adam.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adam.py index 60c3f7fc9f1..47c3bdaa4b3 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adam.py +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_adam.py @@ -28,7 +28,7 @@ from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram class TestSparseLoadProgramAdam(TestSparseLoadProgram): - """ + """ Test Sparse load operator. """ diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_ftrl.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_ftrl.py index a08af52263c..32ee13c009f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_ftrl.py +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_ftrl.py @@ -28,7 +28,7 @@ from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram class TestSparseLoadProgramFtrl(TestSparseLoadProgram): - """ + """ Test Sparse load operator. """ diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_momentum.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_momentum.py index 960857df928..11fd66e7440 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_momentum.py +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_momentum.py @@ -28,7 +28,7 @@ from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram class TestSparseLoadProgramMomentum(TestSparseLoadProgram): - """ + """ Test Sparse load operator. """ diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_rmsprop.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_rmsprop.py index 5516832ef21..6d4a22d4a21 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_rmsprop.py +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_rmsprop.py @@ -28,7 +28,7 @@ from test_dist_sparse_tensor_load_sgd import TestSparseLoadProgram class TestSparseLoadProgramRmsprop(TestSparseLoadProgram): - """ + """ Test Sparse load operator. 
""" diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_sgd.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_sgd.py index 416a6290715..9c1714ca4bf 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_sgd.py +++ b/python/paddle/fluid/tests/unittests/test_dist_sparse_tensor_load_sgd.py @@ -27,7 +27,7 @@ from paddle.distributed.fleet import fleet class TestSparseLoadProgram(unittest.TestCase): - """ + """ Test Sparse load operator. """ diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index 965b46ee843..2c5617a1c59 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -616,7 +616,7 @@ class PaddingRNNTestBase(unittest.TestCase): parallel=True, use_program_cache=True): ''' - Test that train ppl of padding mode is same to that of static mode + Test that train ppl of padding mode is same to that of static mode ''' config = RNNConfig('test', 'padding') with fluid.scope_guard(fluid.Scope()): diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py index 7230cd97ebd..e710fc11bb7 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -25,8 +25,8 @@ os.environ['FLAGS_new_einsum'] = "1" def error_trans(func, *args, **kargs): - """ - transport C++ exception into Python exception. + """ + transport C++ exception into Python exception. because einsum_v2 raise different exception with einsum_v1. """ try: diff --git a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py index e3e9db14db9..65a09e662dc 100755 --- a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py +++ b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py @@ -36,8 +36,8 @@ from tokenizer.bert_tokenizer import BertTokenizer def to_string_tensor(string_values, name): """ Create the tensor that the value holds the list of string. - NOTICE: The value will be holded in the cpu place. - + NOTICE: The value will be holded in the cpu place. + Args: string_values(list[string]): The value will be setted to the tensor. name(string): The name of the tensor. @@ -51,9 +51,9 @@ def to_string_tensor(string_values, name): def to_map_tensor(string_dict, name): """ Create the tensor that the value holds the map, the type of key is the string - and the value is the int. - NOTICE: The value will be holded in the cpu place. - + and the value is the int. + NOTICE: The value will be holded in the cpu place. + Args: string_dict(dict): The value will be setted to the tensor. name(string): The name of the tensor. 
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 460f58d87b7..51fbb60a7b4 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -112,7 +112,7 @@ def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores, def box_coder(all_anchors, bbox_deltas, variances, pixel_offset=True): """ - Decode proposals by anchors and bbox_deltas from RPN + Decode proposals by anchors and bbox_deltas from RPN """ offset = 1 if pixel_offset else 0 #proposals: xmin, ymin, xmax, ymax diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op.py b/python/paddle/fluid/tests/unittests/test_group_norm_op.py index 6a3b94825e5..8484044ec77 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op.py @@ -184,7 +184,7 @@ class TestGroupNormOpBigEps3(TestGroupNormOp): @skip_check_grad_ci( reason= - '''This test case is used to ensure whether the gradient checking results between CPU and GPU + '''This test case is used to ensure whether the gradient checking results between CPU and GPU are consistent when using the same inputs, thus, it doesn't need to call check_grad.''' ) class TestGroupNormOpLargeData(TestGroupNormOp): @@ -234,7 +234,7 @@ class TestGroupNormOpBigEps3_With_NHWC(TestGroupNormOp): @skip_check_grad_ci( reason= - '''This test case is used to ensure whether the gradient checking results between CPU and GPU + '''This test case is used to ensure whether the gradient checking results between CPU and GPU are consistent when using the same inputs, thus, it doesn't need to call check_grad.''' ) class TestGroupNormOpLargeData_With_NHWC(TestGroupNormOp): diff --git a/python/paddle/fluid/tests/unittests/test_launch_coverage.py b/python/paddle/fluid/tests/unittests/test_launch_coverage.py index e4c35a63471..62826c0858b 100644 --- a/python/paddle/fluid/tests/unittests/test_launch_coverage.py +++ b/python/paddle/fluid/tests/unittests/test_launch_coverage.py @@ -29,17 +29,17 @@ from paddle.distributed.fleet.launch_utils import find_free_ports def _parse_args(): parser = ArgumentParser( - description='''start paddle training using multi-process mode. -NOTE: your train program ***must*** run as distributed nccl2 mode, -see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2- -And your train program must read environment variables below in order to let different -process init properly: -FLAGS_selected_gpus -PADDLE_TRAINER_ID -PADDLE_CURRENT_ENDPOINT -PADDLE_TRAINERS_NUM -PADDLE_TRAINER_ENDPOINTS -POD_IP (current node ip address, not needed for local training) + description='''start paddle training using multi-process mode. 
+NOTE: your train program ***must*** run as distributed nccl2 mode, +see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2- +And your train program must read environment variables below in order to let different +process init properly: +FLAGS_selected_gpus +PADDLE_TRAINER_ID +PADDLE_CURRENT_ENDPOINT +PADDLE_TRAINERS_NUM +PADDLE_TRAINER_ENDPOINTS +POD_IP (current node ip address, not needed for local training) ''') #Optional arguments for the launch helper @@ -81,7 +81,7 @@ POD_IP (current node ip address, not needed for local training) "--log_level", type=int, default= - 20, # logging.INFO, details are here:https://docs.python.org/3/library/logging.html#levels + 20, # logging.INFO, details are here:https://docs.python.org/3/library/logging.html#levels help="Logging level, default is logging.INFO") parser.add_argument( diff --git a/python/paddle/fluid/tests/unittests/test_lu_op.py b/python/paddle/fluid/tests/unittests/test_lu_op.py index 917f2b1dbfd..95116ead309 100644 --- a/python/paddle/fluid/tests/unittests/test_lu_op.py +++ b/python/paddle/fluid/tests/unittests/test_lu_op.py @@ -213,7 +213,7 @@ class TestLUAPI(unittest.TestCase): tensor_shapes = [ (3, 5), (5, 5), - (5, 3), # 2-dim Tensors + (5, 3), # 2-dim Tensors (2, 3, 5), (3, 5, 5), (4, 5, 3), # 3-dim Tensors @@ -279,7 +279,7 @@ class TestLUAPI(unittest.TestCase): tensor_shapes = [ (3, 5), (5, 5), - (5, 3), # 2-dim Tensors + (5, 3), # 2-dim Tensors (2, 3, 5), (3, 5, 5), (4, 5, 3), # 3-dim Tensors diff --git a/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py b/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py index 89f406b0a2f..64f14a13700 100644 --- a/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py +++ b/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py @@ -225,7 +225,7 @@ class TestLU_UnpackAPI(unittest.TestCase): tensor_shapes = [ (3, 5), (5, 5), - (5, 3), # 2-dim Tensors + (5, 3), # 2-dim Tensors (2, 3, 5), (3, 5, 5), (4, 5, 3), # 3-dim Tensors @@ -282,7 +282,7 @@ class TestLU_UnpackAPI(unittest.TestCase): tensor_shapes = [ (3, 5), (5, 5), - (5, 3), # 2-dim Tensors + (5, 3), # 2-dim Tensors (2, 3, 5), (3, 5, 5), (4, 5, 3), # 3-dim Tensors diff --git a/python/paddle/fluid/tests/unittests/test_prune.py b/python/paddle/fluid/tests/unittests/test_prune.py index 6d7518f1b43..197de106230 100644 --- a/python/paddle/fluid/tests/unittests/test_prune.py +++ b/python/paddle/fluid/tests/unittests/test_prune.py @@ -194,7 +194,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): def test_prune_fetches_without_optimizer(self): """ - Prune operators and variables which are not needed to generate 'fetches'. + Prune operators and variables which are not needed to generate 'fetches'. """ program = framework.Program() startup_program = framework.Program() @@ -224,7 +224,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): def test_prune_fetches_with_optimizer(self): """ - Prune operators and operators which are not needed to generate 'fetches'. + Prune operators and operators which are not needed to generate 'fetches'. In train mode, the operators and operators in backward and optimization should be kept. """ program = framework.Program() @@ -345,7 +345,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): If in next run, the program, feed, fetch are not changed, Executor use the cached pruned program, and needn't to call _prune_program() to prune the program. 
In this test, we hack the Executor._prune_program with a mock function which do nothing but increase - Executor.prune_called_times, and we check prune_called_times equals 1 even if we called exe.run() + Executor.prune_called_times, and we check prune_called_times equals 1 even if we called exe.run() 10 times with the same input arguments. ''' with _mock_guard(mock): @@ -379,7 +379,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): def test_prune_with_cache_program2(self): ''' When use_prune=True, Executor should cache the pruned program. - If the only difference in fetch_list is optimize_ops during multiple runs, + If the only difference in fetch_list is optimize_ops during multiple runs, the cache_keys should be different and get different pruned program. ''' with _mock_guard(mock): @@ -435,7 +435,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): If in next run, the program, feed, fetch are not changed, Executor use the cached pruned program, and needn't to call _prune_program() to prune the program. In this test, we hack the Executor._prune_program with a mock function which do nothing but increase - Executor.prune_called_times, and we check prune_called_times equals 1 even if we called exe.run() + Executor.prune_called_times, and we check prune_called_times equals 1 even if we called exe.run() 10 times with the same input arguments. ''' with _mock_guard(mock): @@ -471,7 +471,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): def test_prune_with_multi_optimizers(self): ''' - If there are multiple optimizers in the program, we can run specific one by + If there are multiple optimizers in the program, we can run specific one by pass the return of optimize.minimize() to fetch_list. ''' exe = fluid.Executor(fluid.CPUPlace()) @@ -602,7 +602,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): def test_prune_program_with_tupe_in_fetch_list(self): ''' - If there are multiple optimizers in the program, we can run specific one by + If there are multiple optimizers in the program, we can run specific one by pass the return of optimize.minimize() to fetch_list. ''' exe = fluid.Executor(fluid.CPUPlace()) diff --git a/python/paddle/fluid/tests/unittests/test_svd_op.py b/python/paddle/fluid/tests/unittests/test_svd_op.py index 2594bea76dd..b784e22544d 100644 --- a/python/paddle/fluid/tests/unittests/test_svd_op.py +++ b/python/paddle/fluid/tests/unittests/test_svd_op.py @@ -91,7 +91,7 @@ class TestSvdOp(OpTest): check_eager=True) def test_check_grad(self): - """ + """ remember the input matrix must be the full rank matrix, otherwise the gradient will stochatic because the u / v 's (n-k) freedom vectors """ self.check_S_grad() @@ -106,7 +106,7 @@ class TestSvdCheckGrad2(TestSvdOp): no_need_check_grad = True def generate_input(self): - """ return a deterministic matrix, the range matrix; + """ return a deterministic matrix, the range matrix; vander matrix must be a full rank matrix. """ self._input_shape = (5, 5) @@ -117,7 +117,7 @@ class TestSvdCheckGrad2(TestSvdOp): class TestSvdNormalMatrixSmall(TestSvdCheckGrad2): def generate_input(self): - """ small matrix SVD. + """ small matrix SVD. """ self._input_shape = (1, 1) self._input_data = np.random.random(self._input_shape).astype("float64") @@ -126,7 +126,7 @@ class TestSvdNormalMatrixSmall(TestSvdCheckGrad2): class TestSvdNormalMatrix6x3(TestSvdCheckGrad2): def generate_input(self): - """ return a deterministic matrix, the range matrix; + """ return a deterministic matrix, the range matrix; vander matrix must be a full rank matrix. 
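The SVD gradient tests above repeatedly note that the input must be a full-rank matrix and build it from a Vandermonde construction. A quick sanity check of why that works, in plain NumPy with sample points chosen only for illustration:

    import numpy as np

    points = np.array([2.0, 3.0, 5.0, 7.0, 11.0])   # distinct sample points
    vander = np.vander(points)                      # 5 x 5 Vandermonde matrix
    # A square Vandermonde matrix over distinct points is invertible,
    # so its rank equals its size and the SVD gradient is well defined.
    assert np.linalg.matrix_rank(vander) == len(points)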
""" self._input_shape = (6, 3) @@ -139,7 +139,7 @@ class TestSvdNormalMatrix6x3(TestSvdCheckGrad2): class TestSvdNormalMatrix3x6(TestSvdCheckGrad2): def generate_input(self): - """ return a deterministic matrix, the range matrix; + """ return a deterministic matrix, the range matrix; vander matrix must be a full rank matrix. """ self._input_shape = (3, 6) @@ -169,7 +169,7 @@ class TestSvdNormalMatrix6x3Batched(TestSvdOp): class TestSvdNormalMatrix3x6Batched(TestSvdOp): def generate_input(self): - """ return a deterministic matrix, the range matrix; + """ return a deterministic matrix, the range matrix; vander matrix must be a full rank matrix. """ self._input_shape = (10, 3, 6) @@ -189,7 +189,7 @@ class TestSvdNormalMatrix3x6Batched(TestSvdOp): class TestSvdNormalMatrix3x3x3x6Batched(TestSvdOp): def generate_input(self): - """ return a deterministic matrix, the range matrix; + """ return a deterministic matrix, the range matrix; vander matrix must be a full rank matrix. """ self._input_shape = (3, 3, 3, 6) @@ -214,8 +214,8 @@ class TestSvdNormalMatrix3x3x3x6Batched(TestSvdOp): class TestSvdNormalMatrixBig(TestSvdOp): def generate_input(self): - """ big matrix SVD. - + """ big matrix SVD. + """ self._input_shape = (2, 200, 300) self._input_data = np.random.random(self._input_shape).astype("float64") @@ -232,7 +232,7 @@ class TestSvdNormalMatrixBig(TestSvdOp): class TestSvdNormalMatrixBig2(TestSvdOp): def generate_input(self): - """ big matrix SVD. + """ big matrix SVD. """ self._input_shape = (1, 100) self._input_data = np.random.random(self._input_shape).astype("float64") @@ -269,7 +269,7 @@ class TestSvdFullMatriceGrad(TestSvdNormalMatrix6x3): pass def test_check_grad(self): - """ + """ remember the input matrix must be the full rank matrix, otherwise the gradient will stochatic because the u / v 's (n-k) freedom vectors """ self.check_S_grad() diff --git a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py index 40f8bc3593a..d68179d033d 100644 --- a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py +++ b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py @@ -127,7 +127,7 @@ for _op_type in ['tril', 'triu']: class TestTrilTriuOpAPI(unittest.TestCase): - """ test case by using API and has -1 dimension + """ test case by using API and has -1 dimension """ def test_api(self): diff --git a/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py b/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py index f396e892ecf..a47ca33acd0 100755 --- a/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py +++ b/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py @@ -43,7 +43,7 @@ class BasicTokenizer(object): Tokenizes a piece of text using basic tokenizer. Args: text (str): A piece of text. - Returns: + Returns: list(str): A list of tokens. Examples: .. code-block:: @@ -389,7 +389,7 @@ class BertTokenizer(PretrainedTokenizer): End-to-end tokenization for BERT models. Args: text (str): The text to be tokenized. - + Returns: list: A list of string representing converted tokens. """ @@ -404,7 +404,7 @@ class BertTokenizer(PretrainedTokenizer): Converts a string to a list of tokens. Args: text (str): The text to be tokenized. - + Returns: List(str): A list of string representing converted tokens. 
Examples: @@ -412,7 +412,7 @@ class BertTokenizer(PretrainedTokenizer): from paddlenlp.transformers import BertTokenizer berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokens = berttokenizer.tokenize('He was a puppeteer') - + ''' ['he', 'was', 'a', 'puppet', '##eer'] ''' @@ -439,8 +439,8 @@ class BertTokenizer(PretrainedTokenizer): def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. - + adding special tokens. + A BERT sequence has the following format: - single sequence: ``[CLS] X [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]`` @@ -462,7 +462,7 @@ class BertTokenizer(PretrainedTokenizer): token_ids_0, token_ids_1=None): """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 @@ -495,7 +495,7 @@ class BertTokenizer(PretrainedTokenizer): A list of `inputs_ids` for the first sequence. token_ids_1 (List[int], optinal): Optional second list of IDs for sequence pairs. Defaults to None. - already_has_special_tokens (bool, optional): Whether or not the token list is already + already_has_special_tokens (bool, optional): Whether or not the token list is already formatted with special tokens for the model. Defaults to None. Returns: List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. diff --git a/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py b/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py index d2cf118b632..6f750e17bf2 100644 --- a/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py +++ b/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py @@ -343,7 +343,7 @@ class PretrainedTokenizer(object): @property def all_special_tokens(self): - """ + """ list: All the special tokens ('', ''...) corresponding to special token arguments in `__init__` (arguments end with '_end'). """ @@ -357,7 +357,7 @@ class PretrainedTokenizer(object): @property def all_special_ids(self): - """ + """ list: All the token ids corresponding to all the special tokens. """ all_toks = self.all_special_tokens @@ -504,7 +504,7 @@ class PretrainedTokenizer(object): `tokenizer_config_file` indicating file (thus `tokenizer_config.json`), and resources would be saved into `resource_files_names` indicating files by using `self.save_resources(save_directory)`. - + The `save_directory` can be used in `from_pretrained` as argument value of `pretrained_model_name_or_path` to re-load the tokenizer. Args: @@ -756,7 +756,7 @@ class PretrainedTokenizer(object): text (str, List[str] or List[int]): The sequence to be processed. One sequence is a string, a list of strings, or a list of integers depending on whether it has - been pretokenized and converted to ids. + been pretokenized and converted to ids. text_pair (str, List[str] or List[List[str]]): Same as `text` argument, while it represents for the latter sequence of the sequence pair. @@ -1208,7 +1208,7 @@ class PretrainedTokenizer(object): Input text. Returns: list: The offset map of input text. 
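The two docstrings above describe the same sequence-pair layout: ``[CLS] A [SEP] B [SEP]`` for the input ids and a 0/1 mask for the token type ids. A small sketch of that layout with made-up token ids (the real values come from the vocabulary):

    # Hypothetical ids, for illustration only.
    cls_id, sep_id = 101, 102
    token_ids_0 = [7592, 2088]           # first sequence, e.g. "hello world"
    token_ids_1 = [2129, 2024, 2017]     # second sequence, e.g. "how are you"

    input_ids = [cls_id] + token_ids_0 + [sep_id] + token_ids_1 + [sep_id]
    token_type_ids = [0] * (len(token_ids_0) + 2) + [1] * (len(token_ids_1) + 1)
    # input_ids:      [CLS]  A  A  [SEP]  B  B  B  [SEP]
    # token_type_ids:   0    0  0    0    1  1  1    1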
- + """ split_tokens = [] for token in self.basic_tokenizer.tokenize(text): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py index 3d1e4863292..90c18dbff2f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py @@ -35,7 +35,7 @@ paddle.enable_static() def box_coder(all_anchors, bbox_deltas, variances, pixel_offset=True): """ - Decode proposals by anchors and bbox_deltas from RPN + Decode proposals by anchors and bbox_deltas from RPN """ offset = 1 if pixel_offset else 0 # proposals: xmin, ymin, xmax, ymax diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 31d3c817d1e..6549a7dedc5 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -1044,14 +1044,14 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler def get_trainer_program(self, wait_port=True): """ - Get transpiled trainer side program. The program on trainer side compared with origin program + Get transpiled trainer side program. The program on trainer side compared with origin program has following difference: - Delete optimizer related op, because parameter updated on Pserver - - After the op which computed gradient of each parameter, add ``Send_op`` and ``Recv_op`` + - After the op which computed gradient of each parameter, add ``Send_op`` and ``Recv_op`` Args: - wait_port(bool): Whether to wait for the parameter server to be ready before returning to program, + wait_port(bool): Whether to wait for the parameter server to be ready before returning to program, default is True Returns: @@ -1179,10 +1179,10 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler def get_pserver_program(self, endpoint): """ - Get parameter server side program.The program on pserver side compared with origin program + Get parameter server side program.The program on pserver side compared with origin program has following difference: - - Only the following op is included: optimize-related op and communication-related op + - Only the following op is included: optimize-related op and communication-related op - NO.0 block only has variable definitions and ``listen_and_serv_op`` - Every variable which need to be updated has a unique block @@ -1450,7 +1450,7 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler def get_pserver_programs(self, endpoint): """ Get pserver side main program and startup program for distributed training. - The ``main_program`` returned by this function is consistent with the + The ``main_program`` returned by this function is consistent with the return value of the function ``get_pserver_program`` . Args: diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py index 090d0e8dcbb..638c0c5c4b7 100644 --- a/python/paddle/fluid/unique_name.py +++ b/python/paddle/fluid/unique_name.py @@ -87,13 +87,13 @@ def generate(key): names of the same key by numbering it from zero. For example, when key=fc, it continuously generates fc_0, fc_1, fc_2, etc. - Args: + Args: key(str): The prefix of generated name. - Returns: + Returns: str: A unique string with the prefix key. - Examples: + Examples: .. 
code-block:: python @@ -134,23 +134,23 @@ def generate_with_ignorable_key(key): def switch(new_generator=None, new_para_name_checker=None): """ Switch the namespace of in current context to a new namespace. Though - :code:`switch()` and :code:`guard()` can both change namespace, - :code:`guard()` is recommended since it can manage the context better + :code:`switch()` and :code:`guard()` can both change namespace, + :code:`guard()` is recommended since it can manage the context better together with :code:`with` statement. - Args: + Args: new_generator(UniqueNameGenerator, optional): A new UniqueNameGenerator, not required normally. Default is None, which means switch to a new anonymous namespace. new_para_name_checker(DygraphParameterNameChecker, optional): A new DygraphParameterNameChecker, - not required normally. Default is None, which means switch to a new parameter name + not required normally. Default is None, which means switch to a new parameter name checker. - Returns: + Returns: UniqueNameGenerator: The previous UniqueNameGenerator. DygraphParameterNameChecker: The previous DygraphParameterNameChecker - Examples: + Examples: .. code-block:: python @@ -190,16 +190,16 @@ def guard(new_generator=None): a new namespace in the context of :code:`with` will be created, and it will number names from zero again when calling :code:`generate()` with same key. - Args: + Args: new_generator(str|bytes, optional): New name of global namespace. Note that str - in Python2 was spilted into str and bytes in Python3, so here are two - types. Default is None. If not None, new_generator will be added into + in Python2 was spilted into str and bytes in Python3, so here are two + types. Default is None. If not None, new_generator will be added into the prefix of unique name generated by :code:`generate()`. - + Returns: None. - Examples: + Examples: .. code-block:: python diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index a0a778759b0..f16de1ce060 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -557,7 +557,7 @@ def _setitem_for_tensor_array(var, item, value): (1) int/Variable, which is a simple number/variable such as [1], [-2] (2) Slice, which is represented by bounds such as [2:-1] (3) Tuple, which includes the above two cases such as [2:-1, 1] - If item is case (1), we perform paddle.tensor.array_write, + If item is case (1), we perform paddle.tensor.array_write, in other cases, we raise a NotImplementedError. """ from ..framework import LayerHelper, core, _non_static_mode diff --git a/python/paddle/framework/dtype.py b/python/paddle/framework/dtype.py index 6abc8e6e1aa..d3517d7395c 100644 --- a/python/paddle/framework/dtype.py +++ b/python/paddle/framework/dtype.py @@ -39,7 +39,7 @@ bool = VarDesc.VarType.BOOL def iinfo(dtype): """ - paddle.iinfo is a function that returns an object that represents the numerical properties of + paddle.iinfo is a function that returns an object that represents the numerical properties of an integer paddle.dtype. This is similar to `numpy.iinfo `_. diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py index 41fd0c0703b..62cbeea8d42 100644 --- a/python/paddle/framework/framework.py +++ b/python/paddle/framework/framework.py @@ -94,7 +94,7 @@ def set_grad_enabled(mode): Examples: .. code-block:: python - + import paddle x = paddle.ones([3, 2]) x.stop_gradient = False @@ -127,9 +127,9 @@ def is_grad_enabled(): Examples: .. 
code-block:: python - + import paddle - + # Dygraph gradient calculation mode is enabled by default. paddle.is_grad_enabled() # True diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 09f3c512401..eef33397a7e 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -574,27 +574,27 @@ def _save_binary_var(obj, path): def save(obj, path, protocol=4, **configs): ''' Save an object to the specified path. - + .. note:: Now supports saving ``state_dict`` of Layer/Optimizer, Tensor and nested structure containing Tensor, Program. .. note:: - Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file, - there is no need to distinguish multiple saved files by adding a suffix. The argument ``path`` - of ``paddle.save`` will be directly used as the saved file name instead of a prefix. + Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file, + there is no need to distinguish multiple saved files by adding a suffix. The argument ``path`` + of ``paddle.save`` will be directly used as the saved file name instead of a prefix. In order to unify the saved file name format, we recommend using the paddle standard suffix: - 1. for ``Layer.state_dict`` , recommend to use ``.pdparams`` ; - 2. for ``Optimizer.state_dict`` , recommend to use ``.pdopt`` . + 1. for ``Layer.state_dict`` , recommend to use ``.pdparams`` ; + 2. for ``Optimizer.state_dict`` , recommend to use ``.pdopt`` . For specific examples, please refer to API code examples. - + Args: obj(Object) : The object to be saved. - path(str|BytesIO) : The path/buffer of the object to be saved. - If saved in the current directory, the input path string will be used as the file name. + path(str|BytesIO) : The path/buffer of the object to be saved. + If saved in the current directory, the input path string will be used as the file name. protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. Default: 4 **configs(dict, optional): optional keyword arguments. The following options are currently supported: - use_binary_format(bool): When the saved object is static graph variable, you can specify ``use_binary_for_var``. + use_binary_format(bool): When the saved object is static graph variable, you can specify ``use_binary_for_var``. If True, save the file in the c++ binary format when saving a single static graph variable; otherwise, save it in pickle format. Default: False @@ -687,7 +687,7 @@ def save(obj, path, protocol=4, **configs): paddle.save(state_dict, byio) tensor = paddle.randn([2, 3], dtype='float32') paddle.save(tensor, byio) - + ''' if _is_file_path(path): # 1. input check @@ -796,42 +796,42 @@ def load(path, **configs): Now supports loading ``state_dict`` of Layer/Optimizer, Tensor and nested structure containing Tensor, Program. .. note:: - In order to use the model parameters saved by paddle more efficiently, - ``paddle.load`` supports loading ``state_dict`` of Layer from the result of - other save APIs except ``paddle.save`` , but the argument ``path`` format is + In order to use the model parameters saved by paddle more efficiently, + ``paddle.load`` supports loading ``state_dict`` of Layer from the result of + other save APIs except ``paddle.save`` , but the argument ``path`` format is different: - 1. 
loading from ``paddle.static.save`` or ``paddle.Model().save(training=True)`` , - ``path`` needs to be a complete file name, such as ``model.pdparams`` or - ``model.pdopt`` ; - 2. loading from ``paddle.jit.save`` or ``paddle.static.save_inference_model`` - or ``paddle.Model().save(training=False)`` , ``path`` need to be a file prefix, - such as ``model/mnist``, and ``paddle.load`` will get information from + 1. loading from ``paddle.static.save`` or ``paddle.Model().save(training=True)`` , + ``path`` needs to be a complete file name, such as ``model.pdparams`` or + ``model.pdopt`` ; + 2. loading from ``paddle.jit.save`` or ``paddle.static.save_inference_model`` + or ``paddle.Model().save(training=False)`` , ``path`` need to be a file prefix, + such as ``model/mnist``, and ``paddle.load`` will get information from ``mnist.pdmodel`` and ``mnist.pdiparams`` ; - 3. loading from paddle 1.x APIs ``paddle.fluid.io.save_inference_model`` or - ``paddle.fluid.io.save_params/save_persistables`` , ``path`` need to be a + 3. loading from paddle 1.x APIs ``paddle.fluid.io.save_inference_model`` or + ``paddle.fluid.io.save_params/save_persistables`` , ``path`` need to be a directory, such as ``model`` and model is a directory. .. note:: - If you load ``state_dict`` from the saved result of static mode API such as - ``paddle.static.save`` or ``paddle.static.save_inference_model`` , - the structured variable name in dynamic mode will cannot be restored. - You need to set the argument ``use_structured_name=False`` when using + If you load ``state_dict`` from the saved result of static mode API such as + ``paddle.static.save`` or ``paddle.static.save_inference_model`` , + the structured variable name in dynamic mode will cannot be restored. + You need to set the argument ``use_structured_name=False`` when using ``Layer.set_state_dict`` later. Args: - path(str|BytesIO) : The path/buffer to load the target object. Generally, the path is the target - file path. When loading state_dict from the saved result of the API used to save + path(str|BytesIO) : The path/buffer to load the target object. Generally, the path is the target + file path. When loading state_dict from the saved result of the API used to save the inference model, the path may be a file prefix or directory. - **configs (dict, optional): other load configuration options for compatibility. We do not - recommend using these configurations, they may be removed in the future. If not necessary, + **configs (dict, optional): other load configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. The following options are currently supported: - (1) model_filename (str): The inference model file name of the paddle 1.x - ``save_inference_model`` save format. Default file name is :code:`__model__` . - (2) params_filename (str): The persistable variables file name of the paddle 1.x - ``save_inference_model`` save format. No default file name, save variables separately - by default. - (3) return_numpy(bool): If specified as True, return tensor as numpy.ndarray, otherwise return tensor as paddle.Tensor. + (1) model_filename (str): The inference model file name of the paddle 1.x + ``save_inference_model`` save format. Default file name is :code:`__model__` . + (2) params_filename (str): The persistable variables file name of the paddle 1.x + ``save_inference_model`` save format. No default file name, save variables separately + by default. 
+ (3) return_numpy(bool): If specified as True, return tensor as numpy.ndarray, otherwise return tensor as paddle.Tensor. Default False. Returns: diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py index 6c5ff2c8efb..50235591cf2 100644 --- a/python/paddle/framework/random.py +++ b/python/paddle/framework/random.py @@ -106,13 +106,13 @@ def set_cuda_rng_state(state_list): def _manual_program_seed(seed): """ Sets global seed for generating random numbers. - + NOTE(zhiqiu): This is the original implemention of seed. Keeps it temporally since CUDA generator is not developed, so we need it in the unittest. Args: seed(int): The random seed to set. It is recommend to set a large int number. - + Returns: None """ diff --git a/python/paddle/geometric/math.py b/python/paddle/geometric/math.py index 7a6db7d10aa..673b83f980e 100644 --- a/python/paddle/geometric/math.py +++ b/python/paddle/geometric/math.py @@ -32,9 +32,9 @@ def segment_sum(data, segment_ids, name=None): Args: data (Tensor): A tensor, available data type float32, float64, int32, int64, float16. segment_ids (Tensor): A 1-D tensor, which have the same size - with the first dimension of input data. + with the first dimension of input data. Available data type is int32, int64. - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -92,10 +92,10 @@ def segment_mean(data, segment_ids, name=None): Args: data (tensor): a tensor, available data type float32, float64, int32, int64, float16. - segment_ids (tensor): a 1-d tensor, which have the same size - with the first dimension of input data. + segment_ids (tensor): a 1-d tensor, which have the same size + with the first dimension of input data. available data type is int32, int64. - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -154,9 +154,9 @@ def segment_min(data, segment_ids, name=None): Args: data (tensor): a tensor, available data type float32, float64, int32, int64, float16. segment_ids (tensor): a 1-d tensor, which have the same size - with the first dimension of input data. + with the first dimension of input data. available data type is int32, int64. - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -215,9 +215,9 @@ def segment_max(data, segment_ids, name=None): Args: data (tensor): a tensor, available data type float32, float64, int32, int64, float16. segment_ids (tensor): a 1-d tensor, which have the same size - with the first dimension of input data. + with the first dimension of input data. available data type is int32, int64. - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
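For reference, a minimal usage sketch of the segment reduction documented above, with the expected result worked out by hand (assumes a Paddle build that ships paddle.geometric, as in this tree):

    import paddle

    data = paddle.to_tensor([[1., 2., 3.], [3., 2., 1.], [4., 5., 6.]], dtype='float32')
    segment_ids = paddle.to_tensor([0, 0, 1], dtype='int32')  # one id per row of data, non-decreasing
    out = paddle.geometric.segment_sum(data, segment_ids)
    # Rows 0 and 1 fall into segment 0, row 2 into segment 1:
    # out -> [[4., 4., 4.],
    #         [4., 5., 6.]]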
Returns: diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py index 03a272aa6af..f2b3029e39a 100644 --- a/python/paddle/geometric/message_passing/send_recv.py +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -34,9 +34,9 @@ def send_u_recv(x, Graph Learning message passing api. - This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory + This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` - to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor + to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor in different reduce ops, like sum, mean, max, or min. Besides, we can use `out_size` to set necessary output shape. .. code-block:: text @@ -65,20 +65,20 @@ def send_u_recv(x, x (Tensor): The input tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. - dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. - The available data type is int32, int64. + dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. + The available data type is int32, int64. reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. Default value is `sum`. - out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or + out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or out_size is smaller or equal to 0, then this input will not be used. - Otherwise, `out_size` should be equal with or larger than + Otherwise, `out_size` should be equal with or larger than max(dst_index) + 1. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - out (Tensor): The output tensor, should have the same shape and same dtype as input tensor `x`. - If `out_size` is set correctly, then it should have the same shape as `x` except + out (Tensor): The output tensor, should have the same shape and same dtype as input tensor `x`. + If `out_size` is set correctly, then it should have the same shape as `x` except the 0th dimension. Examples: @@ -174,10 +174,10 @@ def send_ue_recv(x, Graph Learning message passing api. - This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory + This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` - to gather the corresponding data, after computing with `y` in different message ops like add/sub/mul/div, then use `dst_index` to - update the corresponding position of output tensor in different reduce ops, like sum, mean, max, or min. + to gather the corresponding data, after computing with `y` in different message ops like add/sub/mul/div, then use `dst_index` to + update the corresponding position of output tensor in different reduce ops, like sum, mean, max, or min. Besides, we can use `out_size` to set necessary output shape. .. 
code-block:: text @@ -211,7 +211,7 @@ def send_ue_recv(x, y (Tensor): The input edge feature tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. - dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. + dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. @@ -337,8 +337,8 @@ def send_uv(x, y, src_index, dst_index, message_op="add", name=None): Graph Learning message passing api. - This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory - consumption in the process of message passing. Take `x` as the source node feature tensor, take `y` as + This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory + consumption in the process of message passing. Take `x` as the source node feature tensor, take `y` as the destination node feature tensor. Then we use `src_index` and `dst_index` to gather the corresponding data, and then compute the edge features in different message_ops like `add`, `sub`, `mul`, `div`. @@ -371,8 +371,8 @@ def send_uv(x, y, src_index, dst_index, message_op="add", name=None): x (Tensor): The source node feature tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version. y (Tensor): The destination node feature tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. - dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. - The available data type is int32, int64. + dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. + The available data type is int32, int64. message_op (str): Different message ops for x and y, including `add`, `sub`, `mul` and `div`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. diff --git a/python/paddle/geometric/reindex.py b/python/paddle/geometric/reindex.py index 9580ff5c4ee..a575e6916c4 100644 --- a/python/paddle/geometric/reindex.py +++ b/python/paddle/geometric/reindex.py @@ -33,17 +33,17 @@ def reindex_graph(x, This API is mainly used in Graph Learning domain, which should be used in conjunction with `graph_sample_neighbors` API. And the main purpose - is to reindex the ids information of the input nodes, and return the + is to reindex the ids information of the input nodes, and return the corresponding graph edges after reindex. - **Notes**: + **Notes**: The number in x should be unique, otherwise it would cause potential errors. - We will reindex all the nodes from 0. + We will reindex all the nodes from 0. Take input nodes x = [0, 1, 2] as an example. - If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], + If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], then we know that the neighbors of 0 is [8, 9], the neighbors of 1 - is [0, 4, 7], and the neighbors of 2 is [6, 7]. + is [0, 4, 7], and the neighbors of 2 is [6, 7]. Then after graph_reindex, we will have 3 different outputs: 1. 
reindex_src: [3, 4, 0, 5, 6, 7, 6] 2. reindex_dst: [0, 0, 1, 1, 1, 2, 2] @@ -56,17 +56,17 @@ def reindex_graph(x, data type is int32, int64. neighbors (Tensor): The neighbors of the input nodes `x`. The data type should be the same with `x`. - count (Tensor): The neighbor count of the input nodes `x`. And the + count (Tensor): The neighbor count of the input nodes `x`. And the data type should be int32. value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, and should be filled with -1. Only useful for gpu version. index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, and should be filled with -1. Only useful for gpu version. - `value_buffer` and `index_buffer` should be both not None + `value_buffer` and `index_buffer` should be both not None if you want to speed up by using hashtable buffer. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: reindex_src (Tensor): The source node index of graph edges after reindex. reindex_dst (Tensor): The destination node index of graph edges after reindex. @@ -75,7 +75,7 @@ def reindex_graph(x, nodes in the back. Examples: - + .. code-block:: python import paddle @@ -156,7 +156,7 @@ def reindex_heter_graph(x, **Notes**: The number in x should be unique, otherwise it would cause potential errors. - We support multi-edge-types neighbors reindexing in reindex_heter_graph api. + We support multi-edge-types neighbors reindexing in reindex_heter_graph api. We will reindex all the nodes from 0. Take input nodes x = [0, 1, 2] as an example. @@ -169,14 +169,14 @@ def reindex_heter_graph(x, We will get following outputs: 1. reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1] 2. reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] - 3. out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] + 3. out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] Args: x (Tensor): The input nodes which we sample neighbors for. The available data type is int32, int64. - neighbors (list|tuple): The neighbors of the input nodes `x` from different graphs. + neighbors (list|tuple): The neighbors of the input nodes `x` from different graphs. The data type should be the same with `x`. - count (list|tuple): The neighbor counts of the input nodes `x` from different graphs. + count (list|tuple): The neighbor counts of the input nodes `x` from different graphs. And the data type should be int32. value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, and should be filled with -1. Only useful for gpu version. diff --git a/python/paddle/geometric/sampling/neighbors.py b/python/paddle/geometric/sampling/neighbors.py index a9619d54a85..62c01a11c69 100644 --- a/python/paddle/geometric/sampling/neighbors.py +++ b/python/paddle/geometric/sampling/neighbors.py @@ -33,13 +33,13 @@ def sample_neighbors(row, Graph Sample Neighbors API. This API is mainly used in Graph Learning domain, and the main purpose is to - provide high performance of graph sampling method. For example, we get the - CSC(Compressed Sparse Column) format of the input graph edges as `row` and + provide high performance of graph sampling method. For example, we get the + CSC(Compressed Sparse Column) format of the input graph edges as `row` and `colptr`, so as to convert graph data into a suitable format for sampling. 
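The worked numbers in the reindex_graph docstring above translate directly into a call; a minimal sketch (assumes a Paddle build that ships paddle.geometric, as in this tree):

    import paddle

    x = paddle.to_tensor([0, 1, 2], dtype='int64')
    neighbors = paddle.to_tensor([8, 9, 0, 4, 7, 6, 7], dtype='int64')
    count = paddle.to_tensor([2, 3, 2], dtype='int32')

    reindex_src, reindex_dst, out_nodes = paddle.geometric.reindex_graph(x, neighbors, count)
    # reindex_src -> [3, 4, 0, 5, 6, 7, 6]
    # reindex_dst -> [0, 0, 1, 1, 1, 2, 2]
    # out_nodes   -> [0, 1, 2, 8, 9, 4, 7, 6]   (input nodes first, new neighbors appended)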
- `input_nodes` means the nodes we need to sample neighbors, and `sample_sizes` + `input_nodes` means the nodes we need to sample neighbors, and `sample_sizes` means the number of neighbors and number of layers we want to sample. - Besides, we support fisher-yates sampling in GPU version. + Besides, we support fisher-yates sampling in GPU version. Args: row (Tensor): One of the components of the CSC format of the input graph, and @@ -50,10 +50,10 @@ def sample_neighbors(row, The data type should be the same with `row`. input_nodes (Tensor): The input nodes we need to sample neighbors for, and the data type should be the same with `row`. - sample_size (int): The number of neighbors we need to sample. Default value is -1, + sample_size (int): The number of neighbors we need to sample. Default value is -1, which means returning all the neighbors of the input nodes. eids (Tensor): The eid information of the input graph. If return_eids is True, - then `eids` should not be None. The data type should be the + then `eids` should not be None. The data type should be the same with `row`. Default is None. return_eids (bool): Whether to return eid information of sample edges. Default is False. perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `use_perm_buffer` @@ -67,7 +67,7 @@ def sample_neighbors(row, out_neighbors (Tensor): The sample neighbors of the input nodes. out_count (Tensor): The number of sampling neighbors of each input node, and the shape should be the same with `input_nodes`. - out_eids (Tensor): If `return_eids` is True, we will return the eid information of the + out_eids (Tensor): If `return_eids` is True, we will return the eid information of the sample edges. Examples: @@ -83,7 +83,7 @@ def sample_neighbors(row, colptr = paddle.to_tensor(colptr, dtype="int64") nodes = paddle.to_tensor(nodes, dtype="int64") out_neighbors, out_count = \ - paddle.geometric.sample_neighbors(row, colptr, nodes, + paddle.geometric.sample_neighbors(row, colptr, nodes, sample_size=sample_size) """ diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py index 1ba33a6b52b..7bae53370ee 100644 --- a/python/paddle/hapi/callbacks.py +++ b/python/paddle/hapi/callbacks.py @@ -133,7 +133,7 @@ class Callback(object): Examples: .. code-block:: python - + import paddle # build a simple model checkpoint callback @@ -306,7 +306,7 @@ class ProgBarLogger(Callback): the logs such as loss, metrics are printed. Default: 1. verbose (int): The verbosity mode, should be 0, 1, or 2. 0 = silent, 1 = progress bar, 2 = one line each printing, 3 = 2 + - time counter, such as average reader cost, samples per second. + time counter, such as average reader cost, samples per second. Default: 2. Examples: @@ -598,7 +598,7 @@ class ModelCheckpoint(Callback): class LRScheduler(Callback): """Lr scheduler callback function - + Args: by_step(bool, optional): whether to update learning rate scheduler by step. Default: True. 
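The callback classes whose docstrings are touched above are normally passed together to Model.fit. A minimal end-to-end sketch assembled from the LeNet/MNIST pieces quoted in these docstrings (downloads MNIST on first run; hyperparameters are illustrative):

    import paddle
    import paddle.vision.transforms as T
    from paddle.static import InputSpec
    from paddle.vision.datasets import MNIST
    from paddle.vision.models import LeNet

    transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
    train_dataset = MNIST(mode='train', transform=transform)

    inputs = [InputSpec([None, 1, 28, 28], 'float32', 'image')]
    labels = [InputSpec([None, 1], 'int64', 'label')]
    model = paddle.Model(LeNet(), inputs, labels)
    lr = paddle.optimizer.lr.StepDecay(1e-3, step_size=1000)
    optim = paddle.optimizer.Adam(learning_rate=lr, parameters=model.parameters())
    model.prepare(optim, paddle.nn.CrossEntropyLoss(), paddle.metric.Accuracy())

    callbacks = [
        paddle.callbacks.ProgBarLogger(log_freq=100, verbose=2),            # periodic loss/metric logging
        paddle.callbacks.ModelCheckpoint(save_freq=1, save_dir='./checkpoints'),
        paddle.callbacks.LRScheduler(by_step=True),                         # step the lr scheduler per batch
    ]
    model.fit(train_dataset, batch_size=64, epochs=1, callbacks=callbacks)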
@@ -628,7 +628,7 @@ class LRScheduler(Callback): base_lr = 1e-3 boundaries = [5, 8] wamup_steps = 4 - + def make_optimizer(parameters=None): momentum = 0.9 weight_decay = 5e-4 @@ -647,13 +647,13 @@ class LRScheduler(Callback): momentum=momentum, parameters=parameters) return optimizer - + optim = make_optimizer(parameters=lenet.parameters()) model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy()) - # if LRScheduler callback not set, an instance LRScheduler update by step + # if LRScheduler callback not set, an instance LRScheduler update by step # will be created auto. model.fit(train_dataset, batch_size=64) @@ -690,7 +690,7 @@ class LRScheduler(Callback): class EarlyStopping(Callback): """Stop training when the given monitor stopped improving during evaluation by setting `model.stop_training=True`. - + Args: monitor(str): Quantity to be monitored. Default: 'loss'. mode(str|None): Mode should be one of 'auto', 'min' or 'max'. In 'min' @@ -711,7 +711,7 @@ class EarlyStopping(Callback): Training will stop if the model doesn't show improvement over the baseline. Default: None. save_best_model(bool): Whether to save best model. Default: True. - + Examples: .. code-block:: python @@ -872,7 +872,7 @@ class VisualDL(Callback): model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy()) - + ## uncomment following lines to fit model with visualdl callback function # callback = paddle.callbacks.VisualDL(log_dir='visualdl_log_dir') # model.fit(train_dataset, eval_dataset, batch_size=64, callbacks=callback) @@ -963,7 +963,7 @@ class ReduceLROnPlateau(Callback): of 2-10 once learning stagnates. This callback monitors a quantity and if no improvement is seen for a 'patience' number of epochs, the learning rate is reduced. - + Args: monitor(str, optional): Quantity to be monitored. Default: 'loss'. factor(float, optional): factor by which the learning rate will be reduced. @@ -973,21 +973,21 @@ class ReduceLROnPlateau(Callback): verbose(int, optional): The verbosity mode. 0: quiet, 1: update messages. Default: 1. mode(str, optional): one of `{'auto', 'min', 'max'}`. In `'min'` mode, - the learning rate will be reduced when the quantity monitored has - stopped decreasing. In 'max' mode, learning rate will reduce until - monitored quantity stops increasing. In 'auto' mode, exact mode - can be inferred by the name of monitor. If 'acc' in monitor, the - mode will be considered as 'max', otherwise the mode will be set + the learning rate will be reduced when the quantity monitored has + stopped decreasing. In 'max' mode, learning rate will reduce until + monitored quantity stops increasing. In 'auto' mode, exact mode + can be inferred by the name of monitor. If 'acc' in monitor, the + mode will be considered as 'max', otherwise the mode will be set to 'min'. Default: 'auto'. - min_delta(int|float, optional): threshold for measuring the new optimum, + min_delta(int|float, optional): threshold for measuring the new optimum, to only focus on significant changes. Default: 0. cooldown(int, optional): number of epochs to wait before resuming normal operation after lr has been reduced. Default: 0. min_lr(float, optional): lower bound on the learning rate. Default: 0. - + Examples: .. 
code-block:: python - + import paddle from paddle import Model from paddle.static import InputSpec @@ -995,7 +995,7 @@ class ReduceLROnPlateau(Callback): from paddle.vision.datasets import MNIST from paddle.metric import Accuracy from paddle.nn.layer.loss import CrossEntropyLoss - import paddle.vision.transforms as T + import paddle.vision.transforms as T sample_num = 200 transform = T.Compose( [T.Transpose(), T.Normalize([127.5], [127.5])]) @@ -1003,14 +1003,14 @@ class ReduceLROnPlateau(Callback): val_dataset = MNIST(mode='test', transform=transform) net = LeNet() optim = paddle.optimizer.Adam( - learning_rate=0.001, parameters=net.parameters()) + learning_rate=0.001, parameters=net.parameters()) inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')] - labels = [InputSpec([None, 1], 'int64', 'label')] + labels = [InputSpec([None, 1], 'int64', 'label')] model = Model(net, inputs=inputs, labels=labels) model.prepare( optim, loss=CrossEntropyLoss(), - metrics=[Accuracy()]) + metrics=[Accuracy()]) callbacks = paddle.callbacks.ReduceLROnPlateau(patience=3, verbose=1) model.fit(train_dataset, val_dataset, @@ -1019,7 +1019,7 @@ class ReduceLROnPlateau(Callback): save_freq=10, epochs=20, callbacks=[callbacks]) - + """ def __init__(self, diff --git a/python/paddle/hapi/dynamic_flops.py b/python/paddle/hapi/dynamic_flops.py index 214af9f2f59..8e2bed5bc74 100644 --- a/python/paddle/hapi/dynamic_flops.py +++ b/python/paddle/hapi/dynamic_flops.py @@ -26,11 +26,11 @@ def flops(net, input_size, custom_ops=None, print_detail=False): """Print a table about the FLOPs of network. Args: - net (paddle.nn.Layer||paddle.static.Program): The network which could be a instance of paddle.nn.Layer in + net (paddle.nn.Layer||paddle.static.Program): The network which could be a instance of paddle.nn.Layer in dygraph or paddle.static.Program in static graph. input_size (list): size of input tensor. Note that the batch_size in argument ``input_size`` only support 1. - custom_ops (A dict of function, optional): A dictionary which key is the class of specific operation such as - paddle.nn.Conv2D and the value is the function used to count the FLOPs of this operation. This + custom_ops (A dict of function, optional): A dictionary which key is the class of specific operation such as + paddle.nn.Conv2D and the value is the function used to count the FLOPs of this operation. This argument only work when argument ``net`` is an instance of paddle.nn.Layer. The details could be found in following example code. Default is None. print_detail (bool, optional): Whether to print the detail information, like FLOPs per layer, about the net FLOPs. diff --git a/python/paddle/hapi/hub.py b/python/paddle/hapi/hub.py index 3217059c647..42e6b7c3082 100644 --- a/python/paddle/hapi/hub.py +++ b/python/paddle/hapi/hub.py @@ -176,9 +176,9 @@ def list(repo_dir, source='github', force_reload=False): github path (str): a str with format "repo_owner/repo_name[:tag_name]" with an optional tag/branch. The default branch is `main` if not specified. - + local path (str): local repo path - + source (str): `github` | `gitee` | `local`, default is `github`. force_reload (bool, optional): whether to discard the existing cache and force a fresh download, default is `False`. Returns: @@ -222,9 +222,9 @@ def help(repo_dir, model, source='github', force_reload=False): github path (str): a str with format "repo_owner/repo_name[:tag_name]" with an optional tag/branch. The default branch is `main` if not specified. - + local path (str): local repo path. 
- + model (str): model name. source (str): `github` | `gitee` | `local`, default is `github`. force_reload (bool, optional): default is `False`. @@ -268,7 +268,7 @@ def load(repo_dir, model, source='github', force_reload=False, **kwargs): tag/branch. The default branch is `main` if not specified. local path (str): local repo path. - + model (str): model name. source (str): `github` | `gitee` | `local`, default is `github`. force_reload (bool, optional): default is `False`. diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 16b3646a4a8..eeea53df3ed 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -921,7 +921,7 @@ class Model(object): instantiating a Model. The input description, i.e, paddle.static.InputSpec, must be required for static graph. - When training on GPU, auto mixed precision (AMP O1) and pure float16 + When training on GPU, auto mixed precision (AMP O1) and pure float16 (AMP O2) training are both supported in static mode and dynamic mode. In static graph mode, before training with pure float16 (AMP O2), `multi_precision` could be set to True when creating optimizer, which can @@ -966,7 +966,7 @@ class Model(object): # inputs and labels are not required for dynamic graph. input = InputSpec([None, 784], 'float32', 'x') label = InputSpec([None, 1], 'int64', 'label') - + model = paddle.Model(net, input, label) optim = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters()) @@ -1058,12 +1058,12 @@ class Model(object): whether optimizer update gradients computing by this batch. Args: - inputs (numpy.ndarray|Tensor|list): Batch of input data. It could - be a numpy array or paddle.Tensor, or a list of arrays or + inputs (numpy.ndarray|Tensor|list): Batch of input data. It could + be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple inputs). - labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be - a numpy array or paddle.Tensor, or a list of arrays or tensors - (in case the model has multiple labels). If has no labels, + labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be + a numpy array or paddle.Tensor, or a list of arrays or tensors + (in case the model has multiple labels). If has no labels, set None. Default: None. update (bool, optional): Whether update parameters after loss.backward() computing. Set it to False to accumulate gradients. Default: True. @@ -1076,7 +1076,7 @@ class Model(object): Examples: .. code-block:: python - + import paddle import paddle.nn as nn from paddle.static import InputSpec @@ -1111,12 +1111,12 @@ class Model(object): Run one evaluating step on a batch of data. Args: - inputs (numpy.ndarray|Tensor|list): Batch of input data. It could - be a numpy array or paddle.Tensor, or a list of arrays or + inputs (numpy.ndarray|Tensor|list): Batch of input data. It could + be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple inputs). - labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be - a numpy array or paddle.Tensor, or a list of arrays or tensors - (in case the model has multiple labels). If has no labels, + labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be + a numpy array or paddle.Tensor, or a list of arrays or tensors + (in case the model has multiple labels). If has no labels, set None. Default: None. Returns: @@ -1163,8 +1163,8 @@ class Model(object): Run one predicting step on a batch of data. 
Args: - inputs (numpy.ndarray|Tensor|list): Batch of input data. It could - be a numpy array or paddle.Tensor, or a list of arrays or + inputs (numpy.ndarray|Tensor|list): Batch of input data. It could + be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple inputs). Returns: @@ -1180,7 +1180,7 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - + input = InputSpec([None, 784], 'float32', 'x') label = InputSpec([None, 1], 'int64', 'label') @@ -1205,12 +1205,12 @@ class Model(object): return loss def save(self, path, training=True): - """ - This function saves parameters, optimizer information or model and + """ + This function saves parameters, optimizer information or model and paramters only for inference to path. It depends on the parameter `training`. - If `training` is set to True, the parameters saved contain all + If `training` is set to True, the parameters saved contain all the trainable Variable, will save to a file with suffix ".pdparams". The optimizer information contains all the variable used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. All the @@ -1269,7 +1269,7 @@ class Model(object): T.Normalize([127.5], [127.5]) ]) data = paddle.vision.datasets.MNIST(mode='train', transform=transform) - + model.fit(data, epochs=1, batch_size=32, verbose=0) model.save('checkpoint/test') # save for training model.save('inference_model', False) # save for inference @@ -1399,13 +1399,13 @@ class Model(object): Examples: .. code-block:: python - + import paddle import paddle.nn as nn from paddle.static import InputSpec input = InputSpec([None, 784], 'float32', 'x') - + model = paddle.Model(nn.Sequential( nn.Linear(784, 200), nn.Tanh(), @@ -1592,14 +1592,14 @@ class Model(object): evaluation will be done at the end of each epoch. Args: - train_data (Dataset|DataLoader, optional): An iterable data loader is used for - train. An instance of paddle paddle.io.Dataset or + train_data (Dataset|DataLoader, optional): An iterable data loader is used for + train. An instance of paddle paddle.io.Dataset or paddle.io.Dataloader is recomended. Default: None. eval_data (Dataset|DataLoader, optional): An iterable data loader is used for - evaluation at the end of epoch. If None, will not do evaluation. - An instance of paddle.io.Dataset or paddle.io.Dataloader + evaluation at the end of epoch. If None, will not do evaluation. + An instance of paddle.io.Dataset or paddle.io.Dataloader is recomended. Default: None. - batch_size (int, optional): The batch size of train_data and eval_data. When + batch_size (int, optional): The batch size of train_data and eval_data. When train_data and eval_data are both the instance of Dataloader, this parameter will be ignored. Default: 1. epochs (int, optional): The number of epochs to train the model. Default: 1. @@ -1627,7 +1627,7 @@ class Model(object): callbacks (Callback|None, optional): A list of `Callback` instances to apply during training. If None, :ref:`api_paddle_callbacks_ProgBarLogger` and :ref:`api_paddle_callbacks_ModelCheckpoint` are automatically inserted. Default: None. - accumulate_grad_batches (int, optional): The number of batches to accumulate gradident + accumulate_grad_batches (int, optional): The number of batches to accumulate gradident during training process before optimizer updates. It can mimic large batch size. Default: 1. num_iters (int|None, optional): The number of iterations to evaluate the model. 
@@ -1692,7 +1692,7 @@ class Model(object): dynamic = True if not dynamic: paddle.enable_static() - + transform = T.Compose([ T.Transpose(), T.Normalize([127.5], [127.5]) @@ -1812,7 +1812,7 @@ class Model(object): Args: eval_data (Dataset|DataLoader): An iterable data loader is used for - evaluation. An instance of paddle.io.Dataset or + evaluation. An instance of paddle.io.Dataset or paddle.io.Dataloader is recomended. batch_size (int, optional): The batch size of train_data and eval_data. When eval_data is the instance of Dataloader, this argument will be @@ -1920,7 +1920,7 @@ class Model(object): is recomended. batch_size (int, optional): The batch size of test_data. When test_data is the instance of Dataloader, this argument will be ignored. Default: 1. - num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess + num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess used and loading data in main process. When test_data is the instance of Dataloader, this argument will be ignored. Default: 0. stack_outputs (bool, optional): Whether stack output field like a batch, as for an output @@ -2159,10 +2159,10 @@ class Model(object): """Prints a string summary of the network. Args: - input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. - if not set, input_size will get from ``self._inputs`` if network only have - one input, input_size can be tuple or InputSpec. if model have multiple - input, input_size must be a list which contain every input's shape. + input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. + if not set, input_size will get from ``self._inputs`` if network only have + one input, input_size can be tuple or InputSpec. if model have multiple + input, input_size must be a list which contain every input's shape. Default: None. dtype (str, optional): if dtype is None, 'float32' will be used, Default: None. diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index 6928bc75f5f..3822d13b989 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -30,11 +30,11 @@ def summary(net, input_size=None, dtypes=None, input=None): Args: net (Layer): the network which must be a subinstance of Layer. - input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. if model only + input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. if model only have one input, input_size can be tuple or InputSpec. if model - have multiple input, input_size must be a list which contain + have multiple input, input_size must be a list which contain every input's shape. Note that input_size only dim of - batch_size can be None or -1. Default: None. Note that + batch_size can be None or -1. Default: None. Note that input_size and input cannot be None at the same time. dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None. input: the input tensor. if input is given, input_size and dtype will be ignored, Default: None. 
@@ -92,10 +92,10 @@ def summary(net, input_size=None, dtypes=None, input=None): x = paddle.flatten(x, 1) x = self.fc(x + y) return x - + lenet_multi_input = LeNetMultiInput() - params_info = paddle.summary(lenet_multi_input, [(1, 1, 28, 28), (1, 400)], + params_info = paddle.summary(lenet_multi_input, [(1, 1, 28, 28), (1, 400)], dtypes=['float32', 'float32']) print(params_info) @@ -109,7 +109,7 @@ def summary(net, input_size=None, dtypes=None, input=None): x = paddle.flatten(x, 1) x = self.fc(x + inputs[1]) return x - + lenet_list_input = LeNetListInput() input_data = [paddle.rand([1, 1, 28, 28]), paddle.rand([1, 400])] params_info = paddle.summary(lenet_list_input, input=input_data) diff --git a/python/paddle/hapi/static_flops.py b/python/paddle/hapi/static_flops.py index 297199b7326..78630c1c2e0 100644 --- a/python/paddle/hapi/static_flops.py +++ b/python/paddle/hapi/static_flops.py @@ -79,7 +79,7 @@ class GraphWrapper(object): for paddle slim framework. Args: - program(framework.Program): A program with + program(framework.Program): A program with in_nodes(dict): A dict to indicate the input nodes of the graph. The key is user-defined and human-readable name. The value is the name of Variable. diff --git a/python/paddle/incubate/autograd/functional.py b/python/paddle/incubate/autograd/functional.py index 79ba7d33097..b96a4633c1e 100644 --- a/python/paddle/incubate/autograd/functional.py +++ b/python/paddle/incubate/autograd/functional.py @@ -39,7 +39,7 @@ def vjp(func, xs, v=None): Returns: output(tuple): - + - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . - vjp(Tensor|tuple[Tensor]): The vjp result. @@ -94,7 +94,7 @@ def jvp(func, xs, v=None): Sequence of Tensors. v(Tensor|Sequence[Tensor]|None, Optional): The tangent vector invovled in the JVP computation. The ``v`` matches the size and shape of - ``xs`` . Default value is None and in this case is equivalent to + ``xs`` . Default value is None and in this case is equivalent to all ones the same size of ``xs`` . Returns: @@ -154,7 +154,7 @@ def _double_backward_trick(ys, xs, v): def _zeros_like_with_grad(xs): - """Create a zero or zeros sequence Tensor like ``xs`` with a flag + """Create a zero or zeros sequence Tensor like ``xs`` with a flag ``stop_graident=False`` . """ if not isinstance(xs, typing.Sequence): @@ -173,17 +173,17 @@ class Jacobian(object): r""" Computes the Jacobian matrix of a given function. - If the function has multiple inputs and multiple outputs, during internal - implementation, all input tensors are concatenated after being flatten, - the batch dimension is retained, and the output is subject to the same + If the function has multiple inputs and multiple outputs, during internal + implementation, all input tensors are concatenated after being flatten, + the batch dimension is retained, and the output is subject to the same processing rules. - Once the Jacobian ``J`` is constructed, you can use a multidimensional index - to retrieve the submatrix of ``J``, as same as slicing a Tensor. The - submatrix is lazily evaluated along row axis, and will be cached once + Once the Jacobian ``J`` is constructed, you can use a multidimensional index + to retrieve the submatrix of ``J``, as same as slicing a Tensor. The + submatrix is lazily evaluated along row axis, and will be cached once evaluated. - For examples, supposing ``is_batched=True``, you can retrieve the submatrix + For examples, supposing ``is_batched=True``, you can retrieve the submatrix by following methods: * J[:], retrieving the full matrix. 
@@ -203,11 +203,11 @@ class Jacobian(object): Args: - func (Callable): A python function that takes a Tensor or a sequence of + func (Callable): A python function that takes a Tensor or a sequence of Tensors as inputs(the first dimension is batch size) and returns a Tensor a sequence of Tensors. xs (Tensor|Sequence[Tensor]): The input to the function ``func`` . - is_batched (bool): If true, the first axis is batch axis. Defaults to + is_batched (bool): If true, the first axis is batch axis. Defaults to False. Returns: @@ -263,11 +263,11 @@ class Hessian(object): """ Computes the Hessian matrix with a given ``func`` with respect to ``xs`` . - If the function has multiple inputs, during internal implementation, - all input tensors are concatenated after being flatten, the batch dimension + If the function has multiple inputs, during internal implementation, + all input tensors are concatenated after being flatten, the batch dimension is retained. - The Hessian submatrix is lazily evaluated, and can be retrieved with a + The Hessian submatrix is lazily evaluated, and can be retrieved with a multidimensional indexes. See details ``Jacobian`` . Warning: @@ -275,11 +275,11 @@ class Hessian(object): Args: func (Callable): A python function that takes a Tensor or a Tensor - sequence as inputs and returns a Tensor with shape + sequence as inputs and returns a Tensor with shape ``[batch_size, 1]`` with batch or ``[1]`` without batch. - xs (Tensor|Sequence(Tensor)): The input Tensor or Tensor sequence of + xs (Tensor|Sequence(Tensor)): The input Tensor or Tensor sequence of the function ``func``. - is_batched (bool): If true, the first axis is batch axis. Defaults to + is_batched (bool): If true, the first axis is batch axis. Defaults to False. Returns: @@ -334,20 +334,20 @@ class Hessian(object): class _Jacobian(object): """The base class for computing Jacobian matrix. - ``_Jacobian`` implementes the core logic of multidimensional index and lazy - evaluation for Jacobian matrix, subclass only need to overwrite following + ``_Jacobian`` implementes the core logic of multidimensional index and lazy + evaluation for Jacobian matrix, subclass only need to overwrite following methods: - * ``_lazy_axis()``, return the axis along which will be lazy + * ``_lazy_axis()``, return the axis along which will be lazy evaluating. * ``_flatten(xs)``, flattens the inputs ``xs``. * ``_evaluate(index)``, evaluates one slice along ``_lazy_axis`` . Notes: - Because currently PaddlePaddle only support reverse differentiation by - ``paddle.grad``, so lazy evaluation is only supported along the row of - Jacobian matrix, which means that slicing along row will get better + Because currently PaddlePaddle only support reverse differentiation by + ``paddle.grad``, so lazy evaluation is only supported along the row of + Jacobian matrix, which means that slicing along row will get better performance. """ @@ -420,7 +420,7 @@ class _Jacobian(object): class _JacobianNoBatch(_Jacobian): """Compute Jacobian matrix without batch dimension. - Suppose the mapping is :math:`f: R^M \to R^N`, the output shape is + Suppose the mapping is :math:`f: R^M \to R^N`, the output shape is ``(N, M)`` . """ @@ -447,7 +447,7 @@ class _JacobianNoBatch(_Jacobian): class _JacobianBatchFirst(_Jacobian): """Compute Jacobian matrix with batch at first axis. - Suppose the mapping is :math:`f: R^{B,M} \to R^{B,N}`, the output shape is + Suppose the mapping is :math:`f: R^{B,M} \to R^{B,N}`, the output shape is ``(B, N, M)`` . 
""" @@ -475,13 +475,13 @@ def _multi_index(indexes, shape): """A tool for parsing N-dimensional index into a standard format. Currently supporting following input format: - * ([positive|negative|slice], ...), the right-most elements can be + * ([positive|negative|slice], ...), the right-most elements can be omited. The standard format after converted is slice tuple which contains N elements: * ([positive|slice], ..., [positive|slice]) - Notes: + Notes: Ellipsis indexes such as ``(..., i), (i, ...)`` is not supported. Args: @@ -539,8 +539,8 @@ def _grad(ys, xs, v=None): none in outputs will be replaced by zero tensor. * The ``create_graph`` flag is removed and set defaults to true internally, only makes sense in dynamic graph. - * When xs is a single Tensor, ``paddle.grad`` returns a list which only - contains one Tensor. It may confuse users, thus in this case we improve + * When xs is a single Tensor, ``paddle.grad`` returns a list which only + contains one Tensor. It may confuse users, thus in this case we improve to return a single Tensor in _grad interface. Args: @@ -559,9 +559,9 @@ def _grad(ys, xs, v=None): grad_outputs is a Tensor. Default None. Returns: - Tensor|tuple[Tensor]: Tensor or a tuple of Tensors, whose length is the - same as the Tensor number inside inputs, and the i-th returned - Tensor is the sum of gradients of outputs with respect to the i-th + Tensor|tuple[Tensor]: Tensor or a tuple of Tensors, whose length is the + same as the Tensor number inside inputs, and the i-th returned + Tensor is the sum of gradients of outputs with respect to the i-th inputs. """ if paddle.fluid._non_static_mode(): @@ -579,21 +579,21 @@ def _grad(ys, xs, v=None): def _separate(xs): """ - ``_separate`` separates ``xs`` from the computation graph through ``clone`` + ``_separate`` separates ``xs`` from the computation graph through ``clone`` or ``deteach`` . - Interally, ``paddle.grad(xs, ys)`` is stateful API implemented based on + Interally, ``paddle.grad(xs, ys)`` is stateful API implemented based on computional graph, which will reduce gradients along all path from ys to xs. - However, funcional autograd API such as ``vjp``, ``jvp`` is stateless, and + However, funcional autograd API such as ``vjp``, ``jvp`` is stateless, and only compute gradients with a given ``func`` . For example, given a ``func`` :math:`y0=f(x0)`, supposing forward path is: ``x0 -> y0``, ``x0 -> x1 -> y0`` . - ``paddle.grad(y0, x0)`` will reduce gradients along ``y0->x0`` and + ``paddle.grad(y0, x0)`` will reduce gradients along ``y0->x0`` and ``y0->x1->x0``, and ``vjp`` only need reduce along ``y0->x0``. - So, it's needed to clone or detach xs for breaking the dependencies with + So, it's needed to clone or detach xs for breaking the dependencies with other variables. Examples: diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index ba7a2537df1..8806122b5a2 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -28,8 +28,8 @@ def forward_grad(outputs, inputs, grad_inputs=None): Args: outputs(Tensor|Sequence[Tensor]): The output tensor or tensors. inputs(Tensor|Sequence[Tensor]): The input tensor or tensors. 
- grad_inputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or - Tensors of inputs which has the same shape with inputs, Defaults to + grad_inputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or + Tensors of inputs which has the same shape with inputs, Defaults to None, in this case is equivalent to all ones. Returns: @@ -50,7 +50,7 @@ def forward_grad(outputs, inputs, grad_inputs=None): with paddle.static.program_guard(main_program, startup_program): x = paddle.static.data('x', shape=[1], dtype='float32') - y = x * x + y = x * x y_grad = paddle.incubate.autograd.forward_grad(y, x) paddle.incubate.autograd.prim2orig() @@ -101,12 +101,12 @@ def grad(outputs, inputs, grad_outputs=None): Args: outputs(Tensor|Sequence[Tensor]): The output Tensor or Tensors. inputs(Tensor|Sequence[Tensor]): The input Tensor or Tensors. - grad_outputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or - Tensors of outputs which has the same shape with outputs, Defaults + grad_outputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or + Tensors of outputs which has the same shape with outputs, Defaults to None, in this case is equivalent to all ones. Returns: - grad_inputs(Tensor|Tensors): The gradients for inputs. + grad_inputs(Tensor|Tensors): The gradients for inputs. Examples: @@ -123,7 +123,7 @@ def grad(outputs, inputs, grad_outputs=None): with paddle.static.program_guard(main_program, startup_program): x = paddle.static.data('x', shape=[1], dtype='float32') x.stop_gradients = False - y = x * x + y = x * x x_grad = paddle.incubate.autograd.grad(y, x) paddle.incubate.autograd.prim2orig() @@ -132,7 +132,7 @@ def grad(outputs, inputs, grad_outputs=None): x_grad = exe.run(main_program, feed={'x': np.array([2.]).astype('float32')}, fetch_list=[x_grad]) print(x_grad) # [array([4.], dtype=float32)] - + paddle.incubate.autograd.disable_prim() paddle.disable_static() """ diff --git a/python/paddle/incubate/autograd/primreg.py b/python/paddle/incubate/autograd/primreg.py index 6c3ece09a6b..4721500b2be 100644 --- a/python/paddle/incubate/autograd/primreg.py +++ b/python/paddle/incubate/autograd/primreg.py @@ -62,14 +62,14 @@ def lookup_transpose(optype): def op_position_inputs(op): """ Returns the position inputs of `op` as registered with REGISTER_FN. - + Args: op(Operator): The op that needs to get the inputs Returns: Tensor(s): Inputs of the op - Examples: + Examples: .. code-block:: python @REGISTER_FN('div_p', 'X', 'Y', 'Z') def div(x, y, out=None): @@ -77,7 +77,7 @@ def op_position_inputs(op): The registered inputs are ['X', 'Y'] for div_p and accordingly this function will return inputs in the order of X then Y. - + """ args = _primop_position_argnames.lookup(op.type) assert args is not None, 'args should not be None in op_position_inputs().' @@ -100,14 +100,14 @@ def op_position_inputs(op): def op_position_output(op): """ Returns the output of `op` as registered with REGISTER_FN. - + Args: op(Operator): The op that needs to get the output Returns: Tensor(s): Output of the op - Examples: + Examples: .. code-block:: python @REGISTER_FN('div_p', 'X', 'Y', 'Z') def div(x, y, out=None): @@ -115,7 +115,7 @@ def op_position_output(op): The registered output is ['Z'] for div_p and accordingly this function will return output Z. - + """ args = _primop_position_argnames.lookup(op.type) assert args is not None, 'args should not be None in op_position_output().' 
@@ -135,7 +135,7 @@ def op_position_output(op): def REGISTER_FN(op_type, *position_argnames): """ - Decorator for registering the Python function for a primitive op. + Decorator for registering the Python function for a primitive op. Args: op_type(str): The op name @@ -144,12 +144,12 @@ def REGISTER_FN(op_type, *position_argnames): Returns: wrapper: Inner wrapper function - Examples: + Examples: .. code-block:: python @REGISTER_FN('tanh_p', 'X', 'Y') def tanh(x, out=None): return _simple_unop(LayerHelper('tanh_p', **locals())) - + """ if not isinstance(op_type, str): @@ -167,7 +167,7 @@ def REGISTER_FN(op_type, *position_argnames): def REGISTER_ORIG2PRIM(op_type): """ Decorator for registering the lower function for an original op into sequence of primitive ops. - + Args: op_type(str): The op name @@ -199,7 +199,7 @@ def REGISTER_ORIG2PRIM(op_type): def REGISTER_PRIM2ORIG(op_type): """ Decorator for registering the lower function for an primitive op into sequence of original ops. - + Args: op_type(str): The op name @@ -231,7 +231,7 @@ def REGISTER_PRIM2ORIG(op_type): def REGISTER_JVP(op_type): """ Decorator for registering the JVP function for a primitive op. - + Args: op_type(str): The op name @@ -243,7 +243,7 @@ def REGISTER_JVP(op_type): @REGISTER_JVP('add_p') def add_jvp(op, x_dot, y_dot): return primops.add(x_dot, y_dot) - + """ if not isinstance(op_type, str): raise TypeError(f'op_type must be str, but got {type(op_type)}.') @@ -264,7 +264,7 @@ def REGISTER_TRANSPOSE(op_type): """ Decorator for registering the transpose function for a primitive op that denotes a linear operation in the forward AD graph. - + Args: op_type(str): The op name @@ -276,7 +276,7 @@ def REGISTER_TRANSPOSE(op_type): @REGISTER_TRANSPOSE('add_p') def add_transpose(op, z_bar): return z_bar, z_bar - + """ if not isinstance(op_type, str): raise TypeError(f'op_type must be str, but got {type(op_type)}.') diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index 565fcb0b4ed..28136f56821 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -29,9 +29,9 @@ from .utils import (flatten, flatten_and_remove_none, get_input_var_list, def topo_path(xs, ys, block=None): - """ Returns the list of ops on the path from `xs` to `ys` in topological + """ Returns the list of ops on the path from `xs` to `ys` in topological order. - + TODO(Tongxin): supporting control flow and nested blocks. Args: xs: a list|tuple of vars as source @@ -89,7 +89,7 @@ def topo_path(xs, ys, block=None): def output_vars_on_path(path): """ Returns the output variables of all the ops on the path from `xs` to `ys`. - + Args: path: a list of ops on which to find the output variables @@ -106,7 +106,7 @@ def output_vars_on_path(path): class VarMap(object): """ A general map data structure for linking variables to variables. - + An example is linking variables to their gradients. """ @@ -169,7 +169,7 @@ class VarMap(object): # TODO(lml): supporting control flow, nested blocks, and block other than current block of main program. class Transform(object): - """ An object that maintains the state of transformations applied to a + """ An object that maintains the state of transformations applied to a primitve program. 
""" def __init__(self, block): @@ -244,9 +244,9 @@ class Transform(object): return bars def linearize(self, xs, ys, xs_dot=None): - """ Performs the linearization transform, a.k.a, forward mode AD + """ Performs the linearization transform, a.k.a, forward mode AD transform, on a primitive lowered program. - + Args: xs: a list of input variables ys: a list of output variables @@ -256,9 +256,9 @@ class Transform(object): Returns: (xs_dot, ys_dot): a tuple of two lists. `xs_dot` is the list of - gradient inputs of the resulting linearized program. `ys_dot` is + gradient inputs of the resulting linearized program. `ys_dot` is the list gradient outputs of the resulting linearized program - + """ if xs_dot is None: xs_dot = [fill_const(1.0, shape=x.shape, dtype=x.dtype) for x in xs] @@ -300,23 +300,23 @@ class Transform(object): return xs_dot, ys_dot def transpose(self, ys_dot, xs_dot, ys_bar=None, retain_fwd=False): - """ Performs the transpose transform, a.k.a, reverse mode AD + """ Performs the transpose transform, a.k.a, reverse mode AD transform, on a linearized primitive program. Note, `transpose` is supposed to be used in couple with `linearize`. - + Args: ys_dot: a list of outputs of the linearized program. xs_dot: a list of inputs of the linearized program. - ys_bar: optional, a list of inputs of the resulting transposed + ys_bar: optional, a list of inputs of the resulting transposed program. The list size must be equal to `len(ys_dot)`. The shape and dtype of each element must be the same as in `ys_dot` Returns: (ys_bar, xs_bar): a tuple of two lists. `ys_bar` is the list of - inputs of the resulting transposed program. `xs_bar` is + inputs of the resulting transposed program. `xs_bar` is the list outputs of the resulting transposed program - + """ assert all(v is not None for v in xs_dot), f'`xs_dot` includes None.' assert all(v is not None for v in ys_dot), f'`ys_dot` includes None.' @@ -519,7 +519,7 @@ def _lower(block, reverse, blacklist): @framework.static_only def orig2prim(block=None): - """ + """ .. note:: **This API is ONLY available in the static mode.** **Args block must be None or current block of main program.** @@ -528,7 +528,7 @@ def orig2prim(block=None): If it is an original operator, it will be transformed into one or a series of automatic differential basic operators with equivalent function. - + Args: block(paddle.static.Block|None, optional): The target block to process on. Default None, and will @@ -552,7 +552,7 @@ def prim2orig(block=None, blacklist=None): If it is an automatic differential basic operator, it will be transformed into one or a series of original operators with equivalent function to support execution. - + Args: block(paddle.static.Block|None, optional): The target block to process on. Default None, and will @@ -568,10 +568,10 @@ def prim2orig(block=None, blacklist=None): import paddle from paddle.incubate.autograd import enable_prim, prim_enabled, prim2orig - + paddle.enable_static() enable_prim() - + x = paddle.ones(shape=[2, 2], dtype='float32') x.stop_gradients = False y = x * x diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py index 96faf7f7440..61221aa0afa 100644 --- a/python/paddle/incubate/autograd/utils.py +++ b/python/paddle/incubate/autograd/utils.py @@ -38,9 +38,9 @@ def prim_enabled(): .. 
note:: **ONLY available in the static mode.** - Shows whether the automatic differentiation mechanism based on + Shows whether the automatic differentiation mechanism based on automatic differential basic operators is ON. Defaults to OFF. - + Returns: flag(bool): Whether the automatic differentiation mechanism based on automatic differential basic operators is ON. @@ -50,7 +50,7 @@ def prim_enabled(): import paddle from paddle.incubate.autograd import enable_prim, disable_prim, prim_enabled - + paddle.enable_static() enable_prim() @@ -69,16 +69,16 @@ def enable_prim(): .. note:: **ONLY available in the static mode.** - Turns ON automatic differentiation mechanism based on automatic + Turns ON automatic differentiation mechanism based on automatic differential basic operators. - + Examples: .. code-block:: python import paddle from paddle.incubate.autograd import enable_prim, prim_enabled - + paddle.enable_static() enable_prim() @@ -93,16 +93,16 @@ def disable_prim(): .. note:: **ONLY available in the static mode.** - Turns OFF automatic differentiation mechanism based on automatic + Turns OFF automatic differentiation mechanism based on automatic differential basic operators. - + Examples: .. code-block:: python import paddle from paddle.incubate.autograd import enable_prim, disable_prim, prim_enabled - + paddle.enable_static() enable_prim() diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py index 83e491a0874..d9cfd87f709 100644 --- a/python/paddle/incubate/distributed/models/moe/grad_clip.py +++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py @@ -26,17 +26,17 @@ import copy class ClipGradForMOEByGlobalNorm(ClipGradBase): r""" The Algrithm is the same as paddle.fluid.clip.ClipGradByGlobalNorm - Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in + Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in :math:`t\_list` , and limit it to ``clip_norm`` . - + - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio. - + - If the global norm is less than or equal to ``clip_norm`` , nothing will be done. - + The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``. If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. - - Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` + + Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). The clipping formula is: @@ -52,7 +52,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase): global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} Note: - ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. + ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. Reference: @@ -68,12 +68,12 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase): Examples: .. 
code-block:: python - + import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(in_features=10, out_features=10, - weight_attr=paddle.ParamAttr(need_clip=True), + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py index ebf300abf95..f25b00cb4be 100644 --- a/python/paddle/incubate/distributed/models/moe/moe_layer.py +++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py @@ -246,9 +246,9 @@ class MoELayer(nn.Layer): Args: d_model: (int) model dimention experts: (nn.LayerList) expert networks list - gate: (dict|NaiveGate|SwitchGate|NaiveGate): + gate: (dict|NaiveGate|SwitchGate|NaiveGate): if gate is a dict: - gate is a gate network config, containing 2 keys: + gate is a gate network config, containing 2 keys: `type`(str) value can be: "naive", "gshard", "switch" or None, default is "gshard" `top_k`(int) default value is 2 else gate is an instance of NaiveGate|SwitchGate|NaiveGate: @@ -277,7 +277,7 @@ class MoELayer(nn.Layer): class ExpertLayer(Layer): def __init__(self, d_model, d_hidden, name=None,rank=0, windex = 0, num_expert=1): - super(ExpertLayer, self).__init__() + super(ExpertLayer, self).__init__() self.htoh4 = nn.Linear(d_model, d_hidden) self.h4toh = nn.Linear(d_hidden, d_model) @@ -290,19 +290,19 @@ class MoELayer(nn.Layer): "type": "gshard", "top_k": top_k, } - + experts_list = LayerList() for expi in range(num_experts): exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts) experts_list.append(exp_layer) - + moeLayer = MoELayer(d_model = d_model, experts=experts_list, gate=gate_config, moe_group=moe_group, mp_group=mp_group, recompute_interval=0) - + """ def __init__(self, diff --git a/python/paddle/incubate/nn/functional/fused_matmul_bias.py b/python/paddle/incubate/nn/functional/fused_matmul_bias.py index 58e51c5fa5e..6d55e31790f 100644 --- a/python/paddle/incubate/nn/functional/fused_matmul_bias.py +++ b/python/paddle/incubate/nn/functional/fused_matmul_bias.py @@ -26,20 +26,20 @@ def fused_matmul_bias(x, name=None): """ Applies matrix multiplication of two tensors and then bias addition if provided. - This method requires CUDA version >= 11.6. + This method requires CUDA version >= 11.6. Args: x (Tensor): the first input Tensor to be multiplied. - y (Tensor): the second input Tensor to be multiplied. Its rank must be 2. + y (Tensor): the second input Tensor to be multiplied. Its rank must be 2. bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would - be performed. Otherwise, the bias is added to the matrix multiplication result. + be performed. Otherwise, the bias is added to the matrix multiplication result. transpose_x (bool): Whether to transpose :math:`x` before multiplication. - transpose_y (bool): Whether to transpose :math:`y` before multiplication. - name(str|None): For detailed information, please refer to - :ref:`api_guide_Name` . Usually name is no need to set and None by default. + transpose_y (bool): Whether to transpose :math:`y` before multiplication. + name(str|None): For detailed information, please refer to + :ref:`api_guide_Name` . Usually name is no need to set and None by default. Returns: - Tensor: the output Tensor. + Tensor: the output Tensor. Examples: .. 
code-block:: python @@ -47,11 +47,11 @@ def fused_matmul_bias(x, # required: gpu import paddle from paddle.incubate.nn.functional import fused_matmul_bias - - x = paddle.randn([3, 4]) + + x = paddle.randn([3, 4]) y = paddle.randn([4, 5]) bias = paddle.randn([5]) - out = fused_matmul_bias(x, y, bias) + out = fused_matmul_bias(x, y, bias) print(out.shape) # [3, 5] """ if bias is None: @@ -79,19 +79,19 @@ def fused_matmul_bias(x, def fused_linear(x, weight, bias=None, transpose_weight=False, name=None): """ - Fully-connected linear transformation operator. This method requires CUDA version >= 11.6. + Fully-connected linear transformation operator. This method requires CUDA version >= 11.6. Args: x (Tensor): the input Tensor to be multiplied. - weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2. + weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2. bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would - be performed. Otherwise, the bias is added to the matrix multiplication result. - transpose_weight (bool): Whether to transpose :math:`weight` before multiplication. - name(str|None): For detailed information, please refer to - :ref:`api_guide_Name` . Usually name is no need to set and None by default. + be performed. Otherwise, the bias is added to the matrix multiplication result. + transpose_weight (bool): Whether to transpose :math:`weight` before multiplication. + name(str|None): For detailed information, please refer to + :ref:`api_guide_Name` . Usually name is no need to set and None by default. Returns: - Tensor: the output Tensor. + Tensor: the output Tensor. Examples: .. code-block:: python @@ -99,11 +99,11 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None): # required: gpu import paddle from paddle.incubate.nn.functional import fused_linear - - x = paddle.randn([3, 4]) + + x = paddle.randn([3, 4]) weight = paddle.randn([4, 5]) bias = paddle.randn([5]) - out = fused_linear(x, weight, bias) + out = fused_linear(x, weight, bias) print(out.shape) # [3, 5] """ return fused_matmul_bias(x, weight, bias, False, transpose_weight, name) diff --git a/python/paddle/incubate/nn/layer/fused_linear.py b/python/paddle/incubate/nn/layer/fused_linear.py index 8a8800afce6..65535d9318c 100644 --- a/python/paddle/incubate/nn/layer/fused_linear.py +++ b/python/paddle/incubate/nn/layer/fused_linear.py @@ -34,7 +34,7 @@ class FusedLinear(Layer): initialized to zero. For detailed information, please refer to paddle.ParamAttr. transpose_weight (bool): Whether to transpose the `weight` Tensor before - multiplication. + multiplication. bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias of this layer. If it is set to False, no bias will be added to the output. If it is set to None or one kind of ParamAttr, a bias parameter will @@ -55,14 +55,14 @@ class FusedLinear(Layer): Examples: .. code-block:: python - + # required: gpu import paddle from paddle.incubate.nn import FusedLinear - x = paddle.randn([3, 4]) + x = paddle.randn([3, 4]) linear = FusedLinear(4, 5) - y = linear(x) + y = linear(x) print(y.shape) # [3, 5] """ diff --git a/python/paddle/incubate/operators/graph_khop_sampler.py b/python/paddle/incubate/operators/graph_khop_sampler.py index 58e0fdafab6..91fa9488ca8 100644 --- a/python/paddle/incubate/operators/graph_khop_sampler.py +++ b/python/paddle/incubate/operators/graph_khop_sampler.py @@ -30,22 +30,22 @@ def graph_khop_sampler(row, """ Graph Khop Sampler API. 
- This API is mainly used in Graph Learning domain, and the main purpose is to + This API is mainly used in Graph Learning domain, and the main purpose is to provide high performance graph khop sampling method with subgraph reindex step. For example, we get the CSC(Compressed Sparse Column) format of the input graph - edges as `row` and `colptr`, so as to covert graph data into a suitable format + edges as `row` and `colptr`, so as to covert graph data into a suitable format for sampling. And the `input_nodes` means the nodes we need to sample neighbors, and `sample_sizes` means the number of neighbors and number of layers we want - to sample. + to sample. Args: - row (Tensor): One of the components of the CSC format of the input graph, and + row (Tensor): One of the components of the CSC format of the input graph, and the shape should be [num_edges, 1] or [num_edges]. The available data type is int32, int64. colptr (Tensor): One of the components of the CSC format of the input graph, - and the shape should be [num_nodes + 1, 1] or [num_nodes]. + and the shape should be [num_nodes + 1, 1] or [num_nodes]. The data type should be the same with `row`. - input_nodes (Tensor): The input nodes we need to sample neighbors for, and the + input_nodes (Tensor): The input nodes we need to sample neighbors for, and the data type should be the same with `row`. sample_sizes (list|tuple): The number of neighbors and number of layers we want to sample. The data type should be int, and the shape @@ -58,7 +58,7 @@ def graph_khop_sampler(row, For more information, please refer to :ref:`api_guide_Name`. Returns: - edge_src (Tensor): The src index of the output edges, also means the first column of + edge_src (Tensor): The src index of the output edges, also means the first column of the edges. The shape is [num_sample_edges, 1] currently. edge_dst (Tensor): The dst index of the output edges, also means the second column of the edges. The shape is [num_sample_edges, 1] currently. @@ -67,7 +67,7 @@ def graph_khop_sampler(row, edge_eids (Tensor): Return the id of the sample edges if `return_eids` is True. Examples: - + .. code-block:: python import paddle @@ -79,7 +79,7 @@ def graph_khop_sampler(row, row = paddle.to_tensor(row, dtype="int64") colptr = paddle.to_tensor(colptr, dtype="int64") nodes = paddle.to_tensor(nodes, dtype="int64") - + edge_src, edge_dst, sample_index, reindex_nodes = \ paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False) diff --git a/python/paddle/incubate/operators/graph_reindex.py b/python/paddle/incubate/operators/graph_reindex.py index e7e940c2750..8f223f36a04 100644 --- a/python/paddle/incubate/operators/graph_reindex.py +++ b/python/paddle/incubate/operators/graph_reindex.py @@ -37,17 +37,17 @@ def graph_reindex(x, This API is mainly used in Graph Learning domain, which should be used in conjunction with `graph_sample_neighbors` API. And the main purpose - is to reindex the ids information of the input nodes, and return the + is to reindex the ids information of the input nodes, and return the corresponding graph edges after reindex. - **Notes**: + **Notes**: The number in x should be unique, otherwise it would cause potential errors. Besides, we also support multi-edge-types neighbors reindexing. If we have different - edge_type neighbors for x, we should concatenate all the neighbors and count of x. - We will reindex all the nodes from 0. + edge_type neighbors for x, we should concatenate all the neighbors and count of x. 
+ We will reindex all the nodes from 0. - Take input nodes x = [0, 1, 2] as an example. - If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], + Take input nodes x = [0, 1, 2] as an example. + If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], then we know that the neighbors of 0 is [8, 9], the neighbors of 1 is [0, 4, 7], and the neighbors of 2 is [6, 7]. @@ -56,17 +56,17 @@ def graph_reindex(x, data type is int32, int64. neighbors (Tensor): The neighbors of the input nodes `x`. The data type should be the same with `x`. - count (Tensor): The neighbor count of the input nodes `x`. And the + count (Tensor): The neighbor count of the input nodes `x`. And the data type should be int32. - value_buffer (Tensor|None): Value buffer for hashtable. The data type should + value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, and should be filled with -1. - index_buffer (Tensor|None): Index buffer for hashtable. The data type should + index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, and should be filled with -1. flag_buffer_hashtable (bool): Whether to use buffer for hashtable to speed up. Default is False. Only useful for gpu version currently. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: reindex_src (Tensor): The source node index of graph edges after reindex. reindex_dst (Tensor): The destination node index of graph edges after reindex. @@ -75,7 +75,7 @@ def graph_reindex(x, nodes in the back. Examples: - + .. code-block:: python import paddle @@ -97,7 +97,7 @@ def graph_reindex(x, count_e2 = [1, 3, 1] neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64") count_e2 = paddle.to_tensor(count_e2, dtype="int32") - + neighbors = paddle.concat([neighbors_e1, neighbors_e2]) count = paddle.concat([count_e1, count_e2]) reindex_src, reindex_dst, out_nodes = \ diff --git a/python/paddle/incubate/operators/graph_sample_neighbors.py b/python/paddle/incubate/operators/graph_sample_neighbors.py index b230b2a45d5..81839fda137 100644 --- a/python/paddle/incubate/operators/graph_sample_neighbors.py +++ b/python/paddle/incubate/operators/graph_sample_neighbors.py @@ -39,13 +39,13 @@ def graph_sample_neighbors(row, Graph Sample Neighbors API. This API is mainly used in Graph Learning domain, and the main purpose is to - provide high performance of graph sampling method. For example, we get the - CSC(Compressed Sparse Column) format of the input graph edges as `row` and + provide high performance of graph sampling method. For example, we get the + CSC(Compressed Sparse Column) format of the input graph edges as `row` and `colptr`, so as to convert graph data into a suitable format for sampling. - `input_nodes` means the nodes we need to sample neighbors, and `sample_sizes` + `input_nodes` means the nodes we need to sample neighbors, and `sample_sizes` means the number of neighbors and number of layers we want to sample. - Besides, we support fisher-yates sampling in GPU version. + Besides, we support fisher-yates sampling in GPU version. Args: row (Tensor): One of the components of the CSC format of the input graph, and @@ -57,16 +57,16 @@ def graph_sample_neighbors(row, input_nodes (Tensor): The input nodes we need to sample neighbors for, and the data type should be the same with `row`. eids (Tensor): The eid information of the input graph. If return_eids is True, - then `eids` should not be None. 
The data type should be the + then `eids` should not be None. The data type should be the same with `row`. Default is None. perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `flag_perm_buffer` is True, then `perm_buffer` should not be None. The data type should - be the same with `row`. Default is None. - sample_size (int): The number of neighbors we need to sample. Default value is + be the same with `row`. Default is None. + sample_size (int): The number of neighbors we need to sample. Default value is -1, which means returning all the neighbors of the input nodes. return_eids (bool): Whether to return eid information of sample edges. Default is False. - flag_perm_buffer (bool): Using the permutation for fisher-yates sampling in GPU. Default - value is false, means not using it. + flag_perm_buffer (bool): Using the permutation for fisher-yates sampling in GPU. Default + value is false, means not using it. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -74,7 +74,7 @@ def graph_sample_neighbors(row, out_neighbors (Tensor): The sample neighbors of the input nodes. out_count (Tensor): The number of sampling neighbors of each input node, and the shape should be the same with `input_nodes`. - out_eids (Tensor): If `return_eids` is True, we will return the eid information of the + out_eids (Tensor): If `return_eids` is True, we will return the eid information of the sample edges. Examples: @@ -90,7 +90,7 @@ def graph_sample_neighbors(row, colptr = paddle.to_tensor(colptr, dtype="int64") nodes = paddle.to_tensor(nodes, dtype="int64") out_neighbors, out_count = \ - paddle.incubate.graph_sample_neighbors(row, colptr, nodes, + paddle.incubate.graph_sample_neighbors(row, colptr, nodes, sample_size=sample_size) """ diff --git a/python/paddle/incubate/operators/graph_send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py index b8b01f9aad2..91ea337eeb2 100644 --- a/python/paddle/incubate/operators/graph_send_recv.py +++ b/python/paddle/incubate/operators/graph_send_recv.py @@ -37,9 +37,9 @@ def graph_send_recv(x, Graph Learning Send_Recv combine operator. - This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory + This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` - to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor + to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor in different pooling types, like sum, mean, max, or min. Besides, we can set `out_size` to get necessary output shape. .. code-block:: text @@ -67,20 +67,20 @@ def graph_send_recv(x, Args: x (Tensor): The input tensor, and the available data type is float32, float64, int32, int64. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. - dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. - The available data type is int32, int64. + dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. + The available data type is int32, int64. pool_type (str): The pooling types of graph_send_recv, including `sum`, `mean`, `max`, `min`. Default value is `sum`. 
- out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or + out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or out_size is smaller or equal to 0, then this input will not be used. - Otherwise, `out_size` should be equal with or larger than + Otherwise, `out_size` should be equal with or larger than max(dst_index) + 1. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - out (Tensor): The output tensor, should have the same shape and same dtype as input tensor `x`. - If `out_size` is set correctly, then it should have the same shape as `x` except + out (Tensor): The output tensor, should have the same shape and same dtype as input tensor `x`. + If `out_size` is set correctly, then it should have the same shape as `x` except the 0th dimension. Examples: diff --git a/python/paddle/incubate/optimizer/functional/bfgs.py b/python/paddle/incubate/optimizer/functional/bfgs.py index 8bf7b71c65a..58d647f4dd4 100644 --- a/python/paddle/incubate/optimizer/functional/bfgs.py +++ b/python/paddle/incubate/optimizer/functional/bfgs.py @@ -40,17 +40,17 @@ def minimize_bfgs(objective_func, x_{k+1} = x_{k} + H_k \nabla{f_k} If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method. - If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then + If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then it's a quasi-Newton. In practice, the approximated Hessians are obtained - by only using the gradients, over either whole or part of the search + by only using the gradients, over either whole or part of the search history, the former is BFGS, the latter is L-BFGS. - Reference: + Reference: Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp140: Algorithm 6.1 (BFGS Method). Args: objective_func: the objective function to minimize. ``objective_func`` accepts a 1D Tensor and returns a scalar. - initial_position (Tensor): the starting point of the iterates, has the same shape with the input of ``objective_func`` . + initial_position (Tensor): the starting point of the iterates, has the same shape with the input of ``objective_func`` . max_iters (int, optional): the maximum number of minimization iterations. Default value: 50. tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. Default value: 1e-7. tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9. @@ -75,7 +75,7 @@ def minimize_bfgs(objective_func, .. code-block:: python import paddle - + def func(x): return paddle.dot(x, x) diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py index d09ba5c6952..5925d47decd 100644 --- a/python/paddle/incubate/optimizer/functional/lbfgs.py +++ b/python/paddle/incubate/optimizer/functional/lbfgs.py @@ -41,9 +41,9 @@ def minimize_lbfgs(objective_func, x_{k+1} = x_{k} + H_k \nabla{f_k} If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method. 
- If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then + If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then it's a quasi-Newton. In practice, the approximated Hessians are obtained - by only using the gradients, over either whole or part of the search + by only using the gradients, over either whole or part of the search history, the former is BFGS, the latter is L-BFGS. Reference: @@ -51,7 +51,7 @@ def minimize_lbfgs(objective_func, Args: objective_func: the objective function to minimize. ``objective_func`` accepts a 1D Tensor and returns a scalar. - initial_position (Tensor): the starting point of the iterates, has the same shape with the input of ``objective_func`` . + initial_position (Tensor): the starting point of the iterates, has the same shape with the input of ``objective_func`` . history_size (Scalar): the number of stored vector pairs {si,yi}. Default value: 100. max_iters (int, optional): the maximum number of minimization iterations. Default value: 50. tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. Default value: 1e-7. @@ -71,12 +71,12 @@ def minimize_lbfgs(objective_func, - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of the objective function regrading to the initial position. - objective_value (Tensor): objective function value at the `position`. - objective_gradient (Tensor): objective function gradient at the `position`. - + Examples: .. code-block:: python import paddle - + def func(x): return paddle.dot(x, x) diff --git a/python/paddle/incubate/optimizer/functional/line_search.py b/python/paddle/incubate/optimizer/functional/line_search.py index 3aacb137e6e..375f86ddbe5 100644 --- a/python/paddle/incubate/optimizer/functional/line_search.py +++ b/python/paddle/incubate/optimizer/functional/line_search.py @@ -20,7 +20,7 @@ def cubic_interpolation_(x1, f1, g1, x2, f2, g2): r"""Cubic interpolation between (x1, f1, g1) and (x2, f2, g2). Use two points and their gradient to determine a cubic function and get the minimun point between them in the cubic curve. - + Reference: Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp59: formula 3.59 @@ -68,11 +68,11 @@ def strong_wolfe(f, alpha_max=10, dtype='float32'): r"""Implements of line search algorithm that satisfies the strong Wolfe conditions using double zoom. - + Reference: Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp60: Algorithm 3.5 (Line Search Algorithm). - + Args: f: the objective function to minimize. ``f`` accepts a multivariate input and returns a scalar. xk (Tensor): the starting point of the iterates. @@ -80,14 +80,14 @@ def strong_wolfe(f, max_iters (Scalar): the maximum number of iterations. tolerance_grad (Scalar): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. - tolerance_change (Scalar): terminates if the change of function value/position/parameter between + tolerance_change (Scalar): terminates if the change of function value/position/parameter between two iterations is smaller than this value. initial_step_length (Scalar): step length used in first iteration. c1 (Scalar): parameter for sufficient decrease condition. c2 (Scalar): parameter for curvature condition. alpha_max (float): max step length. 
dtype ('float32' | 'float64'): the datatype to be used. - + Returns: num_func_calls (float): number of objective function called in line search process. a_star(Tensor): optimal step length, or 0. if the line search algorithm did not converge. @@ -96,26 +96,26 @@ def strong_wolfe(f, Following summarizes the essentials of the strong Wolfe line search algorithm. Some notations used in the description: - + - `f` denotes the objective function. - `phi` is a function of step size alpha, restricting `f` on a line. - + phi = f(xk + a * pk), - where xk is the position of k'th iterate, pk is the line search direction(decent direction), + where xk is the position of k'th iterate, pk is the line search direction(decent direction), and a is the step size. - a : substitute of alpha - a1 is a of last iteration, which is alpha_(i-1). - a2 is a of current iteration, which is alpha_i. - a_lo is a in left position when calls zoom, which is alpha_low. - a_hi is a in right position when calls zoom, which is alpha_high. - + Line Search Algorithm: repeat Compute phi(a2) and derphi(a2). - 1. If phi(a2) > phi(0) + c_1 * a2 * phi'(0) or [phi(a2) >= phi(a1) and i > 1], + 1. If phi(a2) > phi(0) + c_1 * a2 * phi'(0) or [phi(a2) >= phi(a1) and i > 1], a_star= zoom(a1, a2) and stop; - 2. If |phi'(a2)| <= -c_2 * phi'(0), + 2. If |phi'(a2)| <= -c_2 * phi'(0), a_star= a2 and stop; 3. If phi'(a2) >= 0, @@ -125,8 +125,8 @@ def strong_wolfe(f, a2 = min(2 * a2, a2) i = i + 1 end(repeat) - - zoom(a_lo, a_hi) Algorithm: + + zoom(a_lo, a_hi) Algorithm: repeat aj = cubic_interpolation(a_lo, a_hi) Compute phi(aj) and derphi(aj). diff --git a/python/paddle/incubate/optimizer/functional/utils.py b/python/paddle/incubate/optimizer/functional/utils.py index d4f69a35491..7a427c4489d 100644 --- a/python/paddle/incubate/optimizer/functional/utils.py +++ b/python/paddle/incubate/optimizer/functional/utils.py @@ -30,10 +30,10 @@ def check_initial_inverse_hessian_estimate(H0): r"""Check whether the specified initial_inverse_hessian_estimate is symmetric and positive definite. Raise errors when precondition not met. - Note: + Note: In static graph can not raise error directly, so use py_func make raise_func as a op, and use paddle.static.nn.cond to decide if put the op in net. - cholesky is the fast way to check positive definition, but in static graph can not catch + cholesky is the fast way to check positive definition, but in static graph can not catch exception to raise value error, so use eigvals rather than cholesky in static graph. """ is_symmetric = paddle.all(paddle.equal(H0, H0.t())) @@ -78,13 +78,13 @@ def check_initial_inverse_hessian_estimate(H0): def _value_and_gradient(f, x, v=None): r"""Compute function value and gradient of f at x. - + Args: f (Callable): the objective function. x (Tensor): the input tensor. Returns: value: a tensor that holds the function value. - gradient: a tensor that holds the function gradients. + gradient: a tensor that holds the function gradients. """ # use detach to cut off relation between x and original graph x = x.detach() diff --git a/python/paddle/incubate/optimizer/lookahead.py b/python/paddle/incubate/optimizer/lookahead.py index 8f70f321c0d..31ebf9afcc7 100644 --- a/python/paddle/incubate/optimizer/lookahead.py +++ b/python/paddle/incubate/optimizer/lookahead.py @@ -29,18 +29,18 @@ class LookAhead(Optimizer): paper : https://arxiv.org/abs/1907.08610. Lookahead keeps two sets of params: the fast_params and - the slow_params. inner_optimizer update fast_params every - training step. 
Lookahead updates the slow_params and fast_params + the slow_params. inner_optimizer update fast_params every + training step. Lookahead updates the slow_params and fast_params every k training steps as follows: .. math:: - + slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1}) - + fast\_param_t &= slow\_param_t Args: - inner_optimizer (Optimizer): The optimizer that update fast params step by step. + inner_optimizer (Optimizer): The optimizer that update fast params step by step. alpha (float, optinal): The learning rate of Lookahead. The default value is 0.5. k (int, optinal): The slow params is updated every k steps. The default value is 5. name (str, optional): Normally there is no need for user to set this property. @@ -50,7 +50,7 @@ class LookAhead(Optimizer): Examples: .. code-block:: python - + import numpy as np import paddle import paddle.nn as nn @@ -109,7 +109,7 @@ class LookAhead(Optimizer): shuffle=True, drop_last=True, num_workers=2) - + train(layer, loader, loss_fn, lookahead) """ @@ -147,7 +147,7 @@ class LookAhead(Optimizer): def step(self): """ Execute the optimizer and update parameters once. - + Returns: None @@ -259,8 +259,8 @@ class LookAhead(Optimizer): tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) tensor pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to - indicate program pruning. If so, the program will be pruned by ``feed`` and + In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. Examples: diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index 67be022c288..ebbe9cd78f4 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -160,7 +160,7 @@ class ModelAverage(Optimizer): print("\nEvaluate With Restored Paramters") model_average.restore() evaluate(layer, eval_loader, loss_fn) - + """ def __init__(self, @@ -291,7 +291,7 @@ class ModelAverage(Optimizer): no_grad_set=None): """ Add operations to minimize ``loss`` by updating ``parameters``. - + Args: loss (Tensor): A ``Tensor`` containing the value to minimize. startup_program (Program, optional): :ref:`api_fluid_Program` for @@ -302,17 +302,17 @@ class ModelAverage(Optimizer): will be updated. no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need to be updated. The default value is None. - + Returns: tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) tensor pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to - indicate program pruning. If so, the program will be pruned by ``feed`` and + In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. - + Examples: - + .. 
code-block:: python import paddle @@ -343,7 +343,7 @@ class ModelAverage(Optimizer): def step(self): """ Execute the optimizer and update parameters once. - + Returns: None @@ -414,7 +414,7 @@ class ModelAverage(Optimizer): max_average_window=4) sgd.step() modelaverage.step() - + with modelaverage.apply(): for param in linear.parameters(): print(param) @@ -461,7 +461,7 @@ class ModelAverage(Optimizer): def restore(self, executor=None): """ Restore ``Parameter`` values of current model. - + Args: executor(Executor): The network executor in static-graph mode. The default value is None in dygraph mode @@ -485,7 +485,7 @@ class ModelAverage(Optimizer): max_average_window=4) sgd.step() modelaverage.step() - + with modelaverage.apply(need_restore=False): for param in linear.parameters(): print(param) diff --git a/python/paddle/incubate/sparse/binary.py b/python/paddle/incubate/sparse/binary.py index 6c78628d270..b09c991800d 100644 --- a/python/paddle/incubate/sparse/binary.py +++ b/python/paddle/incubate/sparse/binary.py @@ -30,13 +30,13 @@ _int_dtype_ = [ @dygraph_only def matmul(x, y, name=None): """ - Note: + Note: This API is only supported from ``CUDA 11.0`` . - Applies matrix multiplication of two Tensors. - + Applies matrix multiplication of two Tensors. + The supported input/output Tensor layout are as follows: - + Note: x[SparseCsrTensor] @ y[SparseCsrTensor] -> out[SparseCsrTensor] x[SparseCsrTensor] @ y[DenseTensor] -> out[DenseTensor] @@ -46,14 +46,14 @@ def matmul(x, y, name=None): It supports backward propagation. Dimensions `x` and `y` must be >= 2D. Automatic broadcasting of Tensor is not supported. - the shape of `x` should be `[*, M, K]` , and the shape of `y` should be `[*, K, N]` , where `*` + the shape of `x` should be `[*, M, K]` , and the shape of `y` should be `[*, K, N]` , where `*` is zero or more batch dimensions. Args: x (Tensor): The input tensor. It can be SparseCooTensor/SparseCsrTensor. The data type can be float32 or float64. y (Tensor): The input tensor. It can be SparseCooTensor/SparseCsrTensor/DenseTensor. The data type can be float32 or float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: Its layout is determined by that of `x` and `y` . @@ -68,9 +68,9 @@ def matmul(x, y, name=None): cols = [1, 2, 0] values = [1., 2., 3.] csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, [3, 3]) - # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 1, 2, 3], - # cols=[1, 2, 0], + # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 1, 2, 3], + # cols=[1, 2, 0], # values=[1., 2., 3.]) dense = paddle.ones([3, 2]) out = paddle.incubate.sparse.matmul(csr, dense) @@ -83,9 +83,9 @@ def matmul(x, y, name=None): indices = [[0, 1, 2], [1, 2, 0]] values = [1., 2., 3.] coo = paddle.incubate.sparse.sparse_coo_tensor(indices, values, [3, 3]) - # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, # indices=[[0, 1, 2], - # [1, 2, 0]], + # [1, 2, 0]], # values=[1., 2., 3.]) dense = paddle.ones([3, 2]) out = paddle.incubate.sparse.matmul(coo, dense) @@ -100,13 +100,13 @@ def matmul(x, y, name=None): @dygraph_only def masked_matmul(x, y, mask, name=None): """ - Note: + Note: This API is only supported from ``CUDA 11.3`` . 
- Applies matrix multiplication of two Dense Tensors. - + Applies matrix multiplication of two Dense Tensors. + The supported input/output Tensor layout are as follows: - + Note: x[DenseTensor] @ y[DenseTensor] * mask[SparseCooTensor] -> out[SparseCooTensor] x[DenseTensor] @ y[DenseTensor] * mask[SparseCsrTensor] -> out[SparseCsrTensor] @@ -148,9 +148,9 @@ def masked_matmul(x, y, mask, name=None): y = paddle.rand([5, 4]) out = paddle.incubate.sparse.masked_matmul(x, y, mask) - # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 3, 5], - # cols=[1, 3, 2, 0, 1], + # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 3, 5], + # cols=[1, 3, 2, 0, 1], # values=[0.98986477, 0.97800624, 1.14591956, 0.68561077, 0.94714981]) """ @@ -160,11 +160,11 @@ def masked_matmul(x, y, mask, name=None): @dygraph_only def mv(x, vec, name=None): """ - Note: + Note: This API is only supported from ``CUDA 11.0`` . - Applies matrix-vector product of Sparse Matrix 'x' and Dense vector 'vec' . - + Applies matrix-vector product of Sparse Matrix 'x' and Dense vector 'vec' . + The supported input/output Tensor layout are as follows: Note: @@ -173,38 +173,38 @@ def mv(x, vec, name=None): It supports backward propagation. - The shape of `x` should be `[M, N]` , and the shape of `y` should be `[N]` , + The shape of `x` should be `[M, N]` , and the shape of `y` should be `[N]` , and the shape of `out` will be `[M]` . Args: x (Tensor): The input 2D tensor. It must be SparseCooTensor/SparseCsrTensor. The data type can be float32 or float64. y (Tensor): The input 1D tensor. It must be DenseTensor vector. The data type can be float32 or float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: 1D Tensor. Examples: .. code-block:: python - + import paddle - from paddle.fluid.framework import _test_eager_guard + from paddle.fluid.framework import _test_eager_guard paddle.seed(100) # csr @ dense -> dense - with _test_eager_guard(): + with _test_eager_guard(): crows = [0, 2, 3, 5] cols = [1, 3, 2, 0, 1] values = [1., 2., 3., 4., 5.] dense_shape = [3, 4] csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) - # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 3, 5], - # cols=[1, 3, 2, 0, 1], + # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 3, 5], + # cols=[1, 3, 2, 0, 1], # values=[1., 2., 3., 4., 5.]) vec = paddle.randn([4]) - + out = paddle.incubate.sparse.mv(csr, vec) # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, # [-3.85499096, -2.42975140, -1.75087738]) diff --git a/python/paddle/incubate/sparse/creation.py b/python/paddle/incubate/sparse/creation.py index 18794788831..4d1081343fc 100644 --- a/python/paddle/incubate/sparse/creation.py +++ b/python/paddle/incubate/sparse/creation.py @@ -72,7 +72,7 @@ def sparse_coo_tensor(indices, place=None, stop_gradient=True): r""" - Constructs a sparse ``paddle.Tensor`` in coordinate format according to the indices + Constructs a sparse ``paddle.Tensor`` in coordinate format according to the indices and values of the specified non-zero elements. Args: @@ -81,15 +81,15 @@ def sparse_coo_tensor(indices, values(list|tuple|ndarray|Tensor): Initial values for the tensor. Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor. 
shape(list|tuple, optional): The shape of the sparse tensor also represents the shape of - original dense tensor. If not provided the smallest shape will be inferred to + original dense tensor. If not provided the smallest shape will be inferred to hold all elements. - dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , + dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8', - 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` + 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` except for python float number which gets dtype from ``get_default_type`` . - place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be - CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is - string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. + place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be + CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is + string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. Returns: @@ -97,9 +97,9 @@ def sparse_coo_tensor(indices, Raises: TypeError: If the data type of ``values`` is not list, tuple, numpy.ndarray, paddle.Tensor - ValueError: If ``values`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]]. If the ``indices`` is not a 2-D. + ValueError: If ``values`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]]. If the ``indices`` is not a 2-D. TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128 - ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace or specified pattern string. + ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace or specified pattern string. Examples: @@ -179,28 +179,28 @@ def sparse_csr_tensor(crows, place=None, stop_gradient=True): r""" - Constructs a sparse ``paddle.Tensor`` in CSR(Compressed Sparse Row) format according to the + Constructs a sparse ``paddle.Tensor`` in CSR(Compressed Sparse Row) format according to the ``crows``, ``cols`` and ``values``. Currently, the crows and cols of each batch must be incrementd. Args: - crows(list|tuple|ndarray|Tensor): 1-D array, each element in the rows represents the - starting position of the first non-zero element of each row in values. - Can be a list, tuple, numpy\.ndarray, paddle\.Tensor. + crows(list|tuple|ndarray|Tensor): 1-D array, each element in the rows represents the + starting position of the first non-zero element of each row in values. + Can be a list, tuple, numpy\.ndarray, paddle\.Tensor. cols(list|tuple|ndarray|Tensor): 1-D array, the column of non-zero elements. - Can be a list, tuple, numpy\.ndarray, paddle\.Tensor. + Can be a list, tuple, numpy\.ndarray, paddle\.Tensor. values(list|tuple|ndarray|Tensor): 1-D array, the non-zero elements. Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor. shape(list|tuple, optional): The shape of the sparse tensor also represents the shape of - original dense tensor. 
+ original dense tensor. hold all elements. - dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , + dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8', - 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` + 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` except for python float number which gets dtype from ``get_default_type`` . - place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be - CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is - string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. + place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be + CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is + string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. Returns: @@ -208,9 +208,9 @@ def sparse_csr_tensor(crows, Raises: TypeError: If the data type of ``values`` is not list, tuple, numpy.ndarray, paddle.Tensor - ValueError: If ``values`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]]. If the ``crow``, ``cols`` and ``values`` is not a 2-D. + ValueError: If ``values`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]]. If the ``crow``, ``cols`` and ``values`` is not a 2-D. TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128 - ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace or specified pattern string. + ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace or specified pattern string. Examples: diff --git a/python/paddle/incubate/sparse/multiary.py b/python/paddle/incubate/sparse/multiary.py index d65847f1383..5389ddc9fae 100644 --- a/python/paddle/incubate/sparse/multiary.py +++ b/python/paddle/incubate/sparse/multiary.py @@ -21,7 +21,7 @@ __all__ = [] @dygraph_only def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): """ - Note: + Note: This API is only supported from ``CUDA 11.0`` . Applies matrix multiplication for `x` and `y` , `input` is added to @@ -30,9 +30,9 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): .. math:: Out = alpha * x * y + beta * input - + The supported input/output Tensor layout are as follows: - + Note: input[SparseCsrTensor] + x[SparseCsrTensor] @ y[SparseCsrTensor] -> out[SparseCsrTensor] input[DenseTensor] + x[SparseCsrTensor] @ y[DenseTensor] -> out[DenseTensor] @@ -50,10 +50,10 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): beta (float, optional): Coefficient of `input` . Default: 1.0 alpha (float, optional): Coefficient of `x * y` . Default: 1.0 name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: Its layout is determined by that of `x` and `y` . dtype and shape is the same with `input` - + Examples: .. 
code-block:: python @@ -76,6 +76,6 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, [3, 3]) y = paddle.rand([3, 2]) out = paddle.incubate.sparse.addmm(input, x, y, 3.0, 2.0) - + """ return _C_ops.sparse_addmm(input, x, y, alpha, beta) diff --git a/python/paddle/incubate/sparse/nn/functional/activation.py b/python/paddle/incubate/sparse/nn/functional/activation.py index ddaa6ada01b..4faaa696e6d 100644 --- a/python/paddle/incubate/sparse/nn/functional/activation.py +++ b/python/paddle/incubate/sparse/nn/functional/activation.py @@ -54,16 +54,16 @@ def softmax(x, axis=-1, name=None): sparse softmax activation, requiring x to be a SparseCooTensor or SparseCsrTensor. Note: - Only support axis=-1 for SparseCsrTensor, which is faster when read data + Only support axis=-1 for SparseCsrTensor, which is faster when read data by row (axis=-1). - From the point of view of dense matrix, for each row :math:`i` and each column :math:`j` + From the point of view of dense matrix, for each row :math:`i` and each column :math:`j` in the matrix, we have: .. math:: softmax_ij = \frac{\exp(x_ij - max_j(x_ij))}{\sum_j(exp(x_ij - max_j(x_ij))} - + Parameters: x (Tensor): The input tensor. It can be SparseCooTensor/SparseCsrTensor. The data type can be float32 or float64. axis (int, optional): The axis along which to perform softmax calculations. Only support -1 for SparseCsrTensor. @@ -72,7 +72,7 @@ def softmax(x, axis=-1, name=None): Returns: Tensor: SparseCoo or SparseCsr, whose layout is the same with `x` . - + Examples: .. code-block:: python @@ -87,19 +87,19 @@ def softmax(x, axis=-1, name=None): # [0. 0. 0. 0.98275049]] csr = paddle.to_tensor(np_x).to_sparse_csr() - # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 5, 6], - # cols=[2, 3, 0, 2, 3, 3], + # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 5, 6], + # cols=[2, 3, 0, 2, 3, 3], # values=[0.96823406, 0.19722934, 0.94373937, 0.02060066, 0.71456372, # 0.98275049]) out = paddle.incubate.sparse.nn.functional.softmax(csr) - # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 5, 6], - # cols=[2, 3, 0, 2, 3, 3], + # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 5, 6], + # cols=[2, 3, 0, 2, 3, 3], # values=[0.68373820, 0.31626180, 0.45610887, 0.18119845, 0.36269269, # 1. ]) - + """ return _C_ops.sparse_softmax(x, axis) diff --git a/python/paddle/incubate/sparse/nn/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py index cd3e8e3551f..0512b83d842 100644 --- a/python/paddle/incubate/sparse/nn/functional/conv.py +++ b/python/paddle/incubate/sparse/nn/functional/conv.py @@ -90,11 +90,11 @@ def conv3d(x, The sparse convolution3d functional calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are multidimensional SparseCooTensors with a shape of + Output(Output) are multidimensional SparseCooTensors with a shape of :math:`[N, D, H, W, C]` . Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, - and W is the width of the feature. If bias attribution is provided, - bias is added to the output of the convolution. + and W is the width of the feature. 
If bias attribution is provided, + bias is added to the output of the convolution. For each input :math:`X`, the equation is: @@ -130,16 +130,16 @@ def conv3d(x, W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 Args: - x (Tensor): The input is 5-D SparseCooTensor with shape [N, D, H, W, C], the data + x (Tensor): The input is 5-D SparseCooTensor with shape [N, D, H, W, C], the data type of input is float16 or float32 or float64. weight (Tensor): The convolution kernel, a Tensor with shape [kD, kH, kW, C/g, M], where M is the number of filters(output channels), g is the number of groups, kD, kH, kW are the filter's depth, height and width respectively. bias (Tensor, optional): The bias, a Tensor of shape [M, ], currently, only support bias is None. - stride (int|list|tuple): The stride size. It means the stride in convolution. If stride is a - list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). + stride (int|list|tuple): The stride size. It means the stride in convolution. If stride is a + list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. - padding (string|int|list|tuple): The padding size. It means the number of zero-paddings + padding (string|int|list|tuple): The padding size. It means the number of zero-paddings on both sides for each dimension. If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or @@ -149,25 +149,25 @@ def conv3d(x, when `data_format` is `"NDHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. - dilation (int|list|tuple): The dilation size. It means the spacing between the kernel points. + dilation (int|list|tuple): The dilation size. It means the spacing between the kernel points. If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height, - dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. + dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1. groups (int): The groups number of the Conv3D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1. Currently, only support groups=1. - data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`. The default is `"NDHWC"`. When it is `"NDHWC"`, the data is stored in the order of: `[batch_size, input_depth, input_height, input_width, input_channels]`. - name(str|None): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str|None): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. 
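    For readers less familiar with the tuple forms of ``stride``, ``padding`` and ``dilation`` described above, a minimal sketch may help. It reuses the tiny sparse input from the example further down and assumes the keyword argument names listed in the Args section; treat it as an illustration rather than a canonical recipe.

    .. code-block:: python

        import paddle
        from paddle.fluid.framework import _test_eager_guard

        with _test_eager_guard():
            # A 1 x 1 x 3 x 4 x 1 (NDHWC) sparse input with four non-zero entries.
            indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]]
            values = [[1.], [2.], [3.], [4.]]
            indices = paddle.to_tensor(indices, dtype='int32')
            values = paddle.to_tensor(values, dtype='float32')
            sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
                indices, values, [1, 1, 3, 4, 1], stop_gradient=True)

            # Kernel shaped [kD, kH, kW, C/g, M] = [1, 3, 3, 1, 1].
            weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32')

            # One value per spatial dimension (depth, height, width).
            y = paddle.incubate.sparse.nn.functional.conv3d(
                sparse_x, weight,
                stride=(1, 1, 1),
                padding=(0, 0, 0),
                dilation=(1, 1, 1),
                data_format="NDHWC")
            print(y.shape)  # [1, 1, 1, 2, 1]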
Returns: - A SparseCooTensor representing the conv3d, whose data type is the same with input. + A SparseCooTensor representing the conv3d, whose data type is the same with input. Examples: .. code-block:: python @@ -181,7 +181,7 @@ def conv3d(x, indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') y = paddle.incubate.sparse.nn.functional.conv3d(sparse_x, weight) print(y.shape) @@ -205,11 +205,11 @@ def subm_conv3d(x, The sparse submanifold convolution3d functional calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are multidimensional SparseCooTensors with a shape of + Output(Output) are multidimensional SparseCooTensors with a shape of :math:`[N, D, H, W, C]` . Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, - and W is the width of the feature. If bias attribution is provided, - bias is added to the output of the convolution. + and W is the width of the feature. If bias attribution is provided, + bias is added to the output of the convolution. For each input :math:`X`, the equation is: @@ -245,16 +245,16 @@ def subm_conv3d(x, W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 Args: - x (Tensor): The input is 5-D SparseCooTensor with shape [N, D, H, W, C], the data + x (Tensor): The input is 5-D SparseCooTensor with shape [N, D, H, W, C], the data type of input is float16 or float32 or float64. weight (Tensor): The convolution kernel, a Tensor with shape [kD, kH, kW, C/g, M], where M is the number of filters(output channels), g is the number of groups, kD, kH, kW are the filter's depth, height and width respectively. bias (Tensor, optional): The bias, a Tensor of shape [M, ], currently, only support bias is None. - stride (int|list|tuple): The stride size. It means the stride in convolution. If stride is a - list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). + stride (int|list|tuple): The stride size. It means the stride in convolution. If stride is a + list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. - padding (string|int|list|tuple): The padding size. It means the number of zero-paddings + padding (string|int|list|tuple): The padding size. It means the number of zero-paddings on both sides for each dimension. If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or @@ -264,30 +264,30 @@ def subm_conv3d(x, when `data_format` is `"NHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. - dilation (int|list|tuple): The dilation size. It means the spacing between the kernel points. + dilation (int|list|tuple): The dilation size. It means the spacing between the kernel points. 
If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height, - dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. + dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1. groups (int): The groups number of the Conv3D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Currently, only support groups=1. - data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`. The default is `"NDHWC"`. When it is `"NDHWC"`, the data is stored in the order of: `[batch_size, input_depth, input_height, input_width, input_channels]`. - key(str, optional): the key is used to save or use the same rulebook, + key(str, optional): the key is used to save or use the same rulebook, the definition and role of rulebook refers to - https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf. The + https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf. The default value is None. - name(str|None): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str|None): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: - A SparseCooTensor representing the conv3d, whose data type is - the same with input. + A SparseCooTensor representing the conv3d, whose data type is + the same with input. Examples: .. code-block:: python @@ -301,7 +301,7 @@ def subm_conv3d(x, indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') y = paddle.incubate.sparse.nn.functional.subm_conv3d(sparse_x, weight) print(y.shape) diff --git a/python/paddle/incubate/sparse/nn/functional/pooling.py b/python/paddle/incubate/sparse/nn/functional/pooling.py index cae93553b17..6acd303d0f2 100644 --- a/python/paddle/incubate/sparse/nn/functional/pooling.py +++ b/python/paddle/incubate/sparse/nn/functional/pooling.py @@ -54,10 +54,10 @@ def max_pool3d(x, name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - + Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - + Examples: .. code-block:: python diff --git a/python/paddle/incubate/sparse/nn/functional/transformer.py b/python/paddle/incubate/sparse/nn/functional/transformer.py index a4c9faf9ad5..0ed230aa23c 100644 --- a/python/paddle/incubate/sparse/nn/functional/transformer.py +++ b/python/paddle/incubate/sparse/nn/functional/transformer.py @@ -27,19 +27,19 @@ def attention(query, attn_mask=None, name=None): """ - Note: + Note: This API is only used from ``CUDA 11.7`` . 
- SparseCsrTensor is used to store the intermediate result of Attention matrix - in Transformer module, which can reduce memory usage and improve performance. - ``sparse_mask`` express the sparse layout in CSR format. - The calculation equation is: + SparseCsrTensor is used to store the intermediate result of Attention matrix + in Transformer module, which can reduce memory usage and improve performance. + ``sparse_mask`` express the sparse layout in CSR format. + The calculation equation is: .. math:: result = softmax(\frac{ Q * K^T }{\sqrt{d}}) * V - where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. + where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. The shape of the three parameters are: `[batch_size, num_heads, seq_len, head_dim]`, and ``d`` represents ``head_dim`` . @@ -47,12 +47,12 @@ def attention(query, query(DenseTensor): `query` in the Attention module. 4D Tensor with float32 or float64. key(DenseTensor): `key` in the Attention module. 4D Tensor with float32 or float64. value(DenseTensor): `value` in the Attention module. 4D Tensor with float32 or float64. - sparse_mask(SparseCsrTensor): The sparse layout in the Attention module. Its dense shape - is `[batch_size*num_heads, seq_len, seq_len]` . `nnz` of each batch must be the same. + sparse_mask(SparseCsrTensor): The sparse layout in the Attention module. Its dense shape + is `[batch_size*num_heads, seq_len, seq_len]` . `nnz` of each batch must be the same. dtype of `crows` and `cols` must be int64, dtype of `values` can be float32 or float64. - key_padding_mask(DenseTensor, optional): The key padding mask tensor in the Attention module. + key_padding_mask(DenseTensor, optional): The key padding mask tensor in the Attention module. 2D tensor with shape: [batch_size, seq_len]. dtype can be float32 or float64. Default: None. - attn_mask(DenseTensor, optional): The attention mask tensor in the Attention module. + attn_mask(DenseTensor, optional): The attention mask tensor in the Attention module. 2D tensor with shape: [seq_len, seq_len]. dtype can be float32 or float64. Default: None. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to @@ -63,7 +63,7 @@ def attention(query, Examples: .. code-block:: python - + import paddle batch_size = 16 diff --git a/python/paddle/incubate/sparse/nn/layer/activation.py b/python/paddle/incubate/sparse/nn/layer/activation.py index da374fa87a8..f3c2e1456c4 100644 --- a/python/paddle/incubate/sparse/nn/layer/activation.py +++ b/python/paddle/incubate/sparse/nn/layer/activation.py @@ -63,10 +63,10 @@ class Softmax(Layer): Sparse Softmax Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. Note: - Only support axis=-1 for SparseCsrTensor, which is faster when read data + Only support axis=-1 for SparseCsrTensor, which is faster when read data by row (axis=-1). - From the point of view of dense matrix, for each row :math:`i` and each column :math:`j` + From the point of view of dense matrix, for each row :math:`i` and each column :math:`j` in the matrix, we have: .. math:: @@ -96,17 +96,17 @@ class Softmax(Layer): # [0. 0. 0. 
0.98275049]] csr = paddle.to_tensor(np_x).to_sparse_csr() - # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 5, 6], - # cols=[2, 3, 0, 2, 3, 3], + # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 5, 6], + # cols=[2, 3, 0, 2, 3, 3], # values=[0.96823406, 0.19722934, 0.94373937, 0.02060066, 0.71456372, # 0.98275049]) softmax = paddle.incubate.sparse.nn.Softmax() out = softmax(csr) - # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 5, 6], - # cols=[2, 3, 0, 2, 3, 3], + # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 5, 6], + # cols=[2, 3, 0, 2, 3, 3], # values=[0.68373820, 0.31626180, 0.45610887, 0.18119845, 0.36269269, # 1. ]) """ @@ -165,7 +165,7 @@ class ReLU6(Layer): class LeakyReLU(Layer): """ - Sparse Leaky ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. + Sparse Leaky ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. .. math:: diff --git a/python/paddle/incubate/sparse/nn/layer/conv.py b/python/paddle/incubate/sparse/nn/layer/conv.py index f44358bbe9f..6684a7561f4 100644 --- a/python/paddle/incubate/sparse/nn/layer/conv.py +++ b/python/paddle/incubate/sparse/nn/layer/conv.py @@ -122,11 +122,11 @@ class Conv3D(_Conv3D): **Sparse Convlution3d Layer** The Sparse convolution3d layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are multidimensional SparseCooTensors with a shape of + Output(Output) are multidimensional SparseCooTensors with a shape of :math:`[N, D, H, W, C]` . Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, - and W is the width of the feature. If bias attribution is provided, - bias is added to the output of the convolution. + and W is the width of the feature. If bias attribution is provided, + bias is added to the output of the convolution. For each input :math:`X`, the equation is: .. math:: @@ -150,7 +150,7 @@ class Conv3D(_Conv3D): stride_D = stride_H = stride_W = stride. The default value is 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). 
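    The ``padding`` forms enumerated above are easier to see in code. The sketch below only constructs layers, assuming the ``Conv3D(in_channels, out_channels, kernel_size, ...)`` constructor used in the example shown below accepts ``padding`` as a keyword argument:

    .. code-block:: python

        import paddle
        from paddle.fluid.framework import _test_eager_guard

        with _test_eager_guard():
            # Form 1: a padding algorithm, 'valid' or 'same'.
            conv_same = paddle.incubate.sparse.nn.Conv3D(1, 1, (1, 3, 3), padding='same')

            # Form 2: a single int pads every spatial dimension (D, H, W) equally.
            conv_int = paddle.incubate.sparse.nn.Conv3D(1, 1, (1, 3, 3), padding=1)

            # Form 3: one value per spatial dimension, [pad_d, pad_h, pad_w].
            conv_list = paddle.incubate.sparse.nn.Conv3D(1, 1, (1, 3, 3), padding=[0, 1, 1])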
@@ -209,14 +209,14 @@ class Conv3D(_Conv3D): import paddle from paddle.fluid.framework import _test_eager_guard - + with _test_eager_guard(): indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] values = [[1], [2], [3], [4]] indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) conv = paddle.incubate.sparse.nn.Conv3D(1, 1, (1, 3, 3)) y = conv(sparse_x) print(y.shape) @@ -255,10 +255,10 @@ class SubmConv3D(_Conv3D): **Sparse Submanifold Convlution3d Layer** The Sparse submanifold convolution3d layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are multidimensional SparseCooTensors with a shape of + Output(Output) are multidimensional SparseCooTensors with a shape of :math:`[N, D, H, W, C]` . Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, - and W is the width of the feature. If bias attribution is provided, + and W is the width of the feature. If bias attribution is provided, bias is added to the output of the convolution. For each input :math:`X`, the equation is: @@ -283,7 +283,7 @@ class SubmConv3D(_Conv3D): stride_D = stride_H = stride_W = stride. The default value is 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). @@ -297,7 +297,7 @@ class SubmConv3D(_Conv3D): of the input channels, while the second half of the filters is only connected to the second half of the input channels. The default value is 1. padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Currently only support ``'zeros'``. - key(str, optional): the key is used to save or use the same rulebook, + key(str, optional): the key is used to save or use the same rulebook, the definition and role of rulebook refers to https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf. The default value is None. 
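    The ``key``/rulebook mechanism described above is easiest to grasp with a small usage sketch. The snippet is an assumption-laden illustration: it takes ``key`` to be a keyword argument of ``SubmConv3D`` (as the docstring above states) and uses a made-up key name; whether two layers actually share one rulebook depends on them seeing the same sparsity pattern.

    .. code-block:: python

        import paddle
        from paddle.fluid.framework import _test_eager_guard

        with _test_eager_guard():
            indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]]
            values = [[1.], [2.], [3.], [4.]]
            indices = paddle.to_tensor(indices, dtype='int32')
            values = paddle.to_tensor(values, dtype='float32')
            sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
                indices, values, [1, 1, 3, 4, 1], stop_gradient=True)

            # Both layers run on the same set of non-zero locations, so they can
            # reuse one rulebook by passing the same (hypothetical) key 'block1'.
            conv1 = paddle.incubate.sparse.nn.SubmConv3D(1, 2, (1, 3, 3), key='block1')
            conv2 = paddle.incubate.sparse.nn.SubmConv3D(2, 2, (1, 3, 3), key='block1')

            # Submanifold convolution keeps the input's spatial shape and indices.
            y = conv2(conv1(sparse_x))
            print(y.shape)  # [1, 1, 3, 4, 2]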
@@ -346,14 +346,14 @@ class SubmConv3D(_Conv3D): import paddle from paddle.fluid.framework import _test_eager_guard - + with _test_eager_guard(): indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] values = [[1], [2], [3], [4]] dense_shape = [1, 1, 3, 4, 1] indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') - sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) subm_conv = paddle.incubate.sparse.nn.SubmConv3D(1, 1, (1, 3, 3)) y = subm_conv(sparse_x) print(y.shape) diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/incubate/sparse/nn/layer/norm.py index 776967ac04d..253b33cc5ee 100644 --- a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/incubate/sparse/nn/layer/norm.py @@ -88,7 +88,7 @@ class BatchNorm(paddle.nn.BatchNorm1D): Returns: None. - + Examples: .. code-block:: python @@ -100,7 +100,7 @@ class BatchNorm(paddle.nn.BatchNorm1D): paddle.seed(123) channels = 3 x_data = paddle.randn((1, 6, 6, 6, channels)).astype('float32') - dense_x = paddle.to_tensor(x_data) + dense_x = paddle.to_tensor(x_data) sparse_x = dense_x.to_sparse_coo(4) batch_norm = paddle.incubate.sparse.nn.BatchNorm(channels) batch_norm_out = batch_norm(sparse_x) @@ -164,8 +164,8 @@ class BatchNorm(paddle.nn.BatchNorm1D): class SyncBatchNorm(paddle.nn.SyncBatchNorm): r""" This interface is used to construct a callable object of the ``SyncBatchNorm`` class. - It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can - be used as a normalizer function for other operations, such as conv2d and fully connected + It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can + be used as a normalizer function for other operations, such as conv2d and fully connected operations. The data is normalized by the mean and variance of the channel based on whole mini-batch , which including data in all gpus. @@ -173,7 +173,7 @@ class SyncBatchNorm(paddle.nn.SyncBatchNorm): Internal Covariate Shift `_ for more details. - When model in training mode, the :math:`\\mu_{\\beta}` + When model in training mode, the :math:`\\mu_{\\beta}` and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus. Calculated as follows: @@ -188,7 +188,7 @@ class SyncBatchNorm(paddle.nn.SyncBatchNorm): - :math:`m` : the size of the whole mini-batch data When model in evaluation mode, the :math:`\\mu_{\\beta}` - and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance, + and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance, which usually got from the pre-trained model). Global statistics calculated as follows: .. math:: @@ -196,7 +196,7 @@ class SyncBatchNorm(paddle.nn.SyncBatchNorm): moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The formula of normalization is as follows: - + .. 
math:: \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ @@ -205,12 +205,12 @@ class SyncBatchNorm(paddle.nn.SyncBatchNorm): - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero - :math:`\gamma` : trainable scale parameter vector - - :math:`\beta` : trainable shift parameter vector + - :math:`\beta` : trainable shift parameter vector Note: - If you want to use container to pack your model and has ``SyncBatchNorm`` in the - evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of - ``list`` to pack the model. + If you want to use container to pack your model and has ``SyncBatchNorm`` in the + evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of + ``list`` to pack the model. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -219,12 +219,12 @@ class SyncBatchNorm(paddle.nn.SyncBatchNorm): weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of this layer. If it is set to None or one attribute of ParamAttr, this layerr will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. If it is set to False, + is not set, the parameter is initialized with Xavier. If it is set to False, this layer will not have trainable scale parameter. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer. If it is set to None or one attribute of ParamAttr, this layer will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. If it is set to False, this layer will not + is not set, the bias is initialized zero. If it is set to False, this layer will not have trainable bias parameter. Default: None. Shapes: diff --git a/python/paddle/incubate/sparse/unary.py b/python/paddle/incubate/sparse/unary.py index 472a71d482b..621e31bc3e8 100644 --- a/python/paddle/incubate/sparse/unary.py +++ b/python/paddle/incubate/sparse/unary.py @@ -33,7 +33,7 @@ _int_dtype_ = [ def sin(x, name=None): """ Calculate elementwise sin of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = sin(x) @@ -54,7 +54,7 @@ def sin(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.sin(sparse_x) - + """ return _C_ops.sparse_sin(x) @@ -63,7 +63,7 @@ def sin(x, name=None): def tan(x, name=None): """ Calculate elementwise tan of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = tan(x) @@ -84,7 +84,7 @@ def tan(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.tan(sparse_x) - + """ return _C_ops.sparse_tan(x) @@ -93,7 +93,7 @@ def tan(x, name=None): def asin(x, name=None): """ Calculate elementwise asin of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = asin(x) @@ -114,7 +114,7 @@ def asin(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.asin(sparse_x) - + """ return _C_ops.sparse_asin(x) @@ -123,7 +123,7 @@ def asin(x, name=None): def atan(x, name=None): """ Calculate elementwise atan of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. 
math:: out = atan(x) @@ -144,7 +144,7 @@ def atan(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.atan(sparse_x) - + """ return _C_ops.sparse_atan(x) @@ -153,7 +153,7 @@ def atan(x, name=None): def sinh(x, name=None): """ Calculate elementwise sinh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = sinh(x) @@ -174,7 +174,7 @@ def sinh(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.sinh(sparse_x) - + """ return _C_ops.sparse_sinh(x) @@ -183,7 +183,7 @@ def sinh(x, name=None): def asinh(x, name=None): """ Calculate elementwise asinh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = asinh(x) @@ -204,7 +204,7 @@ def asinh(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.asinh(sparse_x) - + """ return _C_ops.sparse_asinh(x) @@ -213,7 +213,7 @@ def asinh(x, name=None): def atanh(x, name=None): """ Calculate elementwise atanh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = atanh(x) @@ -234,7 +234,7 @@ def atanh(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.atanh(sparse_x) - + """ return _C_ops.sparse_atanh(x) @@ -243,7 +243,7 @@ def atanh(x, name=None): def tanh(x, name=None): """ Calculate elementwise tanh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = tanh(x) @@ -260,11 +260,11 @@ def tanh(x, name=None): .. code-block:: python import paddle - + dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.tanh(sparse_x) - + """ return _C_ops.sparse_tanh(x) @@ -273,7 +273,7 @@ def tanh(x, name=None): def square(x, name=None): """ Calculate elementwise square of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = square(x) @@ -290,11 +290,11 @@ def square(x, name=None): .. code-block:: python import paddle - + dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.square(sparse_x) - + """ return _C_ops.sparse_square(x) @@ -303,7 +303,7 @@ def square(x, name=None): def sqrt(x, name=None): """ Calculate elementwise sqrt of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = sqrt(x) @@ -324,7 +324,7 @@ def sqrt(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.sqrt(sparse_x) - + """ return _C_ops.sparse_sqrt(x) @@ -354,7 +354,7 @@ def log1p(x, name=None): dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.log1p(sparse_x) - + """ return _C_ops.sparse_log1p(x) @@ -367,7 +367,7 @@ def cast(x, index_dtype=None, value_dtype=None, name=None): Parameters: x (Tensor): The input Sparse Tensor with data type float32, float64. - index_dtype (np.dtype|str, optional): Data type of the index of SparseCooTensor, + index_dtype (np.dtype|str, optional): Data type of the index of SparseCooTensor, or crows/cols of SparseCsrTensor. Can be uint8, int8, int16, int32, int64. value_dtype (np.dtype|str, optional): Data type of the value of SparseCooTensor, SparseCsrTensor. Can be bool, float16, float32, float64, int8, int32, int64, uint8. 
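    All of the elementwise functions in this file follow the same pattern: they act on the stored non-zero values and keep the sparse layout untouched. A combined sketch in the style of the per-op examples above (the values in the comments are ordinary ``tanh`` results, rounded):

    .. code-block:: python

        import paddle

        dense_x = paddle.to_tensor([-2., 0., 1.])
        sparse_x = dense_x.to_sparse_coo(1)

        # The op is applied to the stored values; the COO layout is preserved.
        out = paddle.incubate.sparse.tanh(sparse_x)
        print(out.to_dense())  # [-0.96402758, 0., 0.76159416]

        # cast can change the index dtype and the value dtype independently.
        out64 = paddle.incubate.sparse.cast(out, 'int32', 'float64')
        print(out64.dtype)  # paddle.float64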
@@ -385,7 +385,7 @@ def cast(x, index_dtype=None, value_dtype=None, name=None): dense_x = paddle.to_tensor([-2, 0, 1]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.cast(sparse_x, 'int32', 'float64') - + """ if index_dtype and not isinstance(index_dtype, core.VarDesc.VarType): index_dtype = convert_np_dtype_to_dtype_(index_dtype) @@ -420,7 +420,7 @@ def pow(x, factor, name=None): dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.pow(sparse_x, 2) - + """ return _C_ops.sparse_pow(x, float(factor)) @@ -450,7 +450,7 @@ def neg(x, name=None): dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.neg(sparse_x) - + """ return _C_ops.sparse_scale(x, -1.0, 0.0, True) @@ -480,7 +480,7 @@ def abs(x, name=None): dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.abs(sparse_x) - + """ return _C_ops.sparse_abs(x) @@ -541,7 +541,7 @@ def rad2deg(x, name=None): dense_x = paddle.to_tensor([3.142, 0., -3.142]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.rad2deg(sparse_x) - + """ if x.dtype in _int_dtype_: x = _C_ops.sparse_cast(x, None, core.VarDesc.VarType.FP32) @@ -553,7 +553,7 @@ def deg2rad(x, name=None): """ Convert each of the elements of input x from degrees to angles in radians, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: deg2rad(x) = \pi * x / 180 @@ -574,7 +574,7 @@ def deg2rad(x, name=None): dense_x = paddle.to_tensor([-180, 0, 180]) sparse_x = dense_x.to_sparse_coo(1) out = paddle.incubate.sparse.deg2rad(sparse_x) - + """ if x.dtype in _int_dtype_: x = _C_ops.sparse_cast(x, None, core.VarDesc.VarType.FP32) diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py index 745df5fccf7..005d2cee2dd 100644 --- a/python/paddle/incubate/tensor/math.py +++ b/python/paddle/incubate/tensor/math.py @@ -37,9 +37,9 @@ def segment_sum(data, segment_ids, name=None): Args: data (Tensor): A tensor, available data type float32, float64, int32, int64. segment_ids (Tensor): A 1-D tensor, which have the same size - with the first dimension of input data. + with the first dimension of input data. Available data type is int32, int64. - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -101,10 +101,10 @@ def segment_mean(data, segment_ids, name=None): Args: data (tensor): a tensor, available data type float32, float64, int32, int64. - segment_ids (tensor): a 1-d tensor, which have the same size - with the first dimension of input data. + segment_ids (tensor): a 1-d tensor, which have the same size + with the first dimension of input data. available data type is int32, int64. - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -167,9 +167,9 @@ def segment_min(data, segment_ids, name=None): Args: data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size - with the first dimension of input data. + with the first dimension of input data. available data type is int32, int64. 
- name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -233,9 +233,9 @@ def segment_max(data, segment_ids, name=None): Args: data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size - with the first dimension of input data. + with the first dimension of input data. available data type is int32, int64. - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 84b18b69e86..3a0e4efdb71 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -45,7 +45,7 @@ class Metric(object): for prediction, label in ...: m.update(prediction, label) m.accumulate() - + Advanced usage for :code:`compute`: Metric calculation can be accelerated by calculating metric states @@ -191,7 +191,7 @@ class Accuracy(Metric): is `acc`. Example by standalone: - + .. code-block:: python import numpy as np @@ -212,14 +212,14 @@ class Accuracy(Metric): Example with Model API: - + .. code-block:: python import paddle from paddle.static import InputSpec import paddle.vision.transforms as T from paddle.vision.datasets import MNIST - + input = InputSpec([None, 1, 28, 28], 'float32', 'image') label = InputSpec([None, 1], 'int64', 'label') transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) @@ -254,7 +254,7 @@ class Accuracy(Metric): label (Tensor): The ground truth value is Tensor with dtype int64. Shape is [batch_size, d0, ..., 1], or [batch_size, d0, ..., num_classes] in one hot representation. - + Return: Tensor: Correct mask, a tensor with shape [batch_size, d0, ..., topk]. """ @@ -280,7 +280,7 @@ class Accuracy(Metric): Update the metrics states (correct count and total count), in order to calculate cumulative accuracy of all instances. This function also returns the accuracy of current step. - + Args: correct: Correct mask, a tensor with shape [batch_size, d0, ..., topk]. @@ -345,7 +345,7 @@ class Precision(Metric): Default is `precision`. Example by standalone: - + .. code-block:: python import numpy as np @@ -361,27 +361,27 @@ class Precision(Metric): Example with Model API: - + .. code-block:: python import numpy as np - + import paddle import paddle.nn as nn - + class Data(paddle.io.Dataset): def __init__(self): super(Data, self).__init__() self.n = 1024 self.x = np.random.randn(self.n, 10).astype('float32') self.y = np.random.randint(2, size=(self.n, 1)).astype('float32') - + def __getitem__(self, idx): return self.x[idx], self.y[idx] - + def __len__(self): return self.n - + model = paddle.Model(nn.Sequential( nn.Linear(10, 1), nn.Sigmoid() @@ -392,7 +392,7 @@ class Precision(Metric): optim, loss=nn.BCELoss(), metrics=paddle.metric.Precision()) - + data = Data() model.fit(data, batch_size=16) """ @@ -478,7 +478,7 @@ class Recall(Metric): Default is `recall`. Example by standalone: - + .. code-block:: python import numpy as np @@ -494,27 +494,27 @@ class Recall(Metric): Example with Model API: - + .. 
code-block:: python import numpy as np - + import paddle import paddle.nn as nn - + class Data(paddle.io.Dataset): def __init__(self): super(Data, self).__init__() self.n = 1024 self.x = np.random.randn(self.n, 10).astype('float32') self.y = np.random.randint(2, size=(self.n, 1)).astype('float32') - + def __getitem__(self, idx): return self.x[idx], self.y[idx] - + def __len__(self): return self.n - + model = paddle.Model(nn.Sequential( nn.Linear(10, 1), nn.Sigmoid() @@ -525,7 +525,7 @@ class Recall(Metric): optim, loss=nn.BCELoss(), metrics=[paddle.metric.Precision(), paddle.metric.Recall()]) - + data = Data() model.fit(data, batch_size=16) """ @@ -626,48 +626,48 @@ class Auc(Metric): import paddle m = paddle.metric.Auc() - + n = 8 class0_preds = np.random.random(size = (n, 1)) class1_preds = 1 - class0_preds - + preds = np.concatenate((class0_preds, class1_preds), axis=1) labels = np.random.randint(2, size = (n, 1)) - + m.update(preds=preds, labels=labels) res = m.accumulate() Example with Model API: - + .. code-block:: python import numpy as np import paddle import paddle.nn as nn - + class Data(paddle.io.Dataset): def __init__(self): super(Data, self).__init__() self.n = 1024 self.x = np.random.randn(self.n, 10).astype('float32') self.y = np.random.randint(2, size=(self.n, 1)).astype('int64') - + def __getitem__(self, idx): return self.x[idx], self.y[idx] - + def __len__(self): return self.n - + model = paddle.Model(nn.Sequential( nn.Linear(10, 2), nn.Softmax()) ) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) - + def loss(x, y): return nn.functional.nll_loss(paddle.log(x), y) - + model.prepare( optim, loss=loss, @@ -767,12 +767,12 @@ class Auc(Metric): def accuracy(input, label, k=1, correct=None, total=None, name=None): """ accuracy layer. - Refer to the https://en.wikipedia.org/wiki/Precision_and_recall - + Refer to the https://en.wikipedia.org/wiki/Precision_and_recall + This function computes the accuracy using the input and label. If the correct label occurs in top k predictions, then correct will increment by one. Note: the dtype of accuracy is determined by input. the input and label dtype can be different. - + Args: input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64. The shape is ``[sample_number, class_dim]`` . @@ -782,15 +782,15 @@ def accuracy(input, label, k=1, correct=None, total=None, name=None): total(Tensor, optional): The total entries count. A tensor with type int64 or int32. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` - + Returns: Tensor, the correct rate. A Tensor with type float32. - + Examples: .. code-block:: python - + import paddle - + predictions = paddle.to_tensor([[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], dtype='float32') label = paddle.to_tensor([[2], [0]], dtype="int64") result = paddle.metric.accuracy(input=predictions, label=label, k=1) diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 7cebcbbfcab..a4cb052b594 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -965,21 +965,21 @@ def silu(x, name=None): .. math:: silu(x) = \frac{x}{1 + e^{-x}} - + Parameters: x (Tensor): The input Tensor with data type float32, float64. name (str, optional): Name for the operation (optional, default is None). 
For more information, please refer to :ref:`api_guide_Name`. - + Returns: A Tensor with the same data type and shape as ``x`` . - + Examples: .. code-block:: python import paddle import paddle.nn.functional as F - + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) out = F.silu(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] """ @@ -1237,7 +1237,7 @@ def softshrink(x, threshold=0.5, name=None): .. math:: - softshrink(x)= + softshrink(x)= \left\{ \begin{array}{rcl} x - threshold,& & \text{if } x > threshold \\ @@ -1378,7 +1378,7 @@ def mish(x, name=None): \end{cases} mish(x) = x * \tanh(softplus(x)) - + Parameters: x (Tensor): The input Tensor with data type float32, float64. name (str, optional): Name for the operation (optional, default is None). @@ -1454,7 +1454,7 @@ def thresholded_relu(x, threshold=1.0, name=None): .. math:: - thresholded\_relu(x) = + thresholded\_relu(x) = \left\{ \begin{array}{rl} x,& \text{if } \ x > threshold \\ @@ -1507,7 +1507,7 @@ def log_softmax(x, axis=-1, dtype=None, name=None): .. math:: - \begin{aligned} + \begin{aligned} log\_softmax[i, j] &= log(softmax(x)) \\ &= log(\frac{\exp(X[i, j])}{\sum_j(\exp(X[i, j])}) \end{aligned} @@ -1600,7 +1600,7 @@ def log_softmax(x, axis=-1, dtype=None, name=None): def glu(x, axis=-1, name=None): r""" - The gated linear unit. The input is evenly splited into 2 parts along a + The gated linear unit. The input is evenly splited into 2 parts along a given axis. The first part is used as the content, and the second part is passed through a sigmoid function then used as the gate. The output is a elementwise multiplication of the content and the gate. @@ -1611,23 +1611,23 @@ def glu(x, axis=-1, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. - axis (int, optional): The axis along which split the input tensor. It - should be in range [-D, D), where D is the dimensions of ``x`` . - If ``axis`` < 0, it works the same way as :math:`axis + D` . + axis (int, optional): The axis along which split the input tensor. It + should be in range [-D, D), where D is the dimensions of ``x`` . + If ``axis`` < 0, it works the same way as :math:`axis + D` . Default is -1. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: - A Tensor with the same data type as x. The size of the given aixs is + A Tensor with the same data type as x. The size of the given aixs is halved. - + Examples: .. code-block:: python - + import paddle from paddle.nn import functional as F - + x = paddle.to_tensor( [[-0.22014759, -1.76358426, 0.80566144, 0.04241343], [-1.94900405, -1.89956081, 0.17134808, -1.11280477]] @@ -1635,7 +1635,7 @@ def glu(x, axis=-1, name=None): print(F.glu(x).numpy()) # array([[-0.15216254, -0.9004892 ], # [-1.0577879 , -0.46985325]], dtype=float32) - + """ check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], "glu") @@ -1668,24 +1668,24 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None): gumbel\_softmax(v_i)=\frac{e^{v_i/t}}{\sum_{j=1}^n{e^{v_j/t}}},i=1,2,3...n Parameters: - x (Tensor): An N-D Tensor, the first N - 1 dimensions index into a batch - of independent distributions and the last dimension represents + x (Tensor): An N-D Tensor, the first N - 1 dimensions index into a batch + of independent distributions and the last dimension represents a vector of probabilities with datatype float32, float64. temperature (float, optional): non-negative scalar temperature. Default is 1.0. 
- hard (bool, optional): if True, the returned samples will be discretized as - one-hot vectors, but will be differentiated as if it is the soft sample + hard (bool, optional): if True, the returned samples will be discretized as + one-hot vectors, but will be differentiated as if it is the soft sample in autograd. Default is False. - axis (int, optional): The axis along will be calculated softmax value. + axis (int, optional): The axis along will be calculated softmax value. Default is -1. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: - Sampled tensor of same shape as ``x`` from the Gumbel-Softmax distribution. - If ``hard = True``, the returned samples will be one-hot, otherwise they will be + Sampled tensor of same shape as ``x`` from the Gumbel-Softmax distribution. + If ``hard = True``, the returned samples will be one-hot, otherwise they will be probability distributions that sum to 1 across ``axis``. - + Examples: .. code-block:: python @@ -1701,7 +1701,7 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None): # [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 1. ], # [0.00000062, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.99999940], # [0.00000000, 0.00000000, 0.00000000, 0.00001258, 0.99998736, 0.00000000]] - + """ if in_dygraph_mode(): return _C_ops.gumbel_softmax(x, temperature, hard, axis) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index d4d3e9759e0..4171281473d 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -191,9 +191,9 @@ def interpolate(x, 'bicubic' : Bicubic interpolation 'area': Area interpolation - Linear interpolation is the method of using a line connecting two known quantities - to determine the value of an unknown quantity between the two known quantities. - + Linear interpolation is the method of using a line connecting two known quantities + to determine the value of an unknown quantity between the two known quantities. + Nearest neighbor interpolation is to perform nearest neighbor interpolation in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. @@ -218,8 +218,8 @@ def interpolate(x, Area interpolation is to perform area interpolation in both the 3rd dimension(in height direction) , the 4th dimension(in width - direction) and the 5th dimension(in depth direction) on input tensor. Set to - area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or + direction) and the 5th dimension(in depth direction) on input tensor. Set to + area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. Example: @@ -242,7 +242,7 @@ def interpolate(x, input : (N,C,W_in) output: (N,C,W_out) where: W_out = W_{in} * scale_{factor} - + Nearest neighbor interpolation: align_corners = False @@ -294,25 +294,25 @@ def interpolate(x, For details of linear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Linear_interpolation. - + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. - + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation. 
- + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation. - + For details of bicubic interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bicubic_interpolation - + Parameters: x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. size (list|tuple|Tensor|None): Output shape of image resize - layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) - when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. + layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) + when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1]. If a Tensor, its dimensions size should be a 1. scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At @@ -664,9 +664,9 @@ def upsample(x, 'trilinear' : Trilinear interpolation 'nearest' : Nearest neighbor interpolation 'bicubic' : Bicubic interpolation - Linear interpolation is the method of using a line connecting two known quantities - to determine the value of an unknown quantity between the two known quantities. - + Linear interpolation is the method of using a line connecting two known quantities + to determine the value of an unknown quantity between the two known quantities. + Nearest neighbor interpolation is to perform nearest neighbor interpolation in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. @@ -675,7 +675,7 @@ def upsample(x, W-direction in this op) on a rectilinear 2D grid. The key idea is to perform linear interpolation first in one direction, and then again in the other direction. - + Bicubic interpolation is an extension of cubic interpolation for interpolating data points on a two-dimensional regular grid. The interpolated surface is smoother than corresponding surfaces obtained by bilinear interpolation or @@ -697,7 +697,7 @@ def upsample(x, Example: .. code-block:: text - + For scale_factor: if align_corners = True && out_size > 1 : scale_factor = (in_size-1.0)/(out_size-1.0) @@ -726,7 +726,7 @@ def upsample(x, output: (N,C,H_out,W_out) where: H_out = round(H_{in} * scale_{factor}) W_out = round(W_{in} * scale_{factor}) - + Bilinear interpolation: if: align_corners = False , align_mode = 0 @@ -767,30 +767,30 @@ def upsample(x, W_out = W_{in} * scale_{factor} https://en.wikipedia.org/wiki/Linear_interpolation. For details of linear interpolation, please refer to Wikipedia: - + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. - + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation. - + For details of bicubic interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bicubic_interpolation - + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation. - + Parameters: x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. 
size (list|tuple|Tensor|None, optional): Output shape of image resize - layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) - when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. + layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) + when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1]. If a Tensor , its dimensions size should be a 1. scale_factor (float|Tensor|list|tuple|None, optional): The multiplier for the input height or width. At least one of :attr:`size` or :attr:`scale_factor` must be set. - And :attr:`size` has a higher priority than :attr:`scale_factor`.Has to match input size if + And :attr:`size` has a higher priority than :attr:`scale_factor`.Has to match input size if it is either a list or a tuple or a Tensor. Default: None. mode (str, optional): The resample method. It supports 'linear', 'nearest', 'bilinear', @@ -817,7 +817,7 @@ def upsample(x, Examples: .. code-block:: python - + import paddle import paddle.nn as nn @@ -1353,11 +1353,11 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): Parameters: x (Tensor): The input tensor with data type float32/double/int32/int64_t. pad (Tensor|list[int]|tuple[int]): The padding size with data type int. - If mode is 'constant' and length of pad is twice as length of x dimension, then x will + If mode is 'constant' and length of pad is twice as length of x dimension, then x will be padded from the first dimension to the last dimension. Else: 1. If input dimension is 3, then the pad has the form (pad_left, - pad_right). 2. If the input dimension is 4, then the pad has the form (pad_left, pad_right, - pad_top, pad_bottom). 3. If the input dimension is 5, then the pad has the form + pad_right). 2. If the input dimension is 4, then the pad has the form (pad_left, pad_right, + pad_top, pad_bottom). 3. If the input dimension is 5, then the pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back). mode (str, optional): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. Default is 'constant' @@ -1370,12 +1370,12 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): data_format (str, optional): An string from: "NCL", "NLC", NHWC", "NCHW", "NCDHW", "NDHWC". Specify the data format of the input data. Default is "NCHW", name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - - Returns: + + Returns: Tensor, a Tensor padded according to pad and mode and data type is same as input. Example: - + .. code-block:: text x = [[[[[1., 2., 3.], @@ -1428,21 +1428,21 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): import paddle import paddle.nn.functional as F - + # example 1 x_shape = (1, 1, 3) x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1 y = F.pad(x, [0, 0, 0, 0, 2, 3], value=1, mode='constant', data_format="NCL") print(y) # [[[1. 1. 1. 2. 3. 1. 1. 1.]]] - + # example 2 x_shape = (1, 1, 3) x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1 y = F.pad(x, [2, 3], value=1, mode='constant', data_format="NCL") print(y) # [[[1. 1. 1. 2. 3. 1. 1. 
1.]]] - + # example 3 x_shape = (1, 1, 2, 3) x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1 @@ -1601,7 +1601,7 @@ def zeropad2d(x, padding, data_format="NCHW", name=None): name(str, optional): The default value is None. Normally there is no need for user to set this property. - Returns: + Returns: Tensor, padded with 0 according to pad and data type is same as input. Examples: @@ -1637,8 +1637,8 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): x2 (Tensor): Second input. float32/double. axis (int, optional): Dimension of vectors to compute cosine similarity. Default is 1. eps(float, optional): Small value to avoid division by zero. Default is 1e-8. - - Returns: + + Returns: Tensor, a Tensor representing cosine similarity between x1 and x2 along axis. Examples: @@ -1670,7 +1670,7 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): result = paddle.nn.functional.cosine_similarity(x1, x2, axis=0) print(result) # [0.97689527, 0.99996042, -0.55138415] - + """ w12 = sum(paddle.multiply(x1, x2), axis=axis) w1 = sum(paddle.multiply(x1, x1), axis=axis) @@ -1696,7 +1696,7 @@ def linear(x, weight, bias=None, name=None): input should be a multi-dimensional tensor of shape :math:`[batch\_size, *, in\_features]` , where :math:`*` means any number of additional dimensions. The linear operator multiplies input tensor with - weight and produces an output tensor of shape :math:`[batch\_size, *, out\_features]` , + weight and produces an output tensor of shape :math:`[batch\_size, *, out\_features]` , If :math:`bias` is not None, the bias should be a 1-D tensor of shape :math:`[out\_features]` and will be added to the output. @@ -1714,9 +1714,9 @@ def linear(x, weight, bias=None, name=None): Examples: .. code-block:: python - + import paddle - + x = paddle.randn((3, 2), dtype="float32") # x: [[-0.32342386 -1.200079 ] # [ 0.7979031 -0.90978354] @@ -1819,7 +1819,7 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): import paddle import numpy as np - + x_data = np.array([[[0, 1, 0], [ 1, 0, 1]]]).astype("float32") print(x_data.shape) @@ -1827,7 +1827,7 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): x = paddle.to_tensor(x_data, stop_gradient=False) output = paddle.nn.functional.label_smooth(x) print(output) - + #[[[0.03333334 0.93333334 0.03333334] # [0.93333334 0.03333334 0.93333334]]] """ @@ -1860,19 +1860,19 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): def class_center_sample(label, num_classes, num_samples, group=None): """ Class center sample method is proposed from the paper PartialFC that only sample a subset of the class centers. - The process of sampling subset class centers is straightforward: + The process of sampling subset class centers is straightforward: 1. First select the positive class centers; 2. Then randomly sample negative class centers. - Specifically, given a label tensor, shape [batch_size], select all the positive class centers and randomly + Specifically, given a label tensor, shape [batch_size], select all the positive class centers and randomly sample negative class centers, then remap the input label tensor using the sampled class centers. For more information, Partial FC: Training 10 Million Identities on a Single Machine arxiv: https://arxiv.org/abs/2010.05222 - + .. 
hint:: - If the number of the positive class centers is greater than the input num_samples, it keeps all the positive + If the number of the positive class centers is greater than the input num_samples, it keeps all the positive class centers and the shape of sampled_class_center will be [num_positive_class_centers]. The API supports CPU, single GPU and multi GPU. @@ -1886,7 +1886,7 @@ def class_center_sample(label, num_classes, num_samples, group=None): num_classes (int): A positive integer to specify the number of classes at local rank. Note that num_classes of each GPU can be different. num_samples (int): A positive integer to specify the number of class center to sample. - group (Group, optional): The group instance return by paddle.distributed.new_group + group (Group, optional): The group instance return by paddle.distributed.new_group or ``None`` for global default group or ``False`` for data parallel (do not communication cross ranks). Default is ``None``. @@ -1952,7 +1952,7 @@ def class_center_sample(label, num_classes, num_samples, group=None): # [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ]) #Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True, # [0, 2, 4, 8, 9, 3]) - + # rank 1 output: #Tensor(shape=[20], dtype=int64, place=CUDAPlace(1), stop_gradient=True, # [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ]) @@ -2048,17 +2048,17 @@ def fold(x, dilations=1, name=None): r""" - + Combines an array of sliding local blocks into a large containing - tensor. also known as col2im when operated on batched 2D image tensor. Fold calculates each - combined value in the resulting large tensor by summing all values from all containing blocks. + tensor. also known as col2im when operated on batched 2D image tensor. Fold calculates each + combined value in the resulting large tensor by summing all values from all containing blocks. For each input :math:`x` with shape [N, C_in , L], the output shape [N, C_out, H_out, W_out] can be calculated as following. .. math:: - + H_{out} &= output\_size[0] \\ W_{out} &= output\_size[1] \\ C_{out} &= \frac{C_{in}}{kernel\_sizes[0]\times kernel\_sizes[1]} \\ diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 06784f5d13c..f600e48f6f8 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -281,10 +281,10 @@ def conv1d(x, L_{out} = \frac{(L_{in} + 2 * padding - (dilation * (L_f - 1) + 1))}{stride} + 1 Args: - x (Tensor): The input is 3-D Tensor with shape [N, C, L], the data type + x (Tensor): The input is 3-D Tensor with shape [N, C, L], the data type of input is float16 or float32 or float64. weight (Tensor): The convolution kernel with shape [M, C/g, K], where M is - the number of output channels, g is the number of groups, K is the kernel's size. + the number of output channels, g is the number of groups, K is the kernel's size. bias (Tensor, optional): The bias with shape [M,]. Default: None. stride (int|list|tuple, optional): The stride size. If stride is a list/tuple, it must contain one integers, (stride_size). Default: 1. @@ -302,23 +302,23 @@ def conv1d(x, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: 1. 
- data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCL"`, `"NLC"`. The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of: `[batch_size, input_channels, feature_length]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: - A tensor representing the conv1d, whose data type is the + A tensor representing the conv1d, whose data type is the same with input. Raises: ValueError: If the channel dimension of the input is less than or equal to zero. ValueError: If `data_format` is not "NCL" or "NLC". ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 + ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 or the element corresponding to the input's channel is not 0. ShapeError: If the input is not 3-D Tensor. ShapeError: If the input's dimension size and filter's dimension size not equal. @@ -342,13 +342,13 @@ def conv1d(x, [[0, 3, 4], [2, 9, 7], [5, 6, 8]]]).astype(np.float32) - + x_var = paddle.to_tensor(x) w_var = paddle.to_tensor(w) y_var = F.conv1d(x_var, w_var) y_np = y_var.numpy() print(y_np) - + # [[[133. 238.] # [160. 211.]]] """ @@ -528,50 +528,50 @@ def conv2d(x, W_{out}&= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 Args: - x (Tensor): The input is 4-D Tensor with shape [N, C, H, W], the data type + x (Tensor): The input is 4-D Tensor with shape [N, C, H, W], the data type of input is float16 or float32 or float64. weight (Tensor): The convolution kernel with shape [M, C/g, kH, kW], where M is the number of output channels, g is the number of groups, kH is the filter's - height, kW is the filter's width. + height, kW is the filter's width. bias (Tensor, optional): The bias with shape [M,]. - stride (int|list|tuple): The stride size. It means the stride in convolution. - If stride is a list/tuple, it must contain two integers, (stride_height, stride_width). + stride (int|list|tuple): The stride size. It means the stride in convolution. + If stride is a list/tuple, it must contain two integers, (stride_height, stride_width). Otherwise, stride_height = stride_width = stride. Default: stride = 1. padding (string|int|list|tuple): The padding size. It means the number of zero-paddings on both sides for each dimension.If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_height, pad_width]` or - `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when - `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0], + `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when + `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. 
when `data_format` is `"NHWC"`, `padding` can be in the form `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. dilation (int|list|tuple): The dilation size. It means the spacing between the kernel - points. If dilation is a list/tuple, it must contain two integers, (dilation_height, - dilation_width). Otherwise, dilation_height = dilation_width = dilation. + points. If dilation is a list/tuple, it must contain two integers, (dilation_height, + dilation_width). Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1. groups (int): The groups number of the Conv2D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1. - data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: - A Tensor representing the conv2d result, whose data type is the same with input. + A Tensor representing the conv2d result, whose data type is the same with input. Raises: ValueError: If `data_format` is not "NCHW" or "NHWC". ValueError: If the channel dimension of the input is less than or equal to zero. ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 + ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 or the element corresponding to the input's channel is not 0. ShapeError: If the input is not 4-D Tensor. ShapeError: If the input's dimension size and filter's dimension size not equal. @@ -766,12 +766,12 @@ def conv1d_transpose(x, output_size(int|tuple|list, optional): The output image size. If output size is a tuple/list, it must contain one integer, `(feature_length)`. None if use filter_size(shape of weight), padding, and stride to calculate output_size. - data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCL"`, `"NLC"`. The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of: `[batch_size, input_channels, input_length]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: @@ -783,7 +783,7 @@ def conv1d_transpose(x, Raises: ValueError: If `data_format` is a string, but not "NCL" or "NLC". 
ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 + ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 or the element corresponding to the input's channel is not 0. ValueError: If `output_size` and filter_size are None at the same time. ValueError: If `output_padding` is greater than `stride`. @@ -801,7 +801,7 @@ def conv1d_transpose(x, import paddle import paddle.nn.functional as F import numpy as np - + # shape: (1, 2, 4) x=np.array([[[4, 0, 9, 7], [8, 0, 9, 2,]]]).astype(np.float32) @@ -812,7 +812,7 @@ def conv1d_transpose(x, w_var = paddle.to_tensor(w) y_var = F.conv1d_transpose(x_var, w_var) print(y_var) - + # [[[60. 16. 99. 75. 4.]]] """ cudnn_version = get_cudnn_version() @@ -1008,12 +1008,12 @@ def conv2d_transpose(x, W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ] Note: - The conv2d_transpose can be seen as the backward of the conv2d. For conv2d, - when stride > 1, conv2d maps multiple input shape to the same output shape, + The conv2d_transpose can be seen as the backward of the conv2d. For conv2d, + when stride > 1, conv2d maps multiple input shape to the same output shape, so for conv2d_transpose, when stride > 1, input shape maps multiple output shape. - If output_size is None, :math:`H_{out} = H^\prime_{out}, W_{out} = W^\prime_{out}`; - else, the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` - and :math:`H^\prime_{out} + strides[0]`, and the :math:`W_{out}` of the output size must + If output_size is None, :math:`H_{out} = H^\prime_{out}, W_{out} = W^\prime_{out}`; + else, the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` + and :math:`H^\prime_{out} + strides[0]`, and the :math:`W_{out}` of the output size must between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[1]`. Args: @@ -1023,17 +1023,17 @@ def conv2d_transpose(x, where M is the number of output channels(filters), g is the number of groups, kH is the height of the kernel, and kW is the width of the kernel. bias(Tensor, optional): The bias, a Tensor with shape [M, ]. - stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. - If stride is a list/tuple, it must contain two integers, (stride_height, stride_width). + stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. + If stride is a list/tuple, it must contain two integers, (stride_height, stride_width). Otherwise, stride_height = stride_width = stride. Default: stride = 1. - padding(str|int|list|tuple, optional): The padding size. It means the number of zero-paddings - on both sides for each dimension. If `padding` is a string, either 'VALID' or + padding(str|int|list|tuple, optional): The padding size. It means the number of zero-paddings + on both sides for each dimension. If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, - it could be in three forms: `[pad_height, pad_width]` or + it could be in three forms: `[pad_height, pad_width]` or `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCHW"`, `padding` can be in the form + and when `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. 
- when `data_format` is `"NHWC"`, `padding` can be in the form + when `data_format` is `"NHWC"`, `padding` can be in the form `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side @@ -1044,30 +1044,30 @@ def conv2d_transpose(x, first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups = 1. - dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. - If dilation is a list/tuple, it must contain two integers, (dilation_height, dilation_width). + dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. + If dilation is a list/tuple, it must contain two integers, (dilation_height, dilation_width). Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1. output_size(int|tuple|list, optional): The output image size. If output size is a tuple/list, it must contain two integers, (image_height, image_width). None if use filter_size(shape of weight), padding, and stride to calculate output_size. - data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: A Tensor representing the conv2d_transpose, whose - data type is the same with input and shape is (num_batches, channels, out_h, - out_w) or (num_batches, out_h, out_w, channels). The tensor variable storing + data type is the same with input and shape is (num_batches, channels, out_h, + out_w) or (num_batches, out_h, out_w, channels). The tensor variable storing transposed convolution result. Raises: ValueError: If `data_format` is not "NCHW" or "NHWC". ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 + ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 or the element corresponding to the input's channel is not 0. ValueError: If `output_size` and kernel_size are None at the same time. ShapeError: If the input is not 4-D Tensor. @@ -1272,16 +1272,16 @@ def conv3d(x, W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 Args: - x (Tensor): The input is 5-D Tensor with shape [N, C, D, H, W], the data + x (Tensor): The input is 5-D Tensor with shape [N, C, D, H, W], the data type of input is float16 or float32 or float64. weight (Tensor): The convolution kernel, a Tensor with shape [M, C/g, kD, kH, kW], where M is the number of filters(output channels), g is the number of groups, kD, kH, kW are the filter's depth, height and width respectively. bias (Tensor, optional): The bias, a Tensor of shape [M, ]. - stride (int|list|tuple, optional): The stride size. 
It means the stride in convolution. If stride is a - list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). + stride (int|list|tuple, optional): The stride size. It means the stride in convolution. If stride is a + list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. - padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings + padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings on both sides for each dimension. If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or @@ -1291,27 +1291,27 @@ def conv3d(x, when `data_format` is `"NDHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. - dilation (int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. + dilation (int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height, - dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. + dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1. groups (int, optional): The groups number of the Conv3D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1 - data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`. The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. - name(str|None, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str|None, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: - A Tensor representing the conv3d, whose data type is - the same with input. If act is None, the tensor storing the - convolution result, and if act is not None, the tensor storing + A Tensor representing the conv3d, whose data type is + the same with input. If act is None, the tensor storing the + convolution result, and if act is not None, the tensor storing convolution and non-linearity activation result. Examples: @@ -1440,28 +1440,28 @@ def conv3d_transpose(x, W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[2] ] Note: - The conv3d_transpose can be seen as the backward of the conv3d. For conv3d, - when stride > 1, conv3d maps multiple input shape to the same output shape, + The conv3d_transpose can be seen as the backward of the conv3d. 
For conv3d, + when stride > 1, conv3d maps multiple input shape to the same output shape, so for conv3d_transpose, when stride > 1, input shape maps multiple output shape. If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \ - H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output - size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, - the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` - and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must + H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output + size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, + the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` + and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`. Args: - x(Tensor): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type + x(Tensor): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type of input is float32 or float64. weight (Tensor): The convolution kernel, a Tensor with shape [C, M/g, kD, kH, kW], where M is the number of filters(output channels), g is the number of groups, kD, kH, kW are the filter's depth, height and width respectively. bias (Tensor, optional): The bias, a Tensor of shape [M, ]. - stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. - If stride is a list/tuple, it must contain three integers, (stride_depth, stride_height, - stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. + stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. + If stride is a list/tuple, it must contain three integers, (stride_depth, stride_height, + stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. - padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings + padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings on both sides for each dimension. If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or @@ -1479,32 +1479,32 @@ def conv3d_transpose(x, first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1 - dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. - If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height, - dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. + dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. + If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height, + dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1. output_size(int|list|tuple, optional): The output image size. If output size is a list/tuple, it must contain three integers, (image_depth, image_height, image_width). 
None if use filter_size(shape of weight), padding, and stride to calculate output_size. - data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: A Tensor representing the conv3d_transpose, whose data - type is the same with input and shape is (num_batches, channels, out_d, out_h, - out_w) or (num_batches, out_d, out_h, out_w, channels). If act is None, the tensor - variable storing the transposed convolution result, and if act is not None, the tensor + type is the same with input and shape is (num_batches, channels, out_d, out_h, + out_w) or (num_batches, out_d, out_h, out_w, channels). If act is None, the tensor + variable storing the transposed convolution result, and if act is not None, the tensor variable storing transposed convolution and non-linearity activation result. Raises: ValueError: If `data_format` is not "NCDHW" or "NDHWC". ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 + ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 or the element corresponding to the input's channel is not 0. ValueError: If `output_size` and kernel_size are None at the same time. ShapeError: If the input is not 5-D Tensor. @@ -1515,7 +1515,7 @@ def conv3d_transpose(x, Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 7ae35666c86..e3c7e96939c 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -33,8 +33,8 @@ __all__ = [] def diag_embed(input, offset=0, dim1=-2, dim2=-1): """ - This OP creates a tensor whose diagonals of certain 2D planes (specified by dim1 and dim2) - are filled by ``input``. By default, a 2D plane formed by the last two dimensions + This OP creates a tensor whose diagonals of certain 2D planes (specified by dim1 and dim2) + are filled by ``input``. By default, a 2D plane formed by the last two dimensions of the returned tensor will be selected. The argument ``offset`` determines which diagonal is generated: @@ -48,16 +48,16 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1): offset(int, optional): Which diagonal to consider. Default: 0 (main diagonal). dim1(int, optional): The first dimension with respect to which to take diagonal. Default: -2. dim2(int, optional): The second dimension with respect to which to take diagonal. Default: -1. - + Returns: Tensor, the output data type is the same as input data type. - + Examples: .. 
code-block:: python import paddle.nn.functional as F import numpy as np - + diag_embed = np.random.randn(2, 3).astype('float32') # [[ 0.7545889 , -0.25074545, 0.5929117 ], # [-0.6097662 , -0.01753256, 0.619769 ]] @@ -191,7 +191,7 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. - Returns: + Returns: Tensor, The output sequence mask. Tensor with shape [d_1, d_2, ..., d_n, maxlen] \ and data type of :code:`dtype`. The data type should be bool, float32, float64, int8, \ int32 or int64. diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index 0ed7d314b08..62bced8cb22 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -128,7 +128,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): otherwise the program will throw an exception and exit. .. code-block:: text - + x is a Tensor. padding_idx = -1 x.data = [[1, 3], [2, 4], [4, 127]] @@ -171,7 +171,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): Examples: .. code-block:: python - + import paddle import paddle.nn as nn diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index c0742bdbf40..3f5637fa392 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -166,17 +166,17 @@ def fluid_softmax_with_cross_entropy(logits, axis=-1): r""" - This operator implements the cross entropy loss function with softmax. This function - combines the calculation of the softmax operation and the cross entropy loss function + This operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function to provide a more numerically stable gradient. Because this operator performs a softmax on logits internally, it expects unscaled logits. This operator should not be used with the output of softmax operator since that would produce incorrect results. - When the attribute :attr:`soft_label` is set :attr:`False`, this operators - expects mutually exclusive hard labels, each sample in a batch is in exactly - one class with a probability of 1.0. Each sample in the batch will have a + When the attribute :attr:`soft_label` is set :attr:`False`, this operators + expects mutually exclusive hard labels, each sample in a batch is in exactly + one class with a probability of 1.0. Each sample in the batch will have a single label. The equation is as follows: @@ -203,27 +203,27 @@ def fluid_softmax_with_cross_entropy(logits, Args: logits (Tensor): A multi-dimension ``Tensor`` , and the data type is float32 or float64. The input tensor of unscaled log probabilities. label (Tensor): The ground truth ``Tensor`` , data type is the same - as the ``logits`` . If :attr:`soft_label` is set to :attr:`True`, - Label is a ``Tensor`` in the same shape with :attr:`logits`. - If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` + as the ``logits`` . If :attr:`soft_label` is set to :attr:`True`, + Label is a ``Tensor`` in the same shape with :attr:`logits`. + If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` in the same shape with :attr:`logits` expect shape in dimension :attr:`axis` as 1. soft_label (bool, optional): A flag to indicate whether to interpretant the given labels as soft labels. Default False. 
ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. Only valid - if :attr:`soft_label` is set to :attr:`False`. + if :attr:`soft_label` is set to :attr:`False`. Default: kIgnoreIndex(-100). numeric_stable_mode (bool, optional): A flag to indicate whether to use a more numerically stable algorithm. Only valid - when :attr:`soft_label` is :attr:`False` - and GPU is used. When :attr:`soft_label` - is :attr:`True` or CPU is used, the + when :attr:`soft_label` is :attr:`False` + and GPU is used. When :attr:`soft_label` + is :attr:`True` or CPU is used, the algorithm is always numerically stable. Note that the speed may be slower when use stable algorithm. Default: True. return_softmax (bool, optional): A flag indicating whether to return the softmax along with the cross entropy loss. Default: False. - axis (int, optional): The index of dimension to perform softmax calculations. It + axis (int, optional): The index of dimension to perform softmax calculations. It should be in range :math:`[-1, rank - 1]`, while :math:`rank` is the rank of input :attr:`logits`. Default: -1. @@ -300,42 +300,42 @@ def fluid_softmax_with_cross_entropy(logits, def npair_loss(anchor, positive, labels, l2_reg=0.002): - """ - + """ + Npair loss requires paired data. Npair loss has two parts: the first part is L2 regularizer on the embedding vector; the second part is cross entropy loss which takes the similarity matrix of anchor and positive as logits. - + For more information, please refer to: `Improved Deep Metric Learning with Multi class N pair Loss Objective `_ - + Args: - anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], + anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], the data type is float32 or float64. - positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], + positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], the data type is float32 or float64. labels(Tensor): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64. l2_reg(float32): L2 regularization term on embedding vector, default: 0.002. - + Returns: A Tensor representing the npair loss, the data type is the same as anchor, the shape is [1]. - + Examples: .. code-block:: python - + import paddle - + DATATYPE = "float32" - + anchor = paddle.rand(shape=(18, 6), dtype=DATATYPE) positive = paddle.rand(shape=(18, 6), dtype=DATATYPE) labels = paddle.rand(shape=(18,), dtype=DATATYPE) - + npair_loss = paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg = 0.002) print(npair_loss) - + """ check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'], 'npair_loss') @@ -1107,7 +1107,7 @@ def margin_ranking_loss(input, reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - Returns: + Returns: Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[1]`, otherwise the shape is the same as `input` .The same dtype as input tensor. 
Examples: @@ -1745,7 +1745,7 @@ def margin_cross_entropy(logits, margin2 (float, optional): m2 of margin loss, default value is `0.5`. margin3 (float, optional): m3 of margin loss, default value is `0.0`. scale (float, optional): s of margin loss, default value is `64.0`. - group (Group, optional): The group instance return by paddle.distributed.new_group + group (Group, optional): The group instance return by paddle.distributed.new_group or ``None`` for global default group or ``False`` for data parallel (do not communication cross ranks). Default is ``None``. return_softmax (bool, optional): Whether return softmax probability. Default value is `False`. @@ -1801,7 +1801,7 @@ def margin_cross_entropy(logits, print(label) print(loss) print(softmax) - + #Tensor(shape=[2, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[ 0.85204151, -0.55557678, 0.04994566, 0.71986042], # [-0.20198586, -0.35270476, -0.55182702, 0.09749021]]) @@ -1862,7 +1862,7 @@ def margin_cross_entropy(logits, print(loss) print(softmax) - # python -m paddle.distributed.launch --gpus=0,1 test_margin_cross_entropy.py + # python -m paddle.distributed.launch --gpus=0,1 test_margin_cross_entropy.py ## for rank0 input #Tensor(shape=[4, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[ 0.32888934, 0.02408748, -0.02763289, 0.18173063], @@ -2020,17 +2020,17 @@ def softmax_with_cross_entropy(logits, return_softmax=False, axis=-1): r""" - This operator implements the cross entropy loss function with softmax. This function - combines the calculation of the softmax operation and the cross entropy loss function + This operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function to provide a more numerically stable gradient. Because this operator performs a softmax on logits internally, it expects unscaled logits. This operator should not be used with the output of softmax operator since that would produce incorrect results. - When the attribute :attr:`soft_label` is set :attr:`False`, this operators - expects mutually exclusive hard labels, each sample in a batch is in exactly - one class with a probability of 1.0. Each sample in the batch will have a + When the attribute :attr:`soft_label` is set :attr:`False`, this operators + expects mutually exclusive hard labels, each sample in a batch is in exactly + one class with a probability of 1.0. Each sample in the batch will have a single label. The equation is as follows: @@ -2057,27 +2057,27 @@ def softmax_with_cross_entropy(logits, Args: logits (Tensor): A multi-dimension ``Tensor`` , and the data type is float32 or float64. The input tensor of unscaled log probabilities. label (Tensor): The ground truth ``Tensor`` , data type is the same - as the ``logits`` . If :attr:`soft_label` is set to :attr:`True`, - Label is a ``Tensor`` in the same shape with :attr:`logits`. - If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` + as the ``logits`` . If :attr:`soft_label` is set to :attr:`True`, + Label is a ``Tensor`` in the same shape with :attr:`logits`. + If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` in the same shape with :attr:`logits` expect shape in dimension :attr:`axis` as 1. soft_label (bool, optional): A flag to indicate whether to interpretant the given labels as soft labels. Default False. ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. 
Only valid - if :attr:`soft_label` is set to :attr:`False`. + if :attr:`soft_label` is set to :attr:`False`. Default: kIgnoreIndex(-100). numeric_stable_mode (bool, optional): A flag to indicate whether to use a more numerically stable algorithm. Only valid - when :attr:`soft_label` is :attr:`False` - and GPU is used. When :attr:`soft_label` - is :attr:`True` or CPU is used, the + when :attr:`soft_label` is :attr:`False` + and GPU is used. When :attr:`soft_label` + is :attr:`True` or CPU is used, the algorithm is always numerically stable. Note that the speed may be slower when use stable algorithm. Default: True. return_softmax (bool, optional): A flag indicating whether to return the softmax along with the cross entropy loss. Default: False. - axis (int, optional): The index of dimension to perform softmax calculations. It + axis (int, optional): The index of dimension to perform softmax calculations. It should be in range :math:`[-1, rank - 1]`, while :math:`rank` is the rank of input :attr:`logits`. Default: -1. @@ -2119,18 +2119,18 @@ def cross_entropy(input, use_softmax=True, name=None): r""" - By default, this operator implements the cross entropy loss function with softmax. This function - combines the calculation of the softmax operation and the cross entropy loss function - to provide a more numerically stable computing. + By default, this operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function + to provide a more numerically stable computing. This operator will calculate the cross entropy loss function without softmax when use_softmax=False. - By default, this operator will calculate the mean of the result, and you can also affect - the default behavior by using the reduction parameter. Please refer to the part of + By default, this operator will calculate the mean of the result, and you can also affect + the default behavior by using the reduction parameter. Please refer to the part of parameters for details. This operator can be used to calculate the softmax cross entropy loss with soft and hard labels. - Where, the hard labels mean the actual label value, 0, 1, 2, etc. And the soft labels + Where, the hard labels mean the actual label value, 0, 1, 2, etc. And the soft labels mean the probability of the actual label, 0.6, 0.8, 0.2, etc. The calculation of this operator includes the following two steps. @@ -2185,7 +2185,7 @@ def cross_entropy(input, 1.1. Hard labels (soft_label = False) .. math:: - \\loss_j=loss_j*weight[label_j] + \\loss_j=loss_j*weight[label_j] 1.2. Soft labels (soft_label = True) @@ -2195,21 +2195,21 @@ def cross_entropy(input, 2. reduction - 2.1 if the ``reduction`` parameter is ``none`` + 2.1 if the ``reduction`` parameter is ``none`` Return the previous result directly - 2.2 if the ``reduction`` parameter is ``sum`` + 2.2 if the ``reduction`` parameter is ``sum`` Return the sum of the previous results .. math:: \\loss=\sum_{j}loss_j - 2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to - the ``weight`` parameter as follows. + 2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to + the ``weight`` parameter as follows. - 2.3.1. If the ``weight`` parameter is ``None`` + 2.3.1. If the ``weight`` parameter is ``None`` Return the average value of the previous results @@ -2223,7 +2223,7 @@ def cross_entropy(input, 1. Hard labels (soft_label = False) .. 
math:: - \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] + \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] 2. Soft labels (soft_label = True) @@ -2236,11 +2236,11 @@ def cross_entropy(input, - **input** (Tensor) Input tensor, the data type is float32, float64. Shape is - :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . + :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . - Note: + Note: - 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the + 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results. 2. when use_softmax=False, it expects the output of softmax operator. @@ -2251,20 +2251,20 @@ def cross_entropy(input, :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1. the data type is int32, int64, float32, float64, where each value is [0, C-1]. - 2. If soft_label=True, the shape and data type should be same with ``input`` , + 2. If soft_label=True, the shape and data type should be same with ``input`` , and the sum of the labels for each sample should be 1. - **weight** (Tensor, optional) - a manual rescaling weight given to each class. - If given, has to be a Tensor of size C and the data type is float32, float64. + a manual rescaling weight given to each class. + If given, has to be a Tensor of size C and the data type is float32, float64. Default is ``'None'`` . - **ignore_index** (int64, optional) Specifies a target value that is ignored - and does not contribute to the loss. A negative value means that no label - value needs to be ignored. Only valid when soft_label = False. + and does not contribute to the loss. A negative value means that no label + value needs to be ignored. Only valid when soft_label = False. Default is ``-100`` . - **reduction** (str, optional) @@ -2278,14 +2278,14 @@ def cross_entropy(input, - **soft_label** (bool, optional) - Indicate whether label is soft. + Indicate whether label is soft. Default is ``False``. - **axis** (int, optional) - The index of dimension to perform softmax calculations. - It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the - number of dimensions of input :attr:`input`. + The index of dimension to perform softmax calculations. + It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the + number of dimensions of input :attr:`input`. Default is ``-1`` . - **use_softmax** (bool, optional) @@ -2307,9 +2307,9 @@ def cross_entropy(input, If :attr:`reduction` is ``'none'``: - 1. If soft_label = False, the dimension of return value is the same with ``label`` . + 1. If soft_label = False, the dimension of return value is the same with ``label`` . - 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . + 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . 
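As a quick numeric check of the weighted ``'mean'`` reduction described above, a minimal sketch (assuming the ``paddle.nn.functional.cross_entropy`` keyword arguments documented in this docstring; shapes and values below are arbitrary):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    logits = paddle.rand([4, 3], dtype='float64')
    labels = paddle.randint(0, 3, shape=[4], dtype='int64')
    weight = paddle.to_tensor([0.2, 0.5, 0.3], dtype='float64')

    # reduction='none': per-sample losses, already scaled by weight[label_j]
    per_sample = F.cross_entropy(logits, labels, weight=weight, reduction='none')

    # reduction='mean' with hard labels divides by the summed target-class weights,
    # i.e. loss = sum_j loss_j / sum_j weight[label_j]
    manual_mean = per_sample.sum() / paddle.gather(weight, labels).sum()
    api_mean = F.cross_entropy(logits, labels, weight=weight, reduction='mean')
    # manual_mean and api_mean should agree up to floating-point error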
Examples: @@ -2322,10 +2322,10 @@ def cross_entropy(input, N=100 C=200 reduction='mean' - input = paddle.rand([N, C], dtype='float64') + input = paddle.rand([N, C], dtype='float64') label = paddle.randint(0, C, shape=[N], dtype='int64') - weight = paddle.rand([C], dtype='float64') - + weight = paddle.rand([C], dtype='float64') + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction=reduction) dy_ret = cross_entropy_loss( @@ -2349,9 +2349,9 @@ def cross_entropy(input, labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0) labels /= paddle.sum(labels, axis=axis, keepdim=True) paddle_loss_mean = paddle.nn.functional.cross_entropy( - logits, - labels, - soft_label=True, + logits, + labels, + soft_label=True, axis=axis, weight=weight, reduction=reduction) @@ -2744,12 +2744,12 @@ def sigmoid_focal_loss(logit, it is used in one-stage object detection where the foreground-background class imbalance is extremely high. - This operator measures focal loss function as follows: + This operator measures focal loss function as follows: .. math:: Out = -Labels * alpha * {(1 - \sigma(Logit))}^{gamma}\log(\sigma(Logit)) - (1 - Labels) * (1 - alpha) * {\sigma(Logit)}^{gamma}\log(1 - \sigma(Logit)) - We know that :math:`\sigma(Logit) = \frac{1}{1 + \exp(-Logit)}`. + We know that :math:`\sigma(Logit) = \frac{1}{1 + \exp(-Logit)}`. Then, if :attr:`normalizer` is not None, this operator divides the normalizer tensor on the loss `Out`: @@ -2776,7 +2776,7 @@ def sigmoid_focal_loss(logit, For object detection task, it is the number of positive samples. If set to None, the focal loss will not be normalized. Default is None. alpha(int|float, optional): Hyper-parameter to balance the positive and negative example, - it should be between 0 and 1. Default value is set to 0.25. + it should be between 0 and 1. Default value is set to 0.25. gamma(int|float, optional): Hyper-parameter to modulate the easy and hard examples. Default value is set to 2.0. reduction (str, optional): Indicate how to average the loss by batch_size, @@ -3239,7 +3239,7 @@ def triplet_margin_with_distance_loss(input, .. math:: d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p - or user can defined their own distance functions. `margin` is a nonnegative margin representing the minimum difference + or user can defined their own distance functions. `margin` is a nonnegative margin representing the minimum difference between the positive and negative distances that is required for the loss to be 0. If `swap` is true, it will compare distance of (input, negative) with distance of (negative, positive) and change it to the smaller one. For more details see http://www.bmva.org/bmvc/2016/papers/paper119/paper119.pdf. @@ -3255,10 +3255,10 @@ def triplet_margin_with_distance_loss(input, The shape of label is the same as the shape of input. distance_function (callable, optional): Quantifies the distance between two tensors. if not specified, 2 norm functions will be used. - + margin (float, optional):Default: :math:`1`.A nonnegative margin representing the minimum difference between the positive and negative distances required for the loss to be 0. - + swap (bool, optional):The distance swap changes the negative distance to the swap distance (distance between positive samples and negative samples) if swap distance smaller than negative distance. Default: ``False``. 
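To make the user-defined ``distance_function`` hook discussed above concrete, a minimal sketch (assuming the ``paddle.nn.functional.triplet_margin_with_distance_loss`` call order implied by this docstring, i.e. ``input``, ``positive``, ``negative`` followed by the keyword arguments listed; the L1 metric is only an illustration):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    # arbitrary embeddings: 8 samples, 16 dimensions each
    input = paddle.rand([8, 16])
    positive = paddle.rand([8, 16])
    negative = paddle.rand([8, 16])

    # user-defined distance: per-sample L1 (sum of absolute differences)
    def l1_distance(x, y):
        return paddle.abs(x - y).sum(axis=-1)

    loss = F.triplet_margin_with_distance_loss(
        input, positive, negative,
        distance_function=l1_distance,
        margin=0.5,
        swap=True,
        reduction='mean')
    print(loss)  # a scalar Tensor when reduction='mean'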
@@ -3270,7 +3270,7 @@ def triplet_margin_with_distance_loss(input, Default: ``'mean'`` name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Output: Tensor. The tensor variable storing the triplet_margin_with_distance_loss of input and positive and negative. diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 637b192207e..307b0783b15 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -195,7 +195,7 @@ def avg_pool1d(x, Examples: .. code-block:: python - + import paddle import paddle.nn as nn @@ -314,16 +314,16 @@ def avg_pool2d(x, name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - + Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - + Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F - + # avg pool2d x = paddle.uniform([1, 3, 32, 32], paddle.float32) out = F.avg_pool2d(x, @@ -435,13 +435,13 @@ def avg_pool3d(x, name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - + Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. Examples: .. code-block:: python - + import paddle x = paddle.uniform([1, 3, 32, 32, 32], paddle.float32) @@ -696,13 +696,13 @@ def max_unpool1d(x, name=None): r""" This API implements max unpooling 1d opereation. - `max_unpool1d` accepts the output of `max_pool1d` as input, - including the indices of the maximum value and calculate the partial inverse. + `max_unpool1d` accepts the output of `max_pool1d` as input, + including the indices of the maximum value and calculate the partial inverse. All non-maximum values ​​are set to zero. - Input: :math:`(N, C, L_{in})` - Output: :math:`(N, C, L_{out})`, where - + .. math:: L_{out} = (L_{in} - 1) * stride - 2 * padding + kernel\_size @@ -711,11 +711,11 @@ def max_unpool1d(x, Args: x (Tensor): The input tensor of unpooling operator which is a 3-D tensor with - shape [N, C, L]. The format of input tensor is `"NCL"`, + shape [N, C, L]. The format of input tensor is `"NCL"`, where `N` is batch size, `C` is the number of channels, `L` is the length of the feature. The data type is float32 or float64. indices (Tensor): The indices given out by maxpooling1d which is a 3-D tensor with - shape [N, C, L]. The format of input tensor is `"NCL"` , + shape [N, C, L]. The format of input tensor is `"NCL"` , where `N` is batch size, `C` is the number of channels, `L` is the length of the featuree. The data type is float32 or float64. kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, @@ -723,7 +723,7 @@ def max_unpool1d(x, stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, it must contain an integer. padding (int | tuple): Padding that was added to the input. - output_size(list|tuple, optional): The target output size. If output_size is not specified, + output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, kernel_size, stride, padding). data_format (string): The data format of the input and output data. @@ -734,11 +734,11 @@ def max_unpool1d(x, None by default. 
Returns: - Tensor: The output tensor of unpooling result. + Tensor: The output tensor of unpooling result. Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F @@ -815,12 +815,12 @@ def max_unpool2d(x, Args: x (Tensor): The input tensor of unpooling operator which is a 4-D tensor with - shape [N, C, H, W]. The format of input tensor is `"NCHW"`, + shape [N, C, H, W]. The format of input tensor is `"NCHW"`, where `N` is batch size, `C` is the number of channels, `H` is the height of the feature, and `W` is the width of the feature. The data type if float32 or float64. indices (Tensor): The indices given out by maxpooling2d which is a 4-D tensor with - shape [N, C, H, W]. The format of input tensor is `"NCHW"` , + shape [N, C, H, W]. The format of input tensor is `"NCHW"` , where `N` is batch size, `C` is the number of channels, `H` is the height of the feature, and `W` is the width of the feature. The data type if float32 or float64. @@ -829,7 +829,7 @@ def max_unpool2d(x, stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, it must contain an integer. padding (int | tuple): Padding that was added to the input. - output_size(list|tuple, optional): The target output size. If output_size is not specified, + output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, kernel_size, padding). name(str, optional): For detailed information, please refer @@ -849,16 +849,16 @@ def max_unpool2d(x, or as given by :attr:`output_size` in the call operator Returns: - Tensor: The output tensor of unpooling result. + Tensor: The output tensor of unpooling result. Raises: ValueError: If the input is not a 4-D tensor. ValueError: If indeces shape is not equal input shape. - + Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F @@ -868,9 +868,9 @@ def max_unpool2d(x, unpool_out = F.max_unpool2d(pool_out, indices, kernel_size=2, padding=0) # unpool_out shape: [1, 1, 6, 6] - # specify a different output size than input size + # specify a different output size than input size unpool_out = F.max_unpool2d(pool_out, indices, kernel_size=2, padding=0, output_size=[7,7]) - # unpool_out shape: [1, 1, 7, 7] + # unpool_out shape: [1, 1, 7, 7] """ kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size') @@ -929,13 +929,13 @@ def max_unpool3d(x, name=None): r""" This API implements max unpooling 3d opereation. - `max_unpool3d` accepts the output of `max_pool3d` as input, - including the indices of the maximum value and calculate the partial inverse. + `max_unpool3d` accepts the output of `max_pool3d` as input, + including the indices of the maximum value and calculate the partial inverse. All non-maximum values ​​are set to zero. - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where - + .. math:: D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] @@ -950,21 +950,21 @@ def max_unpool3d(x, Args: x (Tensor): The input tensor of unpooling operator which is a 5-D tensor with - shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"`, + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"`, where `N` is batch size, `C` is the number of channels, `D` is - the depth of the feature, `H` is the height of the feature, + the depth of the feature, `H` is the height of the feature, and `W` is the width of the feature. 
The data type is float32 or float64. indices (Tensor): The indices given out by maxpooling3d which is a 5-D tensor with - shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` , + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` , where `N` is batch size, `C` is the number of channels, `D` is - the depth of the feature, `H` is the height of the feature, + the depth of the feature, `H` is the height of the feature, and `W` is the width of the feature. The data type is float32 or float64. kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, it must contain an integer. stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, it must contain an integer. padding (int | tuple): Padding that was added to the input. - output_size(list|tuple, optional): The target output size. If output_size is not specified, + output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, kernel_size, stride, padding). data_format (string): The data format of the input and output data. @@ -975,11 +975,11 @@ def max_unpool3d(x, None by default. Returns: - Tensor: The output tensor of unpooling result. + Tensor: The output tensor of unpooling result. Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F @@ -1215,7 +1215,7 @@ def max_pool3d(x, name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - + Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. @@ -1323,8 +1323,8 @@ def max_pool3d(x, def adaptive_avg_pool1d(x, output_size, name=None): """ - Adaptive average pooling 1d operation on :attr:`x` according to :attr:`output_size`. - + Adaptive average pooling 1d operation on :attr:`x` according to :attr:`output_size`. + Notes: See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` . @@ -1332,10 +1332,10 @@ def adaptive_avg_pool1d(x, output_size, name=None): x (Tensor): The input Tensor of pooling, which is a 3-D tensor with shape :math:`[N, C, L]`, where :math:`N` is batch size, :math:`C` is the number of channels and :math:`L` is the length of the feature. The data type is float32 or float64. output_size (int): The target output size. Its data type must be int. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: The result of 1D adaptive average pooling. Its data type is same as input. - + Examples: .. code-block:: python @@ -1400,7 +1400,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): """ Applies 2D adaptive avg pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. - + For avg adaptive pool2d: .. math:: hstart &= floor(i * H_{in} / H_{out}) @@ -1521,7 +1521,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): """ This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. - + For avg adaptive pool3d: .. 
math:: dstart &= floor(i * D_{in} / D_{out}) diff --git a/python/paddle/nn/functional/sparse_attention.py b/python/paddle/nn/functional/sparse_attention.py index 77327bae520..cf5db83077e 100644 --- a/python/paddle/nn/functional/sparse_attention.py +++ b/python/paddle/nn/functional/sparse_attention.py @@ -30,52 +30,52 @@ def sparse_attention(query, name=None): r""" This operator sparsify the Attention matrix in Transformer module - to achieve the effect of reducing memory consumption and computation. - The sparse layout is expressed in CSR format and contains two parameters, - ``offset`` and ``columns``. The equation is: + to achieve the effect of reducing memory consumption and computation. + The sparse layout is expressed in CSR format and contains two parameters, + ``offset`` and ``columns``. The equation is: .. math:: result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V - where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. - The dimensions of the three parameters are the same. + where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. + The dimensions of the three parameters are the same. ``d`` represents the size of the last dimension of the three parameters. - Warning: + Warning: This API is only used in ``CUDA 11.3`` and above versions. Args: - query(Tensor): The query tensor in the Attention module. - 4-D tensor with shape: - [batch_size, num_heads, seq_len, head_dim]. + query(Tensor): The query tensor in the Attention module. + 4-D tensor with shape: + [batch_size, num_heads, seq_len, head_dim]. The dtype can be float32 and float64. - key(Tensor): The key tensor in the Attention module. - 4-D tensor with shape: - [batch_size, num_heads, seq_len, head_dim]. + key(Tensor): The key tensor in the Attention module. + 4-D tensor with shape: + [batch_size, num_heads, seq_len, head_dim]. The dtype can be float32 and float64. - value(Tensor): The value tensor in the Attention module. - 4-D tensor with shape: - [batch_size, num_heads, seq_len, head_dim]. + value(Tensor): The value tensor in the Attention module. + 4-D tensor with shape: + [batch_size, num_heads, seq_len, head_dim]. The dtype can be float32 and float64. - sparse_csr_offset(Tensor): The sparsity feature in the Attention module - is expressed in the CSR format, and the offset represents + sparse_csr_offset(Tensor): The sparsity feature in the Attention module + is expressed in the CSR format, and the offset represents the number of non-zero elements in each row of the matrix. - 3-D tensor with shape: - [batch_size, num_heads, seq_len + 1]. + 3-D tensor with shape: + [batch_size, num_heads, seq_len + 1]. The dtype should be int32. - sparse_csr_columns(Tensor): The sparsity feature in the Attention module - is expressed in the CSR format, and the columns represent + sparse_csr_columns(Tensor): The sparsity feature in the Attention module + is expressed in the CSR format, and the columns represent the column index values of non-zero elements in the matrix. - 3-D tensor with shape: - [batch_size, num_heads, sparse_nnz]. + 3-D tensor with shape: + [batch_size, num_heads, sparse_nnz]. The dtype should be int32. - key_padding_mask(Tensor, optional):The key padding mask tensor in the Attention module. - 2-D tensor with shape: [batch_size, seq_len]. + key_padding_mask(Tensor, optional):The key padding mask tensor in the Attention module. + 2-D tensor with shape: [batch_size, seq_len]. The dtype can be float32 and float64. 
A value of 0 means that the position is masked. - attn_mask(Tensor, optional):The attention mask tensor in the Attention module. - 2-D tensor with shape: [seq_len, seq_len]. + attn_mask(Tensor, optional):The attention mask tensor in the Attention module. + 2-D tensor with shape: [seq_len, seq_len]. The dtype can be float32 and float64. A value of 0 means that the position is masked. name(str, optional): The default value is None. Normally there is no need for user @@ -84,7 +84,7 @@ def sparse_attention(query, Returns: 4-D tensor with shape: - [batch_size, num_heads, seq_len, head_dim]. + [batch_size, num_heads, seq_len, head_dim]. The dtype can be float32 or float64. Examples: @@ -113,31 +113,31 @@ def sparse_attention(query, print(sparse_csr_columns_data.shape) # (1, 1, 8) paddle.disable_static() - query = paddle.to_tensor(query_data, stop_gradient=False, + query = paddle.to_tensor(query_data, stop_gradient=False, place=paddle.CUDAPlace(0)) - key = paddle.to_tensor(key_data, stop_gradient=False, + key = paddle.to_tensor(key_data, stop_gradient=False, place=paddle.CUDAPlace(0)) - value = paddle.to_tensor(value_data, stop_gradient=False, + value = paddle.to_tensor(value_data, stop_gradient=False, place=paddle.CUDAPlace(0)) - offset = paddle.to_tensor(sparse_csr_offset_data, stop_gradient=False, + offset = paddle.to_tensor(sparse_csr_offset_data, stop_gradient=False, place=paddle.CUDAPlace(0)) - columns = paddle.to_tensor(sparse_csr_columns_data, stop_gradient=False, + columns = paddle.to_tensor(sparse_csr_columns_data, stop_gradient=False, place=paddle.CUDAPlace(0)) - key_padding_mask = paddle.to_tensor(key_padding_mask_data, stop_gradient=False, + key_padding_mask = paddle.to_tensor(key_padding_mask_data, stop_gradient=False, place=paddle.CUDAPlace(0)) - attention_mask = paddle.to_tensor(attention_mask_data, stop_gradient=False, + attention_mask = paddle.to_tensor(attention_mask_data, stop_gradient=False, place=paddle.CUDAPlace(0)) - output_mask = paddle.nn.functional.sparse_attention(query, key, - value, offset, columns, + output_mask = paddle.nn.functional.sparse_attention(query, key, + value, offset, columns, key_padding_mask=key_padding_mask, attn_mask=attention_mask) print(output_mask) # [[[[0. , 1. ], # [1.99830270, 2.99830270], # [0. , 1. ], # [0. , 1. ]]]] - output = paddle.nn.functional.sparse_attention(query, key, + output = paddle.nn.functional.sparse_attention(query, key, value, offset, columns) - print(output) + print(output) # [[[[1.60885942, 2.60885954], # [1.99830270, 2.99830270], # [1.60885942, 2.60885954], diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 7b014204416..847ba013a0f 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -37,7 +37,7 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): Args: theta (Tensor) - A tensor with shape [N, 2, 3] or [N, 3, 4]. It contains a batch of affine transform parameters. The data type can be float32 or float64. - out_shape (Tensor | list | tuple): Type can be a 1-D Tensor, list, or tuple. It is used to represent the shape of the output in an affine transformation, in the format ``[N, C, H, W]`` or ``[N, C, D, H, W]``. + out_shape (Tensor | list | tuple): Type can be a 1-D Tensor, list, or tuple. It is used to represent the shape of the output in an affine transformation, in the format ``[N, C, H, W]`` or ``[N, C, D, H, W]``. 
When the format is ``[N, C, H, W]``, it represents the batch size, number of channels, height and width. When the format is ``[N, C, D, H, W]``, it represents the batch size, number of channels, depth, height and width. The data type must be int32. align_corners(bool, optional): if True, aligns the centers of the 4 (4D) or 8 (5D) corner pixels of the input and output tensors, and preserves the value of the corner pixels. Default: True @@ -60,7 +60,7 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): [1, 2, 3, 3], align_corners=False) print(y_t) - + #[[[[ 1.0333333 0.76666665] # [ 0.76666665 1.0999999 ] # [ 0.5 1.4333333 ]] @@ -126,20 +126,20 @@ def grid_sample(x, """ This operation samples input X by using bilinear interpolation or nearest interpolation based on flow field grid, which is usually - generated by :code:`affine_grid` . When the input X is 4-D Tensor, - the grid of shape [N, H, W, 2] is the concatenation of (x, y) - coordinates with shape [N, H, W] each, where x is indexing the 4th - dimension (in width dimension) of input data x and y is indexing - the 3rd dimension (in height dimension), finally results is the + generated by :code:`affine_grid` . When the input X is 4-D Tensor, + the grid of shape [N, H, W, 2] is the concatenation of (x, y) + coordinates with shape [N, H, W] each, where x is indexing the 4th + dimension (in width dimension) of input data x and y is indexing + the 3rd dimension (in height dimension), finally results is the bilinear interpolation or nearest value of 4 nearest corner - points. The output tensor shape will be [N, C, H, W]. When the input X - is 5-D Tensor, the grid of shape [N, D, H, W, 3] is the concatenation - of (x, y, z) coordinates with shape [N, D, H, W] each, where x is - indexing the 5th dimension (in width dimension) of input data x, y is - indexing the 4th dimension (in height dimension) and z is indexing the - 3rd dimension (in depth dimension) finally results is the bilinear - interpolation or nearest value of 8 nearest cornerpoints. The output - tensor shape will be [N, C, D, H, W]. + points. The output tensor shape will be [N, C, H, W]. When the input X + is 5-D Tensor, the grid of shape [N, D, H, W, 3] is the concatenation + of (x, y, z) coordinates with shape [N, D, H, W] each, where x is + indexing the 5th dimension (in width dimension) of input data x, y is + indexing the 4th dimension (in height dimension) and z is indexing the + 3rd dimension (in depth dimension) finally results is the bilinear + interpolation or nearest value of 8 nearest cornerpoints. The output + tensor shape will be [N, C, D, H, W]. @@ -153,7 +153,7 @@ def grid_sample(x, grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) Step 2: - + Indices input data X with grid (x, y) in each [H, W] area, and bilinear interpolate point value by 4 nearest points or nearest interpolate point value by nearest point. @@ -189,12 +189,12 @@ def grid_sample(x, Args: x(Tensor): The input tensor, which is a 4-d tensor with shape - [N, C, H, W] or a 5-d tensor with shape [N, C, D, H, W], - N is the batch size, C is the channel number, + [N, C, H, W] or a 5-d tensor with shape [N, C, D, H, W], + N is the batch size, C is the channel number, D, H and W is the feature depth, height and width. The data type is float32 or float64. 
- grid(Tensor): Input grid tensor, which is a 4-d tensor with shape [N, grid_H, - grid_W, 2] or a 5-d tensor with shape [N, grid_D, grid_H, + grid(Tensor): Input grid tensor, which is a 4-d tensor with shape [N, grid_H, + grid_W, 2] or a 5-d tensor with shape [N, grid_D, grid_H, grid_W, 3]. The data type is float32 or float64. mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'. Default: 'bilinear'. @@ -209,17 +209,17 @@ def grid_sample(x, None by default. Returns: - Tensor, The shape of output is [N, C, grid_H, grid_W] or [N, C, grid_D, grid_H, grid_W] in which `grid_D` is the depth of grid, + Tensor, The shape of output is [N, C, grid_H, grid_W] or [N, C, grid_D, grid_H, grid_W] in which `grid_D` is the depth of grid, `grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor. Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F - - # x shape=[1, 1, 3, 3] + + # x shape=[1, 1, 3, 3] x = paddle.to_tensor([[[[-0.6, 0.8, -0.5], [-0.5, 0.2, 1.2], [ 1.4, 0.3, -0.2]]]],dtype='float64') @@ -243,7 +243,7 @@ def grid_sample(x, padding_mode='border', align_corners=True) print(y_t) - + # output shape = [1, 1, 3, 4] # [[[[ 0.34 0.016 0.086 -0.448] # [ 0.55 -0.076 0.35 0.59 ] diff --git a/python/paddle/nn/initializer/assign.py b/python/paddle/nn/initializer/assign.py index 746d2b67b2a..2cdd5fdf1aa 100644 --- a/python/paddle/nn/initializer/assign.py +++ b/python/paddle/nn/initializer/assign.py @@ -38,7 +38,7 @@ class Assign(NumpyArrayInitializer): # numpy array data_1 = paddle.ones(shape=[1, 2], dtype='float32') weight_attr_1 = paddle.framework.ParamAttr( - name="linear_weight_1", + name="linear_weight_1", initializer=paddle.nn.initializer.Assign(np.array([2, 2]))) bias_attr_1 = paddle.framework.ParamAttr( name="linear_bias_1", diff --git a/python/paddle/nn/initializer/dirac.py b/python/paddle/nn/initializer/dirac.py index 3d6cfc009ca..26b2e450371 100644 --- a/python/paddle/nn/initializer/dirac.py +++ b/python/paddle/nn/initializer/dirac.py @@ -27,7 +27,7 @@ __all__ = [] class Dirac(Initializer): r"""Initialize the 3D/4D/5D Tensor with Dirac delta function. - + It can reserve the feature of convolution layer input, which means that as many channels are reserved as possible. @@ -37,11 +37,11 @@ class Dirac(Initializer): .. math:: X[d, d, shape[2]//2, shape[3]//2, ...]=1, \ d=0,1...N - + where, ``N`` is the minimum value of ``in_channels`` and ``out_channels`` Args: - groups(int, optional): 0-dimension of the Tensor will be divided by groups, + groups(int, optional): 0-dimension of the Tensor will be divided by groups, each group has the same value. Default: 1. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -53,9 +53,9 @@ class Dirac(Initializer): .. code-block:: python import paddle - + #1. 
For kernel_size is uneven number: - + attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Dirac()) conv = paddle.nn.Conv1D(3, 2, 3, weight_attr=attr) conv.weight @@ -63,14 +63,14 @@ class Dirac(Initializer): # [[[0., 1., 0.], # [0., 0., 0.], # [0., 0., 0.]], - # + # # [[0., 0., 0.], # [0., 1., 0.], # [0., 0., 0.]]]) input = paddle.rand([8, 3, 10]) output = conv(input) - output == input[:, 0:2, 1:9] + output == input[:, 0:2, 1:9] # output.shape is [8, 2, 8], It means output is almost the same with input, 2 channels are reserved @@ -82,7 +82,7 @@ class Dirac(Initializer): # [[[0., 0., 1., 0.], # [0., 0., 0., 0.], # [0., 0., 0., 0.]], - # + # # [[0., 0., 0., 0.], # [0., 0., 1., 0.], # [0., 0., 0., 0.]]]) diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py index 38394eb5b93..ab4c90343fe 100644 --- a/python/paddle/nn/initializer/kaiming.py +++ b/python/paddle/nn/initializer/kaiming.py @@ -74,7 +74,7 @@ class KaimingUniform(MSRAInitializer): by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a robust initialization method that particularly considers the rectifier nonlinearities. - + In case of Uniform distribution, the range is [-x, x], where .. math:: diff --git a/python/paddle/nn/initializer/normal.py b/python/paddle/nn/initializer/normal.py index 730c55ea6f1..3f7d7152710 100644 --- a/python/paddle/nn/initializer/normal.py +++ b/python/paddle/nn/initializer/normal.py @@ -46,7 +46,7 @@ class Normal(NormalInitializer): # linear.weight: [[ 2.1973135 -2.2697184] # [-1.9104223 -1.0541488]] # linear.bias: [ 0.7885926 -0.74719954] - + res = linear(data) # res: [[[ 1.0754838 -4.071067 ]] # [[ 1.0754838 -4.071067 ]] diff --git a/python/paddle/nn/initializer/orthogonal.py b/python/paddle/nn/initializer/orthogonal.py index 63e0152e22b..b8fe912a839 100644 --- a/python/paddle/nn/initializer/orthogonal.py +++ b/python/paddle/nn/initializer/orthogonal.py @@ -26,9 +26,9 @@ __all__ = [] class Orthogonal(Initializer): """The orthogonal initializer. The initialized tensor is (semi) orthogonal. - It's only applied to Tensor whose dimension is greater than or equal to 2. - - For the Tensor whose dimension is greater than 2, the 0 dimension is seen as ``rows`` , + It's only applied to Tensor whose dimension is greater than or equal to 2. + + For the Tensor whose dimension is greater than 2, the 0 dimension is seen as ``rows`` , and the >=1 dimension are flattened as ``cols`` . Which can be describe as: @@ -37,7 +37,7 @@ class Orthogonal(Initializer): rows = shape[0] cols = shape[1]·shape[2]···shape[N] - + if rows < cols: The rows are orthogonal vectors elif rows > cols: diff --git a/python/paddle/nn/initializer/uniform.py b/python/paddle/nn/initializer/uniform.py index 707d3d03ecc..2c7a57b1195 100644 --- a/python/paddle/nn/initializer/uniform.py +++ b/python/paddle/nn/initializer/uniform.py @@ -44,7 +44,7 @@ class Uniform(UniformInitializer): # linear.weight: [[-0.46245047 0.05260676] # [ 0.38054508 0.29169726]] # linear.bias: [-0.2734719 0.23939109] - + res = linear(data) # res: [[[-0.3553773 0.5836951]] # [[-0.3553773 0.5836951]] diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index dc80743de51..a8622bc5022 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -28,7 +28,7 @@ class CELU(Layer): CELU Activation. .. math:: - + CELU(x) = max(0, x) + min(0, \alpha * (e^{x/\alpha}-1)) Parameters: @@ -44,7 +44,7 @@ class CELU(Layer): .. 
code-block:: python import paddle - + x = paddle.to_tensor([[-1. ,6.], [1., 15.6]]) m = paddle.nn.CELU(0.2) out = m(x) @@ -140,7 +140,7 @@ class GELU(Layer): Examples: .. code-block:: python - + import paddle x = paddle.to_tensor([[-1, 0.5],[1, 1.5]]) @@ -231,7 +231,7 @@ class Hardswish(Layer): \frac{x(x+3)}{6} &, & \text{otherwise} \end{array} \right. - + Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -982,7 +982,7 @@ class Mish(Layer): \end{cases} Mish(x) = x * \tanh(softplus(x)) - + Parameters: name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -990,7 +990,7 @@ class Mish(Layer): Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index ee5641f5d12..514be1ba3f9 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -325,8 +325,8 @@ class Upsample(Layer): x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. size (list|tuple|Tensor|None): Output shape of image resize - layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) - when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. + layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) + when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1]. If a Tensor , its dimensions size should be a 1. scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At @@ -373,7 +373,7 @@ class Upsample(Layer): Examples: .. code-block:: python - + import paddle import paddle.nn as nn import numpy as np @@ -1016,8 +1016,8 @@ class Pad2D(Layer): Parameters: padding (Tensor|list[int]|int): The padding size with data type int. If is int, use the - same padding in all dimensions. Else [len(padding)/2] dimensions of input will be padded. - The pad has the form (pad_left, pad_right, pad_top, pad_bottom). + same padding in all dimensions. Else [len(padding)/2] dimensions of input will be padded. + The pad has the form (pad_left, pad_right, pad_top, pad_bottom). mode (str, optional): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. Default is 'constant'. - 'constant' mode, uses a constant value to pad the input tensor. @@ -1279,7 +1279,7 @@ class CosineSimilarity(Layer): class Embedding(Layer): r""" - + Embedding Layer, used to construct a callable object of the ``Embedding`` class. For specific usage, refer to code examples. It implements the function of the Embedding Layer. This layer is used to lookup embeddings vector of ids provided by :attr:`x` . @@ -1449,7 +1449,7 @@ class Unfold(Layer): See ``paddle.nn.functional.unfold`` for more details. - + Parameters: kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] or an integer k treated as [k, k]. @@ -1515,15 +1515,15 @@ class Fold(Layer): r""" Combines an array of sliding local blocks into a large containing - tensor. also known as col2im when operated on batched 2D image tensor. Fold calculates each - combined value in the resulting large tensor by summing all values from all containing blocks. + tensor. 
also known as col2im when operated on batched 2D image tensor. Fold calculates each + combined value in the resulting large tensor by summing all values from all containing blocks. For each input :math:`x` with shape [N, C_in , L], the output shape [N, C_out, H_out, W_out] can be calculated as following. .. math:: - + H_{out} &= output\_size[0] \\ W_{out} &= output\_size[1] \\ C_{out} &= \frac{C_{in}}{kernel\_sizes[0]\times kernel\_sizes[1]} \\ diff --git a/python/paddle/nn/layer/container.py b/python/paddle/nn/layer/container.py index 0b1bf6bc565..fe2753a6141 100644 --- a/python/paddle/nn/layer/container.py +++ b/python/paddle/nn/layer/container.py @@ -22,7 +22,7 @@ __all__ = [] class LayerDict(Layer): """ LayerDict holds sublayers in the ordered dictionary, and sublayers it contains are properly registered. - Holded sublayers can be accessed like a regular ordered python dictionary. + Holded sublayers can be accessed like a regular ordered python dictionary. Parameters: sublayers (LayerDict|OrderedDict|list[(key,Layer)...], optional): iterable of key/value pairs, the type of value is 'paddle.nn.Layer' . @@ -155,7 +155,7 @@ class LayerDict(Layer): Parameters: None. - + Examples: .. code-block:: python @@ -171,7 +171,7 @@ class LayerDict(Layer): layer_dict = paddle.nn.LayerDict(sublayers=sublayers) for k in layer_dict.keys(): print(k) - + #conv1d #conv2d #conv3d @@ -185,7 +185,7 @@ class LayerDict(Layer): Parameters: None. - + Examples: .. code-block:: python @@ -215,7 +215,7 @@ class LayerDict(Layer): Parameters: None. - + Examples: .. code-block:: python @@ -245,7 +245,7 @@ class LayerDict(Layer): Parameters: sublayers (LayerDict|OrderedDict|list[(key,Layer)...]): iterable of key/value pairs, the type of value is 'paddle.nn.Layer' . - + Examples: .. code-block:: python @@ -265,7 +265,7 @@ class LayerDict(Layer): layer_dict = paddle.nn.LayerDict(sublayers=sublayers) layer_dict.update(new_sublayers) - + for k, v in layer_dict.items(): print(k, ":", v) #conv1d : Conv1D(3, 2, kernel_size=[3], data_format=NCL) diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 4ef987eccf2..34a3c2cc4e0 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -186,7 +186,7 @@ class Conv1D(_ConvNd): Output are in NCL format or NLC format, where N is batch size, C is the number of the feature map, L is the length of the feature map. Filter's shape is [MCK] , where M is the number of output feature map, - C is the number of input feature map, K is the size of the kernel. + C is the number of input feature map, K is the size of the kernel. If the groups is greater than 1, C will equal the number of input feature map divided by the groups. If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is @@ -273,7 +273,7 @@ class Conv1D(_ConvNd): - weight: 3-D tensor with shape: (out_channels, in_channels, kernel_size) - bias: 1-D tensor with shape: (out_channels) - output: 3-D tensor with same shape as input x. - + Raises: None @@ -461,7 +461,7 @@ class Conv1DTranspose(_ConvNd): import paddle from paddle.nn import Conv1DTranspose import numpy as np - + # shape: (1, 2, 4) x=np.array([[[4, 0, 9, 7], [8, 0, 9, 2]]]).astype(np.float32) @@ -473,7 +473,7 @@ class Conv1DTranspose(_ConvNd): conv.weight.set_value(y) y_t = conv(x_t) print(y_t) - + # [[[60. 16. 99. 75. 4.]]] """ @@ -549,7 +549,7 @@ class Conv2D(_ConvNd): * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M]. 
* :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - + Parameters: in_channels(int): The number of input channels in the input image. out_channels(int): The number of output channels produced by the convolution. @@ -559,7 +559,7 @@ class Conv2D(_ConvNd): stride_H = stride_W = stride. The default value is 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). @@ -616,11 +616,11 @@ class Conv2D(_ConvNd): import paddle import paddle.nn as nn - + paddle.disable_static() - + x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) - + conv = nn.Conv2D(4, 6, (3, 3)) y_var = conv(x_var) y_np = y_var.numpy() @@ -707,7 +707,7 @@ class Conv2DTranspose(_ConvNd): * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - + Parameters: in_channels(int): The number of channels in the input image. out_channels(int): The number of channels produced by the convolution. @@ -719,7 +719,7 @@ class Conv2DTranspose(_ConvNd): stride_H = stride_W = stride. Default: 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` on both sides + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` on both sides 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). @@ -781,7 +781,7 @@ class Conv2DTranspose(_ConvNd): import paddle import paddle.nn as nn - + paddle.disable_static() x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) 
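A short sketch of how ``output_size`` resolves the output-shape ambiguity of a strided transposed convolution (assuming ``Conv2DTranspose.forward`` accepts an ``output_size`` argument, mirroring the functional API; the concrete sizes follow :math:`H^\prime_{out} = (H_{in} - 1) * stride - 2 * padding + dilation * (kernel\_size - 1) + 1`):

.. code-block:: python

    import paddle
    import paddle.nn as nn

    x = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
    conv = nn.Conv2DTranspose(4, 6, kernel_size=3, stride=2, padding=1)

    # default output size per spatial dim: (8 - 1) * 2 - 2 * 1 + (3 - 1) + 1 = 15
    y = conv(x)                              # shape [2, 6, 15, 15]

    # any size in [15, 15 + stride) is valid; pin it explicitly via output_size
    y_fixed = conv(x, output_size=[16, 16])  # shape [2, 6, 16, 16]
    print(y.shape, y_fixed.shape)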
@@ -843,7 +843,7 @@ class Conv3D(_ConvNd): **Convlution3d Layer** The convolution3d layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are multidimensional tensors with a shape of + Output(Output) are multidimensional tensors with a shape of :math:`[N, C, D, H, W]` . Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, and W is the width of the feature. Convlution3D is similar with Convlution2D @@ -874,7 +874,7 @@ class Conv3D(_ConvNd): stride_D = stride_H = stride_W = stride. The default value is 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). @@ -937,11 +937,11 @@ class Conv3D(_ConvNd): import paddle import paddle.nn as nn - + paddle.disable_static() x_var = paddle.uniform((2, 4, 8, 8, 8), dtype='float32', min=-1., max=1.) - + conv = nn.Conv3D(4, 6, (3, 3, 3)) y_var = conv(x_var) y_np = y_var.numpy() @@ -1012,7 +1012,7 @@ class Conv3DTranspose(_ConvNd): the output of the convolution, and the corresponding activation function is applied to the final result. For each input :math:`X`, the equation is: - + .. math:: Out = \sigma (W \ast X + b) @@ -1029,14 +1029,14 @@ class Conv3DTranspose(_ConvNd): **Note**: The conv3d_transpose can be seen as the backward of the conv3d. For conv3d, - when stride > 1, conv3d maps multiple input shape to the same output shape, + when stride > 1, conv3d maps multiple input shape to the same output shape, so for conv3d_transpose, when stride > 1, input shape maps multiple output shape. 
If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \ - H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output - size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, - the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` - and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must - between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, + H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output + size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, + the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` + and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must + between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, conv3d_transpose can compute the kernel size automatically. Parameters: @@ -1045,13 +1045,13 @@ class Conv3DTranspose(_ConvNd): kernel_size(int|list|tuple): The kernel size. If kernel_size is a list/tuple, it must contain three integers, (kernel_size_D, kernel_size_H, kernel_size_W). Otherwise, the kernel will be a square. - stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. - If stride is a list/tuple, it must contain three integers, (stride_depth, stride_height, - stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. + stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. + If stride is a list/tuple, it must contain three integers, (stride_depth, stride_height, + stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. The default value is 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). @@ -1100,11 +1100,11 @@ class Conv3DTranspose(_ConvNd): .. math:: D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (kernel\_size[0] - 1) + 1 - + H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel\_size[1] - 1) + 1 - + W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel\_size[2] - 1) + 1 - + Raises: ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. @@ -1114,11 +1114,11 @@ class Conv3DTranspose(_ConvNd): import paddle import paddle.nn as nn - + paddle.disable_static() x_var = paddle.uniform((2, 4, 8, 8, 8), dtype='float32', min=-1., max=1.) 
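            # (Illustrative note, not in the original example.) With the default
            # stride=1, padding=0 and dilation=1, the size formula above gives
            # D'_out = (8 - 1) * 1 - 2 * 0 + 1 * (3 - 1) + 1 = 10 for each
            # spatial dimension, so the layer created below is expected to map
            # the (2, 4, 8, 8, 8) input to a (2, 6, 10, 10, 10) output.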
- + conv = nn.Conv3DTranspose(4, 6, (3, 3, 3)) y_var = conv(x_var) y_np = y_var.numpy() diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 1ff37afa141..5ce3a9ea5c9 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -131,23 +131,23 @@ class BCEWithLogitsLoss(Layer): class CrossEntropyLoss(Layer): r""" - By default, this operator implements the cross entropy loss function with softmax. This function - combines the calculation of the softmax operation and the cross entropy loss function + By default, this operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function to provide a more numerically stable computing. This operator will calculate the cross entropy loss function without softmax when use_softmax=False. - By default, this operator will calculate the mean of the result, and you can also affect - the default behavior by using the reduction parameter. Please refer to the part of + By default, this operator will calculate the mean of the result, and you can also affect + the default behavior by using the reduction parameter. Please refer to the part of parameters for details. This operator can be used to calculate the softmax cross entropy loss with soft and hard labels. - Where, the hard labels mean the actual label value, 0, 1, 2, etc. And the soft labels + Where, the hard labels mean the actual label value, 0, 1, 2, etc. And the soft labels mean the probability of the actual label, 0.6, 0.8, 0.2, etc. The calculation of this operator includes the following two steps. - - **I.softmax cross entropy** + - **I.softmax cross entropy** 1. Hard label (each sample can only be assigned into one category) @@ -184,7 +184,7 @@ class CrossEntropyLoss(Layer): - - **II.Weight and reduction processing** + - **II.Weight and reduction processing** 1. Weight @@ -196,7 +196,7 @@ class CrossEntropyLoss(Layer): 1.1. Hard labels (soft_label = False) .. math:: - \\loss_j=loss_j*weight[label_j] + \\loss_j=loss_j*weight[label_j] 1.2. Soft labels (soft_label = True) @@ -206,21 +206,21 @@ class CrossEntropyLoss(Layer): 2. reduction - 2.1 if the ``reduction`` parameter is ``none`` + 2.1 if the ``reduction`` parameter is ``none`` Return the previous result directly - 2.2 if the ``reduction`` parameter is ``sum`` + 2.2 if the ``reduction`` parameter is ``sum`` Return the sum of the previous results .. math:: \\loss=\sum_{j}loss_j - 2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to - the ``weight`` parameter as follows. + 2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to + the ``weight`` parameter as follows. - 2.3.1. If the ``weight`` parameter is ``None`` + 2.3.1. If the ``weight`` parameter is ``None`` Return the average value of the previous results @@ -234,27 +234,27 @@ class CrossEntropyLoss(Layer): 1. Hard labels (soft_label = False) .. math:: - \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] + \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] 2. Soft labels (soft_label = True) .. math:: \\loss=\sum_{j}loss_j/\sum_{j}\left(\sum_{i}weight[label_i]\right) - - + + Parameters: - **weight** (Tensor, optional) - a manual rescaling weight given to each class. - If given, has to be a Tensor of size C and the data type is float32, float64. + a manual rescaling weight given to each class. + If given, has to be a Tensor of size C and the data type is float32, float64. Default is ``'None'`` . 
- **ignore_index** (int64, optional) Specifies a target value that is ignored - and does not contribute to the loss. A negative value means that no label - value needs to be ignored. Only valid when soft_label = False. + and does not contribute to the loss. A negative value means that no label + value needs to be ignored. Only valid when soft_label = False. Default is ``-100`` . - **reduction** (str, optional) @@ -268,15 +268,15 @@ class CrossEntropyLoss(Layer): - **soft_label** (bool, optional) - Indicate whether label is soft. + Indicate whether label is soft. If soft_label=False, the label is hard. If soft_label=True, the label is soft. Default is ``False``. - **axis** (int, optional) - The index of dimension to perform softmax calculations. - It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number - of dimensions of input :attr:`input`. + The index of dimension to perform softmax calculations. + It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number + of dimensions of input :attr:`input`. Default is ``-1`` . - **use_softmax** (bool, optional) @@ -295,11 +295,11 @@ class CrossEntropyLoss(Layer): - **input** (Tensor) Input tensor, the data type is float32, float64. Shape is - :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . + :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . - Note: + Note: - 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the + 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results. 2. when use_softmax=False, it expects the output of softmax operator. @@ -307,11 +307,11 @@ class CrossEntropyLoss(Layer): - **label** (Tensor) - 1. If soft_label=False, the shape is + 1. If soft_label=False, the shape is :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1. the data type is int32, int64, float32, float64, where each value is [0, C-1]. - 2. If soft_label=True, the shape and data type should be same with ``input`` , + 2. If soft_label=True, the shape and data type should be same with ``input`` , and the sum of the labels for each sample should be 1. - **output** (Tensor) @@ -324,24 +324,24 @@ class CrossEntropyLoss(Layer): If :attr:`reduction` is ``'none'``: - 1. If soft_label = False, the dimension of return value is the same with ``label`` . + 1. If soft_label = False, the dimension of return value is the same with ``label`` . - 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . + 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . Examples: .. 
code-block:: python - + # hard labels import paddle paddle.seed(99999) N=100 C=200 reduction='mean' - input = paddle.rand([N, C], dtype='float64') + input = paddle.rand([N, C], dtype='float64') label = paddle.randint(0, C, shape=[N], dtype='int64') - weight = paddle.rand([C], dtype='float64') - + weight = paddle.rand([C], dtype='float64') + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction=reduction) dy_ret = cross_entropy_loss( @@ -365,9 +365,9 @@ class CrossEntropyLoss(Layer): labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0) labels /= paddle.sum(labels, axis=axis, keepdim=True) paddle_loss_mean = paddle.nn.functional.cross_entropy( - logits, - labels, - soft_label=True, + logits, + labels, + soft_label=True, axis=axis, weight=weight, reduction=reduction) @@ -409,7 +409,7 @@ class CrossEntropyLoss(Layer): class HSigmoidLoss(Layer): """ Hierarchical Sigmoid Layer. - + The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity and speed up the model training, especially the training of language model. Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. @@ -444,7 +444,7 @@ class HSigmoidLoss(Layer): is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr, hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default is None. - is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and + is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and `path_code` should be passed to its forward method, otherwise `path_table` and `path_code` should not be passed to its forward method. Default is False. is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, @@ -654,7 +654,7 @@ class L1Loss(Layer): Examples: .. code-block:: python - + import paddle import numpy as np @@ -999,7 +999,7 @@ class MarginRankingLoss(Layer): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Shape: - + input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64. other: N-D Tensor, `other` have the same shape and dtype as `input`. @@ -1176,11 +1176,11 @@ class SmoothL1Loss(Layer): Call Parameters: - input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), - where C is number of classes, and if shape is more than 2D, + input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), + where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1. - label (Tensor): Label tensor, the data type is float32 or float64. + label (Tensor): Label tensor, the data type is float32 or float64. The shape of label is the same as the shape of input. Returns: @@ -1491,22 +1491,22 @@ class TripletMarginWithDistanceLoss(Layer): L(input, pos, neg) = \max \{d(input_i, pos_i) - d(input_i, neg_i) + {\rm margin}, 0\} where the default `distance_function` - + .. math:: d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_2 - - or user can define their own distance function. `margin` is a nonnegative margin representing the minimum difference + + or user can define their own distance function. 
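The following is a minimal sketch (an illustration, not part of the patch) of supplying a user-defined distance function; it assumes the callable receives the two tensors and returns a per-sample distance, mirroring the default 2-norm behaviour described above.

    .. code-block:: python

        import paddle
        import paddle.nn as nn

        # hypothetical custom distance: per-sample L1 (Manhattan) distance
        def l1_distance(x, y):
            return paddle.sum(paddle.abs(x - y), axis=-1)

        input = paddle.rand([4, 16], dtype='float32')
        positive = paddle.rand([4, 16], dtype='float32')
        negative = paddle.rand([4, 16], dtype='float32')

        loss_fn = nn.TripletMarginWithDistanceLoss(
            distance_function=l1_distance, margin=1.0, reduction='mean')
        print(loss_fn(input, positive, negative))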
`margin` is a nonnegative margin representing the minimum difference between the positive and negative distances that is required for the loss to be 0. If `swap` is true, it will compare distance of (input, negative) with distance of (negative, positive) and change it to the smaller one. For more details see http://www.bmva.org/bmvc/2016/papers/paper119/paper119.pdf. Parameters: distance_function (Callable, Optional): Quantifies the distance between two tensors. if not specified, 2 norm functions will be used. - + margin (float, Optional):Default: :math:`1`.A nonnegative margin representing the minimum difference between the positive and negative distances required for the loss to be 0. Larger margins penalize cases where the negative examples are not distant enough from the anchors, relative to the positives. - + swap (bool, Optional):The distance swap changes the negative distance to the swap distance (distance between positive samples and negative samples) if swap distance smaller than negative distance. Default: ``False``. @@ -1518,7 +1518,7 @@ class TripletMarginWithDistanceLoss(Layer): Default: ``'mean'`` name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shapes: input (Tensor):Input tensor, the data type is float32 or float64. the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64. @@ -1528,7 +1528,7 @@ class TripletMarginWithDistanceLoss(Layer): negative (Tensor):Negative tensor, the data type is float32 or float64. The shape of label is the same as the shape of input. - + output(Tensor): The tensor variable storing the triplet_margin_with_distance_loss of input and positive and negative. Return: @@ -1650,7 +1650,7 @@ class TripletMarginLoss(Layer): loss = triplet_margin_loss(input, positive, negative) print(loss) # Tensor([0. , 0.57496738, 0. ]) - + triplet_margin_loss = paddle.nn.TripletMarginLoss(margin=1.0, swap=True, reduction='mean', ) loss = triplet_margin_loss(input, positive, negative,) print(loss) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 93d6b21c13f..b051a64bfc3 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -56,7 +56,7 @@ __all__ = [] class _InstanceNormBase(Layer): """ - This class is based class for InstanceNorm1D, 2d, 3d. + This class is based class for InstanceNorm1D, 2d, 3d. See InstaceNorm1D, InstanceNorm2D or InstanceNorm3D for more details. """ @@ -117,7 +117,7 @@ class InstanceNorm1D(_InstanceNormBase): :math:`input` is the input features over a mini-batch. .. math:: - + \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\ \ mean\ of\ one\ feature\ map\ in\ mini-batch \\ \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \ @@ -137,12 +137,12 @@ class InstanceNorm1D(_InstanceNormBase): weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. - If the Initializer of the weight_attr is not set, the parameter is initialized + If the Initializer of the weight_attr is not set, the parameter is initialized one. If it is set to False, will not create weight_attr. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. 
If it is set to None or one attribute of ParamAttr, instance_norm - will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. - If the Initializer of the bias_attr is not set, the bias is initialized zero. + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. If it is set to False, will not create bias_attr. Default: None. data_format(str, optional): Specify the input data format, may be "NC", "NCL". Defalut "NCL". name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. @@ -165,7 +165,7 @@ class InstanceNorm1D(_InstanceNormBase): np.random.seed(123) x_data = np.random.random(size=(2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.to_tensor(x_data) instance_norm = paddle.nn.InstanceNorm1D(2) instance_norm_out = instance_norm(x) @@ -189,7 +189,7 @@ class InstanceNorm2D(_InstanceNormBase): :math:`input` is the input features over a mini-batch. .. math:: - + \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\ \ mean\ of\ one\ feature\ map\ in\ mini-batch \\ \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \ @@ -209,12 +209,12 @@ class InstanceNorm2D(_InstanceNormBase): weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. - If the Initializer of the weight_attr is not set, the parameter is initialized + If the Initializer of the weight_attr is not set, the parameter is initialized one. If it is set to False, will not create weight_attr. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm - will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. - If the Initializer of the bias_attr is not set, the bias is initialized zero. + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. If it is set to False, will not create bias_attr. Default: None. data_format(str, optional): Specify the input data format, could be "NCHW". Default: NCHW. name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. @@ -236,7 +236,7 @@ class InstanceNorm2D(_InstanceNormBase): np.random.seed(123) x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.to_tensor(x_data) instance_norm = paddle.nn.InstanceNorm2D(2) instance_norm_out = instance_norm(x) @@ -259,7 +259,7 @@ class InstanceNorm3D(_InstanceNormBase): :math:`input` is the input features over a mini-batch. .. math:: - + \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\ \ mean\ of\ one\ feature\ map\ in\ mini-batch \\ \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \ @@ -279,12 +279,12 @@ class InstanceNorm3D(_InstanceNormBase): weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. 
- If the Initializer of the weight_attr is not set, the parameter is initialized + If the Initializer of the weight_attr is not set, the parameter is initialized one. If it is set to False, will not create weight_attr. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm - will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. - If the Initializer of the bias_attr is not set, the bias is initialized zero. + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. If it is set to False, will not create bias_attr. Default: None. data_format(str, optional): Specify the input data format, could be "NCDHW". Default: NCDHW. name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. @@ -306,7 +306,7 @@ class InstanceNorm3D(_InstanceNormBase): np.random.seed(123) x_data = np.random.random(size=(2, 2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.to_tensor(x_data) instance_norm = paddle.nn.InstanceNorm3D(2) instance_norm_out = instance_norm(x) @@ -356,7 +356,7 @@ class GroupNorm(Layer): paddle.disable_static() np.random.seed(123) x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.to_tensor(x_data) group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6) group_norm_out = group_norm(x) @@ -520,7 +520,7 @@ class LayerNorm(Layer): np.random.seed(123) x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.to_tensor(x_data) layer_norm = paddle.nn.LayerNorm(x_data.shape[1:]) layer_norm_out = layer_norm(x) @@ -754,7 +754,7 @@ class BatchNorm1D(_BatchNormBase): Returns: None. - + Examples: .. code-block:: python @@ -764,7 +764,7 @@ class BatchNorm1D(_BatchNormBase): np.random.seed(123) x_data = np.random.random(size=(2, 1, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.to_tensor(x_data) batch_norm = paddle.nn.BatchNorm1D(1) batch_norm_out = batch_norm(x) @@ -811,7 +811,7 @@ class BatchNorm2D(_BatchNormBase): \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &// \ mini-batch\ mean \\ - \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ When use_global_stats = True, the :math:`\mu_{\beta}` @@ -866,7 +866,7 @@ class BatchNorm2D(_BatchNormBase): np.random.seed(123) x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.to_tensor(x_data) batch_norm = paddle.nn.BatchNorm2D(1) batch_norm_out = batch_norm(x) @@ -954,7 +954,7 @@ class BatchNorm3D(_BatchNormBase): np.random.seed(123) x_data = np.random.random(size=(2, 1, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.to_tensor(x_data) batch_norm = paddle.nn.BatchNorm3D(1) batch_norm_out = batch_norm(x) @@ -992,8 +992,8 @@ class BatchNorm3D(_BatchNormBase): class SyncBatchNorm(_BatchNormBase): r""" This interface is used to construct a callable object of the ``SyncBatchNorm`` class. 
- It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can - be used as a normalizer function for other operations, such as conv2d and fully connected + It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can + be used as a normalizer function for other operations, such as conv2d and fully connected operations. The data is normalized by the mean and variance of the channel based on whole mini-batch , which including data in all gpus. @@ -1001,7 +1001,7 @@ class SyncBatchNorm(_BatchNormBase): Internal Covariate Shift `_ for more details. - When model in training mode, the :math:`\\mu_{\\beta}` + When model in training mode, the :math:`\\mu_{\\beta}` and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus. Calculated as follows: @@ -1016,7 +1016,7 @@ class SyncBatchNorm(_BatchNormBase): - :math:`m` : the size of the whole mini-batch data When model in evaluation mode, the :math:`\\mu_{\\beta}` - and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance, + and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance, which usually got from the pre-trained model). Global statistics calculated as follows: .. math:: @@ -1024,7 +1024,7 @@ class SyncBatchNorm(_BatchNormBase): moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The formula of normalization is as follows: - + .. math:: \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ @@ -1033,12 +1033,12 @@ class SyncBatchNorm(_BatchNormBase): - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero - :math:`\gamma` : trainable scale parameter vector - - :math:`\beta` : trainable shift parameter vector + - :math:`\beta` : trainable shift parameter vector Note: - If you want to use container to pack your model and has ``SyncBatchNorm`` in the - evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of - ``list`` to pack the model. + If you want to use container to pack your model and has ``SyncBatchNorm`` in the + evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of + ``list`` to pack the model. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -1047,12 +1047,12 @@ class SyncBatchNorm(_BatchNormBase): weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of this layer. If it is set to None or one attribute of ParamAttr, this layerr will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with ones. If it is set to False, + is not set, the parameter is initialized with ones. If it is set to False, this layer will not have trainable scale parameter. Default: None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer. If it is set to None or one attribute of ParamAttr, this layer will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. If it is set to False, this layer will not + is not set, the bias is initialized zero. If it is set to False, this layer will not have trainable bias parameter. Default: None. 
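As a usage note, the sketch below assumes the ``convert_sync_batchnorm`` helper is available in the installed Paddle release: an existing model built with ordinary batch normalization layers can then be converted before launching multi-GPU training.

    .. code-block:: python

        import paddle.nn as nn

        model = nn.Sequential(
            nn.Conv2D(3, 8, 3, padding=1),
            nn.BatchNorm2D(8),
            nn.ReLU())

        # replaces every BatchNorm-style layer with SyncBatchNorm so that
        # mean/variance are computed over the whole cross-GPU mini-batch
        sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        print(sync_model)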
Shapes: diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index ccba13316a1..927cbe20ced 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -61,7 +61,7 @@ class AvgPool1D(Layer): Returns: A callable object of AvgPool1D. - + Examples: .. code-block:: python @@ -1102,25 +1102,25 @@ class MaxUnPool1D(Layer): r""" This API implements max unpooling 1d opereation. - `max_unpool1d` accepts the output of `max_pool1d` as input, - including the indices of the maximum value and calculate the partial inverse. + `max_unpool1d` accepts the output of `max_pool1d` as input, + including the indices of the maximum value and calculate the partial inverse. All non-maximum values ​​are set to zero. - Input: :math:`(N, C, L_{in})` - Output: :math:`(N, C, L_{out})`, where - + .. math:: L_{out} = (L_{in} - 1) * stride - 2 * padding + kernel\_size or as given by :attr:`output_size` in the call operator. - + Parameters: kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, it must contain an integer. stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, it must contain an integer. padding (int | tuple): Padding that was added to the input. - output_size(list|tuple, optional): The target output size. If output_size is not specified, + output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, kernel_size, stride, padding). data_format (string): The data format of the input and output data. @@ -1136,7 +1136,7 @@ class MaxUnPool1D(Layer): Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F import numpy as np @@ -1186,7 +1186,7 @@ class MaxUnPool2D(Layer): 'max_unpool2d' accepts the output of 'max_unpool2d' as input Including the indices of the maximum value and calculating the partial inverse All non-maximum values ​​are set to zero. - + Parameters: kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, @@ -1195,7 +1195,7 @@ class MaxUnPool2D(Layer): it must contain an integer. kernel_size (int|tuple): Size of the max unpooling window. padding (int | tuple): Padding that was added to the input. - output_size(list|tuple, optional): The target output size. If output_size is not specified, + output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, kernel_size, padding). name(str, optional): For detailed information, please refer @@ -1217,11 +1217,11 @@ class MaxUnPool2D(Layer): Returns: A callable object of MaxUnPool2D. - + Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F @@ -1267,13 +1267,13 @@ class MaxUnPool3D(Layer): r""" This API implements max unpooling 3d opereation. - `max_unpool3d` accepts the output of `max_pool3d` as input, - including the indices of the maximum value and calculate the partial inverse. + `max_unpool3d` accepts the output of `max_pool3d` as input, + including the indices of the maximum value and calculate the partial inverse. All non-maximum values ​​are set to zero. - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where - + .. 
math:: D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] @@ -1285,14 +1285,14 @@ class MaxUnPool3D(Layer): or as given by :attr:`output_size` in the call operator - + Parameters: kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, it must contain an integer. stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, it must contain an integer. padding (int | tuple): Padding that was added to the input. - output_size(list|tuple, optional): The target output size. If output_size is not specified, + output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, kernel_size, stride, padding). data_format (string): The data format of the input and output data. @@ -1308,7 +1308,7 @@ class MaxUnPool3D(Layer): Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F import numpy as np diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index fc85dfe1a1a..d48219fee48 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -54,32 +54,32 @@ def split_states(states, bidirectional=False, state_components=1): Parameters: states (Tensor|tuple|list): the concatenated states for RNN network. When `state_components` is 1, states in a Tensor with shape - `(L*D, N, C)` where `L` is the number of layers of the RNN - network, `D` is the number of directions of the RNN network(1 - for unidirectional RNNs and 2 for bidirectional RNNs), `N` is - the batch size of the input to the RNN network, `C` is the - hidden size of the RNN network. - - When `state_components` is larger than 1, `states` is a tuple of - `state_components` Tensors that meet the requirements described - above. - - For SimpleRNNs and GRUs, `state_components` is 1, and for LSTMs, + `(L*D, N, C)` where `L` is the number of layers of the RNN + network, `D` is the number of directions of the RNN network(1 + for unidirectional RNNs and 2 for bidirectional RNNs), `N` is + the batch size of the input to the RNN network, `C` is the + hidden size of the RNN network. + + When `state_components` is larger than 1, `states` is a tuple of + `state_components` Tensors that meet the requirements described + above. + + For SimpleRNNs and GRUs, `state_components` is 1, and for LSTMs, `state_components` is 2. - bidirectional (bool): whether the state is of a bidirectional RNN + bidirectional (bool): whether the state is of a bidirectional RNN network. Defaults to False. state_components (int): the number of the components of the states. see `states` above. Defaults to 1. - + Returns: - A nested list or tuple of RNN cell states. - If `bidirectional` is True, it can be indexed twice to get an RNN - cell state. The first index indicates the layer, the second index + A nested list or tuple of RNN cell states. + If `bidirectional` is True, it can be indexed twice to get an RNN + cell state. The first index indicates the layer, the second index indicates the direction. If `bidirectional` is False, it can be indexed once to get an RNN cell state. The index indicates the layer. Note that if `state_components` is larger than 1, an RNN cell state - can be indexed one more time to get a tensor of shape(N, C), where + can be indexed one more time to get a tensor of shape(N, C), where `N` is the batch size of the input to the RNN cell, and `C` is the hidden size of the RNN cell. 
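For illustration (a sketch, not part of the patch), the concatenated `(L*D, N, C)` layout described above can be observed from the final states of a two-layer bidirectional recurrent network:

    .. code-block:: python

        import paddle

        rnn = paddle.nn.SimpleRNN(input_size=16, hidden_size=32,
                                  num_layers=2, direction='bidirect')
        x = paddle.randn((4, 23, 16))   # [batch_size, time_steps, input_size]
        y, h = rnn(x)

        # L=2 layers and D=2 directions, so h has shape [L*D, N, C] = [4, 4, 32];
        # indices 0/1 hold the forward/backward states of layer 0 and indices
        # 2/3 those of layer 1, which is exactly the form that split_states
        # unpacks into a nested list.
        print(h.shape)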
""" @@ -101,35 +101,35 @@ def split_states(states, bidirectional=False, state_components=1): def concat_states(states, bidirectional=False, state_components=1): r""" - Concatenate a possibly nested list or tuple of RNN cell states into a + Concatenate a possibly nested list or tuple of RNN cell states into a compact form. Parameters: - states (list|tuple): a possibly nested list or tuple of RNN cell - states. - If `bidirectional` is True, it can be indexed twice to get an - RNN cell state. The first index indicates the layer, the second + states (list|tuple): a possibly nested list or tuple of RNN cell + states. + If `bidirectional` is True, it can be indexed twice to get an + RNN cell state. The first index indicates the layer, the second index indicates the direction. If `bidirectional` is False, it can be indexed once to get an RNN cell state. The index indicates the layer. - Note that if `state_components` is larger than 1, an RNN cell - state can be indexed one more time to get a tensor of shape(N, C), - where `N` is the batch size of the input to the RNN cell, and - `C` is the hidden size of the RNN cell. - bidirectional (bool): whether the state is of a bidirectional RNN + Note that if `state_components` is larger than 1, an RNN cell + state can be indexed one more time to get a tensor of shape(N, C), + where `N` is the batch size of the input to the RNN cell, and + `C` is the hidden size of the RNN cell. + bidirectional (bool): whether the state is of a bidirectional RNN network. Defaults to False. state_components (int): the number of the components of the states. see `states` above. Defaults to 1. - + Returns: Concatenated states for RNN network. When `state_components` is 1, states in a Tensor with shape - `(L\*D, N, C)` where `L` is the number of layers of the RNN - network, `D` is the number of directions of the RNN network(1 for - unidirectional RNNs and 2 for bidirectional RNNs), `N` is the batch - size of the input to the RNN network, `C` is the hidden size of the + `(L\*D, N, C)` where `L` is the number of layers of the RNN + network, `D` is the number of directions of the RNN network(1 for + unidirectional RNNs and 2 for bidirectional RNNs), `N` is the batch + size of the input to the RNN network, `C` is the hidden size of the RNN network. - + """ if state_components == 1: return paddle.stack(flatten(states)) @@ -159,28 +159,28 @@ class RNNCellBase(Layer): value. Parameters: - batch_ref (Tensor): A tensor, which shape would be used to - determine the batch size, which is used to generate initial - states. For `batch_ref`'s shape d, `d[batch_dim_idx]` is + batch_ref (Tensor): A tensor, which shape would be used to + determine the batch size, which is used to generate initial + states. For `batch_ref`'s shape d, `d[batch_dim_idx]` is treated as batch size. - shape (list|tuple, optional): A (possibly nested structure of) shape[s], - where a shape is a list/tuple of integer. `-1` (for batch size) - will be automatically prepended if a shape does not starts with - it. If None, property `state_shape` will be used. Defaults to + shape (list|tuple, optional): A (possibly nested structure of) shape[s], + where a shape is a list/tuple of integer. `-1` (for batch size) + will be automatically prepended if a shape does not starts with + it. If None, property `state_shape` will be used. Defaults to None. - dtype (str|list|tuple, optional): A (possibly nested structure of) - data type[s]. 
The structure must be same as that of `shape`, - except when all tensors' in states has the same data type, a - single data type can be used. If None and property `cell.state_shape` - is not available, current default floating type of paddle is + dtype (str|list|tuple, optional): A (possibly nested structure of) + data type[s]. The structure must be same as that of `shape`, + except when all tensors' in states has the same data type, a + single data type can be used. If None and property `cell.state_shape` + is not available, current default floating type of paddle is used. Defaults to None. - init_value (float, optional): A float value used to initialize states. + init_value (float, optional): A float value used to initialize states. Defaults to 0. - batch_dim_idx (int, optional): An integer indicating which + batch_dim_idx (int, optional): An integer indicating which dimension of the of `batch_ref` represents batch. Defaults to 0. - + Returns: - init_states (Tensor|tuple|list): tensor of the provided shape and + init_states (Tensor|tuple|list): tensor of the provided shape and dtype, or list of tensors that each satisfies the requirements, packed in the same structure as `shape` and `type` does. """ @@ -242,7 +242,7 @@ class RNNCellBase(Layer): r""" Abstract method (property). Used to initialize states. - A (possiblely nested structure of) shape[s], where a shape is a + A (possiblely nested structure of) shape[s], where a shape is a list/tuple of integers (-1 for batch size would be automatically inserted into a shape if shape is not started with it). Not necessary to be implemented if states are not initialized by @@ -270,7 +270,7 @@ class RNNCellBase(Layer): class SimpleRNNCell(RNNCellBase): r""" - Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it + Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it computes the outputs and updates states. The formula used is as follows: @@ -279,26 +279,26 @@ class SimpleRNNCell(RNNCellBase): h_{t} & = act(W_{ih}x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh}) y_{t} & = h_{t} - + where :math:`act` is for :attr:`activation`. - Please refer to `Finding Structure in Time + Please refer to `Finding Structure in Time `_ for more details. - + Parameters: input_size (int): The input size. hidden_size (int): The hidden size. - activation (str, optional): The activation in the SimpleRNN cell. + activation (str, optional): The activation in the SimpleRNN cell. It can be `tanh` or `relu`. Defaults to `tanh`. - weight_ih_attr (ParamAttr, optional): The parameter attribute for + weight_ih_attr (ParamAttr, optional): The parameter attribute for :math:`weight_ih`. Default: None. - weight_hh_attr(ParamAttr, optional): The parameter attribute for + weight_hh_attr(ParamAttr, optional): The parameter attribute for :math:`weight_hh`. Default: None. - bias_ih_attr (ParamAttr, optional): The parameter attribute for the + bias_ih_attr (ParamAttr, optional): The parameter attribute for the :math:`bias_ih`. Default: None. - bias_hh_attr (ParamAttr, optional): The parameter attribute for the + bias_hh_attr (ParamAttr, optional): The parameter attribute for the :math:`bias_hh`. Default: None. - name (str, optional): Name for the operation (optional, default is + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
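A minimal single-step sketch (an illustration, not part of the patch; shapes follow the Inputs/Returns description below):

    .. code-block:: python

        import paddle

        cell = paddle.nn.SimpleRNNCell(input_size=16, hidden_size=32)

        x = paddle.randn((4, 16))        # [batch_size, input_size]
        prev_h = paddle.randn((4, 32))   # [batch_size, hidden_size]

        y, h = cell(x, prev_h)
        # for SimpleRNNCell the step output equals the new hidden state
        print(y.shape)                   # [4, 32]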
Variables: @@ -306,7 +306,7 @@ class SimpleRNNCell(RNNCellBase): - **weight_hh** (Parameter): shape (hidden_size, hidden_size), hidden to hidden weight, corresponding to :math:`W_{hh}` in the formula. - **bias_ih** (Parameter): shape (hidden_size, ), input to hidden bias, corresponding to :math:`b_{ih}` in the formula. - **bias_hh** (Parameter): shape (hidden_size, ), hidden to hidden bias, corresponding to :math:`b_{hh}` in the formula. - + Inputs: - **inputs** (Tensor): shape `[batch_size, input_size]`, the input, corresponding to :math:`x_{t}` in the formula. - **states** (Tensor, optional): shape `[batch_size, hidden_size]`, the previous hidden state, corresponding to :math:`h_{t-1}` in the formula. When states is None, zero state is used. Defaults to None. @@ -314,7 +314,7 @@ class SimpleRNNCell(RNNCellBase): Returns: - **outputs** (Tensor): shape `[batch_size, hidden_size]`, the output, corresponding to :math:`h_{t}` in the formula. - **states** (Tensor): shape `[batch_size, hidden_size]`, the new hidden state, corresponding to :math:`h_{t}` in the formula. - + Notes: All the weights and bias are initialized with `Uniform(-std, std)` by default. Where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more information about parameter initialization, please refer to :ref:`api_fluid_ParamAttr`. @@ -406,7 +406,7 @@ class SimpleRNNCell(RNNCellBase): class LSTMCell(RNNCellBase): r""" - Long-Short Term Memory(LSTM) RNN cell. Given the inputs and previous states, + Long-Short Term Memory(LSTM) RNN cell. Given the inputs and previous states, it computes the outputs and updates states. The formula used is as follows: @@ -426,7 +426,7 @@ class LSTMCell(RNNCellBase): y_{t} & = h_{t} - where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise + where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise multiplication operator. Please refer to `An Empirical Exploration of Recurrent Network Architectures @@ -435,15 +435,15 @@ class LSTMCell(RNNCellBase): Parameters: input_size (int): The input size. hidden_size (int): The hidden size. - weight_ih_attr(ParamAttr, optional): The parameter attribute for + weight_ih_attr(ParamAttr, optional): The parameter attribute for `weight_ih`. Default: None. - weight_hh_attr(ParamAttr, optional): The parameter attribute for + weight_hh_attr(ParamAttr, optional): The parameter attribute for `weight_hh`. Default: None. - bias_ih_attr (ParamAttr, optional): The parameter attribute for the + bias_ih_attr (ParamAttr, optional): The parameter attribute for the `bias_ih`. Default: None. - bias_hh_attr (ParamAttr, optional): The parameter attribute for the + bias_hh_attr (ParamAttr, optional): The parameter attribute for the `bias_hh`. Default: None. - name (str, optional): Name for the operation (optional, default is + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Variables: @@ -461,8 +461,8 @@ class LSTMCell(RNNCellBase): - **states** (tuple): a tuple of two tensors, each of shape `[batch_size, hidden_size]`, the new hidden states, corresponding to :math:`h_{t}, c_{t}` in the formula. Notes: - All the weights and bias are initialized with `Uniform(-std, std)` by - default. Where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more + All the weights and bias are initialized with `Uniform(-std, std)` by + default. Where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more information about parameter initialization, please refer to :ref:`api_fluid_ParamAttr`. 
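For reference, a minimal single-step sketch (an illustration, not part of the patch) using the state tuple `(h, c)` described above:

    .. code-block:: python

        import paddle

        cell = paddle.nn.LSTMCell(input_size=16, hidden_size=32)

        x = paddle.randn((4, 16))        # [batch_size, input_size]
        prev_h = paddle.randn((4, 32))
        prev_c = paddle.randn((4, 32))

        y, (h, c) = cell(x, (prev_h, prev_c))
        # y equals h; all three tensors have shape [4, 32]
        print(y.shape, h.shape, c.shape)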
Examples: @@ -550,9 +550,9 @@ class LSTMCell(RNNCellBase): @property def state_shape(self): r""" - The `state_shape` of LSTMCell is a tuple with two shapes: - `((hidden_size, ), (hidden_size,))`. (-1 for batch size would be - automatically inserted into shape). These two shapes correspond + The `state_shape` of LSTMCell is a tuple with two shapes: + `((hidden_size, ), (hidden_size,))`. (-1 for batch size would be + automatically inserted into shape). These two shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. """ return ((self.hidden_size, ), (self.hidden_size, )) @@ -563,7 +563,7 @@ class LSTMCell(RNNCellBase): class GRUCell(RNNCellBase): r""" - Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, + Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, it computes the outputs and updates states. The formula for GRU used is as follows: @@ -579,8 +579,8 @@ class GRUCell(RNNCellBase): h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t} y_{t} & = h_{t} - - where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise + + where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise multiplication operator. Please refer to `An Empirical Exploration of Recurrent Network Architectures @@ -589,15 +589,15 @@ class GRUCell(RNNCellBase): Parameters: input_size (int): The input size. hidden_size (int): The hidden size. - weight_ih_attr(ParamAttr, optional): The parameter attribute for + weight_ih_attr(ParamAttr, optional): The parameter attribute for `weight_ih`. Default: None. - weight_hh_attr(ParamAttr, optional): The parameter attribute for + weight_hh_attr(ParamAttr, optional): The parameter attribute for `weight_hh`. Default: None. - bias_ih_attr (ParamAttr, optional): The parameter attribute for the + bias_ih_attr (ParamAttr, optional): The parameter attribute for the `bias_ih`. Default: None. - bias_hh_attr (ParamAttr, optional): The parameter attribute for the + bias_hh_attr (ParamAttr, optional): The parameter attribute for the `bias_hh`. Default: None. - name (str, optional): Name for the operation (optional, default is + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Variables: @@ -613,10 +613,10 @@ class GRUCell(RNNCellBase): Returns: - **outputs** (Tensor): shape `[batch_size, hidden_size]`, the output, corresponding to :math:`h_{t}` in the formula. - **states** (Tensor): shape `[batch_size, hidden_size]`, the new hidden state, corresponding to :math:`h_{t}` in the formula. - + Notes: - All the weights and bias are initialized with `Uniform(-std, std)` by - default. Where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more + All the weights and bias are initialized with `Uniform(-std, std)` by + default. Where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more information about parameter initialization, please refer to s:ref:`api_fluid_ParamAttr`. Examples: @@ -714,8 +714,8 @@ class GRUCell(RNNCellBase): class RNN(Layer): r""" - Wrapper for RNN, which creates a recurrent neural network with an RNN cell. - It performs :code:`cell.forward()` repeatedly until reaches to the maximum + Wrapper for RNN, which creates a recurrent neural network with an RNN cell. + It performs :code:`cell.forward()` repeatedly until reaches to the maximum length of `inputs`. Parameters: @@ -729,16 +729,16 @@ class RNN(Layer): - **inputs** (Tensor): A (possibly nested structure of) tensor[s]. The input sequences. 
If time major is False, the shape is `[batch_size, time_steps, input_size]`. If time major is True, the shape is `[time_steps, batch_size, input_size]` where `input_size` is the input size of the cell. - **initial_states** (Tensor|list|tuple, optional): Tensor of a possibly nested structure of tensors, representing the initial state for the rnn cell. If not provided, `cell.get_initial_states` would be called to produce the initial states. Defaults to None. - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None.If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings. - - **kwargs**: Additional keyword arguments to pass to `forward` of the cell. + - **kwargs**: Additional keyword arguments to pass to `forward` of the cell. Returns: - **outputs** (Tensor|list|tuple): the output sequences. If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, else `[batch_size, time_steps, hidden_size]`. - **final_states** (Tensor|list|tuple): final states of the cell. Tensor or a possibly nested structure of tensors which has the same structure with intial state. Each tensor in final states has the same shape and dtype as the corresponding tensor in initial states. - + Notes: This class is a low level API for wrapping rnn cell into a RNN network. - Users should take care of the state of the cell. If `initial_states` is - passed to the `forward` method, make sure that it satisfies the + Users should take care of the state of the cell. If `initial_states` is + passed to the `forward` method, make sure that it satisfies the requirements of the cell. Examples: @@ -789,9 +789,9 @@ class RNN(Layer): class BiRNN(Layer): r""" - Wrapper for bidirectional RNN, which builds a bidiretional RNN given the - forward rnn cell and backward rnn cell. A BiRNN applies forward RNN and - backward RNN with coresponding cells separately and concats the outputs + Wrapper for bidirectional RNN, which builds a bidiretional RNN given the + forward rnn cell and backward rnn cell. A BiRNN applies forward RNN and + backward RNN with coresponding cells separately and concats the outputs along the last axis. Parameters: @@ -808,12 +808,12 @@ class BiRNN(Layer): Outputs: - **outputs** (Tensor): the outputs of the bidirectional RNN. It is the concatenation of the outputs from the forward RNN and backward RNN along the last axis. If time major is True, the shape is `[time_steps, batch_size, size]`, else the shape is `[batch_size, time_steps, size]`, where size is `cell_fw.hidden_size + cell_bw.hidden_size`. - - **final_states** (tuple): A tuple of the final states of the forward cell and backward cell. + - **final_states** (tuple): A tuple of the final states of the forward cell and backward cell. Notes: - This class is a low level API for wrapping rnn cells into a BiRNN - network. Users should take care of the states of the cells. - If `initial_states` is passed to the `forward` method, make sure that + This class is a low level API for wrapping rnn cells into a BiRNN + network. Users should take care of the states of the cells. + If `initial_states` is passed to the `forward` method, make sure that it satisfies the requirements of the cells. Examples: @@ -1112,12 +1112,12 @@ class RNNBase(LayerList): class SimpleRNN(RNNBase): r""" - Multilayer Elman network(SimpleRNN). 
It takes input sequences and initial + Multilayer Elman network(SimpleRNN). It takes input sequences and initial states as inputs, and returns the output sequences and the final states. - Each layer inside the SimpleRNN maps the input sequences and initial states - to the output sequences and final states in the following manner: at each - step, it takes step inputs(:math:`x_{t}`) and previous + Each layer inside the SimpleRNN maps the input sequences and initial states + to the output sequences and final states in the following manner: at each + step, it takes step inputs(:math:`x_{t}`) and previous states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`) and new states(:math:`h_{t}`). @@ -1126,7 +1126,7 @@ class SimpleRNN(RNNBase): h_{t} & = act(W_{ih}x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh}) y_{t} & = h_{t} - + where :math:`act` is for :attr:`activation`. Using key word arguments to construct is recommended. @@ -1138,24 +1138,24 @@ class SimpleRNN(RNNBase): direction (str, optional): The direction of the network. It can be "forward" or "bidirect"(or "bidirectional"). When "bidirect", the way to merge outputs of forward and backward is concatenating. Defaults to "forward". - time_major (bool, optional): Whether the first dimension of the input - means the time steps. If time_major is True, the shape of Tensor is + time_major (bool, optional): Whether the first dimension of the input + means the time steps. If time_major is True, the shape of Tensor is [time_steps,batch_size,input_size], otherwise [batch_size, time_steps,input_size]. Defaults to False. `time_steps` means the length of input sequence. - dropout (float, optional): The droput probability. Dropout is applied - to the input of each layer except for the first layer. The range of + dropout (float, optional): The droput probability. Dropout is applied + to the input of each layer except for the first layer. The range of dropout from 0 to 1. Defaults to 0. - activation (str, optional): The activation in each SimpleRNN cell. It can be + activation (str, optional): The activation in each SimpleRNN cell. It can be `tanh` or `relu`. Defaults to `tanh`. - weight_ih_attr (ParamAttr, optional): The parameter attribute for + weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih` of each cell. Defaults to None. - weight_hh_attr (ParamAttr, optional): The parameter attribute for + weight_hh_attr (ParamAttr, optional): The parameter attribute for `weight_hh` of each cell. Defaults to None. - bias_ih_attr (ParamAttr, optional): The parameter attribute for the + bias_ih_attr (ParamAttr, optional): The parameter attribute for the `bias_ih` of each cells. Defaults to None. - bias_hh_attr (ParamAttr, optional): The parameter attribute for the + bias_hh_attr (ParamAttr, optional): The parameter attribute for the `bias_hh` of each cells. Defaults to None. - name (str, optional): Name for the operation (optional, default is + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Inputs: @@ -1166,7 +1166,7 @@ class SimpleRNN(RNNBase): Returns: - **outputs** (Tensor): the output sequence. If `time_major` is True, the shape is `[time_steps, batch_size, num_directions * hidden_size]`, else, the shape is `[batch_size, time_steps, num_directions * hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1. `time_steps` means the length of the output sequence. - + - **final_states** (Tensor): final states. 
The shape is `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" (the index of forward states are 0, 2, 4, 6... and the index of backward states are 1, 3, 5, 7...), else 1. Variables: @@ -1223,13 +1223,13 @@ class SimpleRNN(RNNBase): class LSTM(RNNBase): r""" - Multilayer LSTM. It takes a sequence and an initial state as inputs, and + Multilayer LSTM. It takes a sequence and an initial state as inputs, and returns the output sequences and the final states. - Each layer inside the LSTM maps the input sequences and initial states - to the output sequences and final states in the following manner: at each - step, it takes step inputs(:math:`x_{t}`) and previous - states(:math:`h_{t-1}, c_{t-1}`) as inputs, and returns step + Each layer inside the LSTM maps the input sequences and initial states + to the output sequences and final states in the following manner: at each + step, it takes step inputs(:math:`x_{t}`) and previous + states(:math:`h_{t-1}, c_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`) and new states(:math:`h_{t}, c_{t}`). .. math:: @@ -1248,7 +1248,7 @@ class LSTM(RNNBase): y_{t} & = h_{t} - where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise + where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise multiplication operator. Using key word arguments to construct is recommended. @@ -1260,22 +1260,22 @@ class LSTM(RNNBase): direction (str, optional): The direction of the network. It can be "forward" or "bidirect"(or "bidirectional"). When "bidirect", the way to merge outputs of forward and backward is concatenating. Defaults to "forward". - time_major (bool, optional): Whether the first dimension of the input - means the time steps. If time_major is True, the shape of Tensor is + time_major (bool, optional): Whether the first dimension of the input + means the time steps. If time_major is True, the shape of Tensor is [time_steps,batch_size,input_size], otherwise [batch_size, time_steps,input_size]. Defaults to False. `time_steps` means the length of input sequence. - dropout (float, optional): The droput probability. Dropout is applied - to the input of each layer except for the first layer. The range of + dropout (float, optional): The droput probability. Dropout is applied + to the input of each layer except for the first layer. The range of dropout from 0 to 1. Defaults to 0. - weight_ih_attr (ParamAttr, optional): The parameter attribute for + weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih` of each cell. Default: None. - weight_hh_attr (ParamAttr, optional): The parameter attribute for + weight_hh_attr (ParamAttr, optional): The parameter attribute for `weight_hh` of each cell. Default: None. - bias_ih_attr (ParamAttr, optional): The parameter attribute for the + bias_ih_attr (ParamAttr, optional): The parameter attribute for the `bias_ih` of each cells. Default: None. - bias_hh_attr (ParamAttr, optional): The parameter attribute for the + bias_hh_attr (ParamAttr, optional): The parameter attribute for the `bias_hh` of each cells. Default: None. - name (str, optional): Name for the operation (optional, default is + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Inputs: @@ -1286,7 +1286,7 @@ class LSTM(RNNBase): Returns: - **outputs** (Tensor): the output sequence. 
If `time_major` is True, the shape is `[time_steps, batch_size, num_directions * hidden_size]`, If `time_major` is False, the shape is `[batch_size, time_steps, num_directions * hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1. `time_steps` means the length of the output sequence. - + - **final_states** (tuple): the final state, a tuple of two tensors, h and c. The shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" (the index of forward states are 0, 2, 4, 6... and the index of backward states are 1, 3, 5, 7...), else 1. Variables: @@ -1296,7 +1296,7 @@ class LSTM(RNNBase): - **bias_hh_l[k]**: the learnable hidden-hidden bias of the k-th layer, swith shape `[hidden_size]`. Examples: - + .. code-block:: python import paddle @@ -1338,13 +1338,13 @@ class LSTM(RNNBase): class GRU(RNNBase): r""" - Multilayer GRU. It takes input sequencse and initial states as inputs, and + Multilayer GRU. It takes input sequencse and initial states as inputs, and returns the output sequences and the final states. - Each layer inside the GRU maps the input sequences and initial states - to the output sequences and final states in the following manner: at each - step, it takes step inputs(:math:`x_{t}`) and previous - states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`) + Each layer inside the GRU maps the input sequences and initial states + to the output sequences and final states in the following manner: at each + step, it takes step inputs(:math:`x_{t}`) and previous + states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`) and new states(:math:`h_{t}`). .. math:: @@ -1359,7 +1359,7 @@ class GRU(RNNBase): y_{t} & = h_{t} - where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise + where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise multiplication operator. Using key word arguments to construct is recommended. @@ -1371,22 +1371,22 @@ class GRU(RNNBase): direction (str, optional): The direction of the network. It can be "forward" or "bidirect"(or "bidirectional"). When "bidirect", the way to merge outputs of forward and backward is concatenating. Defaults to "forward". - time_major (bool, optional): Whether the first dimension of the input - means the time steps. If time_major is True, the shape of Tensor is + time_major (bool, optional): Whether the first dimension of the input + means the time steps. If time_major is True, the shape of Tensor is [time_steps,batch_size,input_size], otherwise [batch_size, time_steps,input_size]. Defaults to False. `time_steps` means the length of input sequence. - dropout (float, optional): The droput probability. Dropout is applied - to the input of each layer except for the first layer. The range of + dropout (float, optional): The droput probability. Dropout is applied + to the input of each layer except for the first layer. The range of dropout from 0 to 1. Defaults to 0. - weight_ih_attr (ParamAttr, optional): The parameter attribute for + weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih` of each cell. Default: None. - weight_hh_attr (ParamAttr, optional): The parameter attribute for + weight_hh_attr (ParamAttr, optional): The parameter attribute for `weight_hh` of each cell. Default: None. 
- bias_ih_attr (ParamAttr, optional): The parameter attribute for the + bias_ih_attr (ParamAttr, optional): The parameter attribute for the `bias_ih` of each cells. Default: None. - bias_hh_attr (ParamAttr, optional): The parameter attribute for the + bias_hh_attr (ParamAttr, optional): The parameter attribute for the `bias_hh` of each cells. Default: None. - name (str, optional): Name for the operation (optional, default is + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Inputs: @@ -1397,7 +1397,7 @@ class GRU(RNNBase): Returns: - **outputs** (Tensor): the output sequence. If `time_major` is True, the shape is `[time_steps, batch_size, num_directions * hidden_size]`, else, the shape is `[batch_size, time_steps, num_directions * hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1. `time_steps` means the length of the output sequence. - + - **final_states** (Tensor): final states. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" (the index of forward states are 0, 2, 4, 6... and the index of backward states are 1, 3, 5, 7...), else 1. Variables: diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 75f8bac75bc..0ea83b3d84c 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -86,11 +86,11 @@ def _convert_attention_mask(attn_mask, dtype): to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. - When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. dtype (VarType): The target type of `attn_mask` we expect. @@ -133,7 +133,7 @@ class MultiHeadAttention(Layer): Default: None, which means the default bias parameter property is used. If it is set to False, this layer will not have trainable bias parameter. See usage for details in :code:`ParamAttr` . - + Examples: .. code-block:: python @@ -255,7 +255,7 @@ class MultiHeadAttention(Layer): (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple parallel attention. - + It is part of calculations in multi-head attention, and is provided as a method to pre-compute and prefetch these results, thus we can use them to construct cache for inference. @@ -291,12 +291,12 @@ class MultiHeadAttention(Layer): and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in MultiHeadAttention. 
- + If the generated cache is an instance of `Cache`, `k` and `v` fields reserve intermediate result tensors of previous positions, and the tensors are incremental among decoding steps, which mostly are used for decoder decoder self attention. - + If the generated cache is an instance of `StaticCache`, `k` and `v` fields would be used as calculated result tensors on keys an values in `forward`, and the tensors keep unchanged among decoding steps, which are mostly used @@ -306,7 +306,7 @@ class MultiHeadAttention(Layer): 1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the results to create an instance of `StaticCache`. - + 2. If `type` is `Cache` and `value` is None, generate empty tensors shaped `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results to create an instance of `Cache`, where `batch_size` is from the first @@ -326,7 +326,7 @@ class MultiHeadAttention(Layer): for batch size reference. Default None. type (type): It should be `MultiHeadAttention.StaticCache` or `MultiHeadAttention.Cache` to indicate the cache type to generate. - + Returns: namedtuple: an instance of `Cache` or `StaticCache` accordingly. """ @@ -370,11 +370,11 @@ class MultiHeadAttention(Layer): to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. - When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors @@ -473,7 +473,7 @@ class TransformerEncoderLayer(Layer): MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. - See usage for details in :code:`ParamAttr` . + See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property. If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. @@ -481,7 +481,7 @@ class TransformerEncoderLayer(Layer): The `False` value means the corresponding layer would not have trainable bias parameter. See usage for details in :code:`ParamAttr` . Default: None, which means the default bias parameter property is used. - + Examples: @@ -562,11 +562,11 @@ class TransformerEncoderLayer(Layer): to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. 
- When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. See `TransformerEncoderLayer.gen_cache` for more details. It is @@ -609,17 +609,17 @@ class TransformerEncoderLayer(Layer): def gen_cache(self, src): r""" - Generates cache for `forward` usage. The generated cache is an + Generates cache for `forward` usage. The generated cache is an instance of `MultiHeadAttention.Cache`. Parameters: src (Tensor): The input of Transformer encoder. It is a tensor - with shape `[batch_size, source_length, d_model]`. The data + with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. Returns: incremental_cache: It is an instance of `MultiHeadAttention.Cache` \ - produced by `self_attn.gen_cache`, it reserves two tensors + produced by `self_attn.gen_cache`, it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. See \ `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. @@ -631,7 +631,7 @@ class TransformerEncoderLayer(Layer): class TransformerEncoder(Layer): """ - TransformerEncoder is a stack of N encoder layers. + TransformerEncoder is a stack of N encoder layers. Parameters: encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It @@ -680,14 +680,14 @@ class TransformerEncoder(Layer): to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. - When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (list, optional): It is a list, and each element in the list - is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. + is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoder.gen_cache` for more details. It is only used for inference and should be None for training. Default None. @@ -721,7 +721,7 @@ class TransformerEncoder(Layer): def gen_cache(self, src): r""" Generates cache for `forward` usage. 
The generated cache is a list, and - each element in it is `incremental_cache` produced by + each element in it is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. @@ -731,8 +731,8 @@ class TransformerEncoder(Layer): should be float32 or float64. Returns: - list: It is a list, and each element in the list is `incremental_cache` - produced by `TransformerEncoderLayer.gen_cache`. See + list: It is a list, and each element in the list is `incremental_cache` + produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoderLayer.gen_cache` for more details. """ cache = [layer.gen_cache(src) for layer in self.layers] @@ -774,7 +774,7 @@ class TransformerDecoderLayer(Layer): for linear in FFN. Otherwise, the three sub-layers all uses it as `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details - in :ref:`api_paddle_fluid_param_attr_ParamAttr` . + in :ref:`api_paddle_fluid_param_attr_ParamAttr` . bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property. If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` would be used as `bias_attr` for @@ -881,21 +881,21 @@ class TransformerDecoderLayer(Layer): to prevents attention to some unwanted positions, usually the the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, target_length]`. - When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. memory_mask (Tensor, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, - usually the paddings. It is a tensor with shape broadcasted to - `[batch_size, n_head, target_length, source_length]`. When the - data type is bool, the unwanted positions have `False` values - and the others have `True` values. When the data type is int, - the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`. When the + data type is bool, the unwanted positions have `False` values + and the others have `True` values. When the data type is int, + the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. 
cache (tuple, optional): It is a tuple( :code:`(incremental_cache, static_cache)` ), `incremental_cache` is an instance of `MultiHeadAttention.Cache`, @@ -981,7 +981,7 @@ class TransformerDecoderLayer(Layer): class TransformerDecoder(Layer): """ - TransformerDecoder is a stack of N decoder layers. + TransformerDecoder is a stack of N decoder layers. Parameters: decoder_layer (Layer): an instance of the `TransformerDecoderLayer`. It @@ -1039,22 +1039,22 @@ class TransformerDecoder(Layer): tgt_mask (Tensor, optional): A tensor used in self attention to prevents attention to some unwanted positions, usually the the subsequent positions. It is a tensor with shape broadcasted - to `[batch_size, n_head, target_length, target_length]`. When - the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when + to `[batch_size, n_head, target_length, target_length]`. When + the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. memory_mask (Tensor, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, usually the paddings. It is a tensor with shape broadcasted to - `[batch_size, n_head, target_length, source_length]`. When the - data type is bool, the unwanted positions have `False` values - and the others have `True` values. When the data type is int, - the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when + `[batch_size, n_head, target_length, source_length]`. When the + data type is bool, the unwanted positions have `False` values + and the others have `True` values. When the data type is int, + the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (list, optional): It is a list, and each element in the list is a tuple( :code:`(incremental_cache, static_cache)` ). See @@ -1131,12 +1131,12 @@ class Transformer(Layer): Please refer to `Attention is all you need `_ , and see `TransformerEncoder` and `TransformerDecoder` for more details. - + Users can configurate the model architecture with corresponding parameters. Note the usage of `normalize_before` representing where to apply layer normalization (in pre-process or post-precess of multi-head attention or FFN), and some transformer like models are different on this, such as - `BERT `_ and `GPT2 `_ . + `BERT `_ and `GPT2 `_ . The default architecture here places layer normalization in post-process and applies another layer normalization on the output of last encoder/decoder layer. @@ -1162,30 +1162,30 @@ class Transformer(Layer): Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. 
Default False weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property. - If it is a list/tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3, - `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]` - would be used as `weight_attr` for cross attention of `TransformerDecoder`, - and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. - If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention - and cross attntion and `weight_attr[1]` would be used as `weight_attr` for - linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr` - for self attention, cross attention and linear in FFN. Otherwise, - the three sub-layers all uses it as `weight_attr` to create parameters. - Default: None, which means the default weight parameter property is used. + If it is a list/tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3, + `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]` + would be used as `weight_attr` for cross attention of `TransformerDecoder`, + and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. + If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention + and cross attntion and `weight_attr[1]` would be used as `weight_attr` for + linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr` + for self attention, cross attention and linear in FFN. Otherwise, + the three sub-layers all uses it as `weight_attr` to create parameters. + Default: None, which means the default weight parameter property is used. See usage for details - in :code:`ParamAttr` . + in :code:`ParamAttr` . bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property. - If it is a list/tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, - `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` - would be used as `bias_attr` for cross attention of `TransformerDecoder`, - and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. - If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention - and cross attntion and `bias_attr[1]` would be used as `bias_attr` for - linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr` - for self attention, cross attention and linear in FFN. Otherwise, - the three sub-layers all uses it as `bias_attr` to create parameters. - The `False` value means the corresponding layer would not have trainable - bias parameter. See usage for details in :code:`ParamAttr` . + If it is a list/tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, + `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` + would be used as `bias_attr` for cross attention of `TransformerDecoder`, + and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. + If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention + and cross attntion and `bias_attr[1]` would be used as `bias_attr` for + linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr` + for self attention, cross attention and linear in FFN. Otherwise, + the three sub-layers all uses it as `bias_attr` to create parameters. + The `False` value means the corresponding layer would not have trainable + bias parameter. See usage for details in :code:`ParamAttr` . Default: None,which means the default bias parameter property is used. 
custom_encoder (Layer, optional): If custom encoder is provided, use it as the encoder. Default None @@ -1323,31 +1323,31 @@ class Transformer(Layer): to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. - When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. tgt_mask (Tensor, optional): A tensor used in self attention to prevents attention to some unwanted positions, usually the the subsequent positions. It is a tensor with shape broadcasted - to `[batch_size, n_head, target_length, target_length]`. When - the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when + to `[batch_size, n_head, target_length, target_length]`. When + the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. memory_mask (Tensor, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, usually the paddings. It is a tensor with shape broadcasted to - `[batch_size, n_head, target_length, source_length]`. When the - data type is bool, the unwanted positions have `False` values - and the others have `True` values. When the data type is int, - the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when + `[batch_size, n_head, target_length, source_length]`. When the + data type is bool, the unwanted positions have `False` values + and the others have `True` values. When the data type is int, + the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. 
Returns: diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py index e338e89dada..fe367828815 100644 --- a/python/paddle/nn/layer/vision.py +++ b/python/paddle/nn/layer/vision.py @@ -22,8 +22,8 @@ __all__ = [] class PixelShuffle(Layer): """ - - PixelShuffle Layer + + PixelShuffle Layer Rearranges elements in a tensor of shape :math:`[N, C, H, W]` to a tensor of shape :math:`[N, C/upscale_factor^2, H*upscale_factor, W \times upscale_factor]`, @@ -47,7 +47,7 @@ class PixelShuffle(Layer): Examples: .. code-block:: python - + import paddle import paddle.nn as nn @@ -90,8 +90,8 @@ class PixelShuffle(Layer): class PixelUnshuffle(Layer): """ Rearranges elements in a tensor of shape :math:`[N, C, H, W]` - to a tensor of shape :math:`[N, r^2C, H/r, W/r]`, or from shape - :math:`[N, H, W, C]` to :math:`[N, H/r, W/r, r^2C]`, where :math:`r` is the + to a tensor of shape :math:`[N, r^2C, H/r, W/r]`, or from shape + :math:`[N, H, W, C]` to :math:`[N, H/r, W/r, r^2C]`, where :math:`r` is the downscale factor. This operation is the reversion of PixelShuffle operation. Please refer to the paper: `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network `_ . @@ -155,10 +155,10 @@ class ChannelShuffle(Layer): This operator divides channels in a tensor of shape [N, C, H, W] or [N, H, W, C] into g groups, getting a tensor with the shape of [N, g, C/g, H, W] or [N, H, W, g, C/g], and transposes them as [N, C/g, g, H, W] or [N, H, W, g, C/g], then rearranges them to original tensor shape. This - operation can improve the interaction between channels, using features efficiently. Please - refer to the paper: `ShuffleNet: An Extremely Efficient + operation can improve the interaction between channels, using features efficiently. Please + refer to the paper: `ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices `_ . - by Zhang et. al (2017) for more details. + by Zhang et. al (2017) for more details. Parameters: groups (int): Number of groups to divide channels in. diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index 59d1389f099..c41985e4b2f 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -516,7 +516,7 @@ class QuantizedConv2DTranspose(Layer): """ The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose. The only difference is that its inputs are all fake quantized. - + Examples: .. code-block:: python import paddle @@ -703,7 +703,7 @@ class QuantizedColumnParallelLinear(Layer): act_quant_layer=None): super(QuantizedColumnParallelLinear, self).__init__() ''' - + ''' assert weight_quant_layer is None, "When quantizing ColumnParallelLinear, weight_quant_layer should be None." assert act_quant_layer is None, "When quantizing ColumnParallelLinear, act_quant_layer should be None." 
diff --git a/python/paddle/nn/utils/spectral_norm_hook.py b/python/paddle/nn/utils/spectral_norm_hook.py index 375fe9013b8..0a89e8a386e 100644 --- a/python/paddle/nn/utils/spectral_norm_hook.py +++ b/python/paddle/nn/utils/spectral_norm_hook.py @@ -133,7 +133,7 @@ def spectral_norm(layer, eps=1e-12, dim=None): r""" - This spectral_norm layer applies spectral normalization to a parameter according to the + This spectral_norm layer applies spectral normalization to a parameter according to the following Calculation: Step 1: @@ -169,7 +169,7 @@ def spectral_norm(layer, n_power_iterations(int, optional): The number of power iterations to calculate spectral norm. Default: 1. eps(float, optional): The epsilon for numerical stability in calculating norms. Default: 1e-12. dim(int, optional): The index of dimension which should be permuted to the first before reshaping Input(Weight) to matrix, it should be set as 0 if Input(Weight) is the weight of fc layer, and should be set as 1 if Input(Weight) is the weight of conv layer. Default: None. - + Returns: The original layer with the spectral norm hook @@ -188,11 +188,11 @@ def spectral_norm(layer, # [[[[-0.21090528, 0.18563725, -0.14127982], # [-0.02310637, 0.03197737, 0.34353802], # [-0.17117859, 0.33152047, -0.28408015]], - # + # # [[-0.13336606, -0.01862637, 0.06959272], # [-0.02236020, -0.27091628, -0.24532901], # [ 0.27254242, 0.15516677, 0.09036587]], - # + # # [[ 0.30169338, -0.28146112, -0.11768346], # [-0.45765871, -0.12504843, -0.17482486], # [-0.36866254, -0.19969313, 0.08783543]]]]) diff --git a/python/paddle/nn/utils/transform_parameters.py b/python/paddle/nn/utils/transform_parameters.py index 44b870a9a47..bc813ccad24 100644 --- a/python/paddle/nn/utils/transform_parameters.py +++ b/python/paddle/nn/utils/transform_parameters.py @@ -44,7 +44,7 @@ def _stride_column(param): Args: param(Tensor]): The param that will be strided according to 'columns'. - + Examples: .. code-block:: python @@ -82,7 +82,7 @@ def parameters_to_vector(parameters, name=None): Returns: A 1-D Tensor, which represents the parameters of a Layer. - + Examples: .. code-block:: python diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index 40c1021848c..7219077db91 100755 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -164,29 +164,29 @@ class WeightNorm(object): def weight_norm(layer, name='weight', dim=0): r""" - This weight_norm layer applies weight normalization to a parameter according to the + This weight_norm layer applies weight normalization to a parameter according to the following formula: .. math:: \mathbf{w} = g \dfrac{v}{\|v\|} - Weight normalization is a reparameterization of the weight vectors in a neural network that - decouples the magnitude of those weight vectors from their direction. Weight normalization - replaces the parameter specified by `name`(eg: 'weight') with two parameters: one parameter - specifying the magnitude (eg: 'weight_g') and one parameter specifying the direction - (eg: 'weight_v'). Weight normalization has been implemented as discussed in this paper: + Weight normalization is a reparameterization of the weight vectors in a neural network that + decouples the magnitude of those weight vectors from their direction. 
Weight normalization + replaces the parameter specified by `name`(eg: 'weight') with two parameters: one parameter + specifying the magnitude (eg: 'weight_g') and one parameter specifying the direction + (eg: 'weight_v'). Weight normalization has been implemented as discussed in this paper: `Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks `_. Parameters: layer(Layer): Layer of paddle, which has weight. name(str, optional): Name of the weight parameter. Default: 'weight'. - dim(int, optional): Dimension over which to compute the norm. Dim is a non-negative number - which is less than the rank of weight Tensor. For Example, dim can be chosen from 0, - 1, 2, 3 for convolution whose weight shape is [cout, cin, kh, kw] and rank is 4. + dim(int, optional): Dimension over which to compute the norm. Dim is a non-negative number + which is less than the rank of weight Tensor. For Example, dim can be chosen from 0, + 1, 2, 3 for convolution whose weight shape is [cout, cin, kh, kw] and rank is 4. If dim is set to None, meaning that all elements will be normalized. Default: 0. - + Returns: Origin layer with weight norm hook. @@ -222,7 +222,7 @@ def remove_weight_norm(layer, name='weight'): Examples: .. code-block:: python - + import paddle from paddle.nn import Conv2D from paddle.nn.utils import weight_norm, remove_weight_norm diff --git a/python/paddle/onnx/export.py b/python/paddle/onnx/export.py index 666cd7c0862..5cdf0b06eec 100644 --- a/python/paddle/onnx/export.py +++ b/python/paddle/onnx/export.py @@ -26,20 +26,20 @@ def export(layer, path, input_spec=None, opset_version=9, **configs): Args: layer (Layer): The Layer to be exported. path (str): The path prefix to export model. The format is ``dirname/file_prefix`` or ``file_prefix`` , - and the exported ONNX file suffix is ``.onnx`` . - input_spec (list[InputSpec|Tensor], optional): Describes the input of the exported model's forward - method, which can be described by InputSpec or example Tensor. If None, all input variables of + and the exported ONNX file suffix is ``.onnx`` . + input_spec (list[InputSpec|Tensor], optional): Describes the input of the exported model's forward + method, which can be described by InputSpec or example Tensor. If None, all input variables of the original Layer's forward method would be the inputs of the exported ``ONNX`` model. Default: None. opset_version(int, optional): Opset version of exported ONNX model. Now, stable supported opset version include 9, 10, 11. Default: 9. - **configs (dict, optional): Other export configuration options for compatibility. We do not - recommend using these configurations, they may be removed in the future. If not necessary, + **configs (dict, optional): Other export configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. The following options are currently supported: (1) output_spec (list[Tensor]): Selects the output targets of the exported model. - By default, all return variables of original Layer's forward method are kept as the - output of the exported model. If the provided ``output_spec`` list is not all output variables, - the exported model will be pruned according to the given ``output_spec`` list. + By default, all return variables of original Layer's forward method are kept as the + output of the exported model. 
If the provided ``output_spec`` list is not all output variables, + the exported model will be pruned according to the given ``output_spec`` list. Returns: None Examples: diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py index f3c15ce479d..6551f194871 100644 --- a/python/paddle/optimizer/adadelta.py +++ b/python/paddle/optimizer/adadelta.py @@ -57,7 +57,7 @@ class Adadelta(Optimizer): If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ the regularization setting here in optimizer will be ignored for this parameter. \ Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. + Default None, meaning there is no regularization. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , @@ -68,7 +68,7 @@ class Adadelta(Optimizer): Examples: .. code-block:: python - + import paddle import numpy as np inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") @@ -99,7 +99,7 @@ class Adadelta(Optimizer): 'weight_decay': 0.001, 'learning_rate': 0.1, }], - weight_decay=0.01) + weight_decay=0.01) out.backward() adadelta.step() adadelta.clear_grad() diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py index f5cd7bdaa83..99de4243e52 100644 --- a/python/paddle/optimizer/adagrad.py +++ b/python/paddle/optimizer/adagrad.py @@ -22,7 +22,7 @@ __all__ = [] class Adagrad(Optimizer): r""" - The Adaptive Gradient optimizer (Adagrad for short) use an optimization described + The Adaptive Gradient optimizer (Adagrad for short) use an optimization described in paper: `Adaptive Subgradient Methods for Online Learning and Stochastic Optimization `_. @@ -58,9 +58,9 @@ class Adagrad(Optimizer): the regularization setting here in optimizer will be ignored for this parameter. \ Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies, - ClipGradByGlobalNorm, ClipGradByNorm and ClipGradByValue. Default None, + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies, + ClipGradByGlobalNorm, ClipGradByNorm and ClipGradByValue. Default None, meaning there is no gradient clipping. name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -100,7 +100,7 @@ class Adagrad(Optimizer): 'weight_decay': 0.001, 'learning_rate': 0.1, }], - weight_decay=0.01) + weight_decay=0.01) out.backward() adagrad.step() adagrad.clear_grad() diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 1140516cdc5..26d082690b7 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -152,7 +152,7 @@ class Adam(Optimizer): 'beta1': 0.8 }], weight_decay=0.01, - beta1=0.9) + beta1=0.9) out.backward() adam.step() adam.clear_grad() @@ -426,7 +426,7 @@ class Adam(Optimizer): .. 
code-block:: python import paddle - + a = paddle.rand([2,13], dtype="float32") linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. @@ -521,7 +521,7 @@ class Adam(Optimizer): def _append_optimize_multi_tensor_op(self, target_block, parameters_and_grads): - """ + """ For Multi Tensor, append optimize merged_operator to block. """ assert isinstance(target_block, framework.Block) diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index cb07fdb7f56..03f766e646d 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -24,7 +24,7 @@ __all__ = [] class Adamax(Optimizer): r""" - The Adamax optimizer is implemented based on the Adamax Optimization + The Adamax optimizer is implemented based on the Adamax Optimization in Section 7 of `Adam paper `_. The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm, which makes the learning rate update algorithm more stable and simple. @@ -70,9 +70,9 @@ class Adamax(Optimizer): the regularization setting here in optimizer will be ignored for this parameter. \ Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -83,7 +83,7 @@ class Adamax(Optimizer): Examples: .. code-block:: python - + import paddle import numpy as np @@ -124,7 +124,7 @@ class Adamax(Optimizer): 'beta1': 0.8 }], weight_decay=0.01, - beta1=0.9) + beta1=0.9) out.backward() adam.step() adam.clear_grad() diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index fbe23c84a2a..4c13b8f7897 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -45,7 +45,7 @@ class AdamW(Optimizer): moemnt\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad - learning\_rate & = learning\_rate * + learning\_rate & = learning\_rate * \frac{\sqrt{1 - {\beta}_2^t}}{1 - {beta}_1^t} param\_out & = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) @@ -69,7 +69,7 @@ class AdamW(Optimizer): epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01. - lr_ratio (function|None, optional): If it is not None, + lr_ratio (function|None, optional): If it is not None, the learning rate will be updated with layerwise learning rate ratio. Otherwise, the learning rate is the original. Default: None. @@ -97,7 +97,7 @@ class AdamW(Optimizer): Examples: .. 
code-block:: python - + import paddle linear = paddle.nn.Linear(10, 10) @@ -136,7 +136,7 @@ class AdamW(Optimizer): 'beta1': 0.8 }], weight_decay=0.01, - beta1=0.9) + beta1=0.9) out.backward() opt.step() opt.clear_grad() @@ -541,7 +541,7 @@ class AdamW(Optimizer): .. code-block:: python import paddle - + a = paddle.rand([2,13], dtype="float32") linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. diff --git a/python/paddle/optimizer/lamb.py b/python/paddle/optimizer/lamb.py index cbdf91ed2bf..60b59e7b2c5 100644 --- a/python/paddle/optimizer/lamb.py +++ b/python/paddle/optimizer/lamb.py @@ -79,7 +79,7 @@ class Lamb(Optimizer): :ref:`api_guide_Name` . Usually name is no need to set and None by default. Examples: .. code-block:: python - + import paddle inp = paddle.uniform(shape=[10, 10], dtype='float32', min=-0.1, max=0.1) diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 4d7d128e05e..43404046bc0 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -59,7 +59,7 @@ class LRScheduler(object): instance to schedule learning rate. Examples: - Here is an example of a simple ``StepDecay`` implementation. + Here is an example of a simple ``StepDecay`` implementation. .. code-block:: python diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index 988ac052b03..f2cc8921215 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -115,7 +115,7 @@ class Momentum(Optimizer): 'learning_rate': 0.1 }], weight_decay=0.01, - momentum=0.9) + momentum=0.9) out.backward() momentum.step() momentum.clear_grad() @@ -274,7 +274,7 @@ class Momentum(Optimizer): def _create_regularization_of_grad(self, param, grad, regularization=None): """ Create and add backward regularization Operators - + Function helper of append_regularization_ops. """ # If ParamAttr is set to L2Decay, we skip doing regularization here. And then we fused @@ -416,7 +416,7 @@ class Momentum(Optimizer): def _append_optimize_multi_tensor_op(self, target_block, parameters_and_grads): - """ + """ For Multi Tensor, append optimize merged_operator to block. """ assert isinstance(target_block, framework.Block) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 1d399021c8e..18af11609c3 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -120,8 +120,8 @@ class Optimizer(object): The default value is None. Returns: - Base class for optimizer. - + Base class for optimizer. + Examples: .. code-block:: python @@ -138,7 +138,7 @@ class Optimizer(object): adam.clear_grad() #Take the subclass sgd as an example - #optimize parameters in linear_1 and linear2 in different options. + #optimize parameters in linear_1 and linear2 in different options. #Note that the learning_rate of linear_2 is 0.01. linear_1 = paddle.nn.Linear(10, 10) linear_2 = paddle.nn.Linear(10, 10) @@ -155,7 +155,7 @@ class Optimizer(object): 'weight_decay': 0.001, 'learning_rate': 0.1 }], - weight_decay=0.01) + weight_decay=0.01) loss.backward() sgd.step() sgd.clear_grad() @@ -277,12 +277,12 @@ class Optimizer(object): Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be include in state dict. If the optimizer never be called(minimize function), the state_dict is empty. 
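The save/restore round trip described in these optimizer docstrings can be sketched as follows; the checkpoint path is illustrative, and only calls already shown in the surrounding examples (``state_dict``, ``set_state_dict``, ``paddle.save``, ``paddle.load``) are used:

.. code-block:: python

    # Save the optimizer state and restore it later.
    import paddle

    linear = paddle.nn.Linear(10, 10)
    adam = paddle.optimizer.Adam(learning_rate=0.01,
                                 parameters=linear.parameters())

    paddle.save(adam.state_dict(), "adam.pdopt")       # hypothetical path

    opt_state = paddle.load("adam.pdopt")
    adam.set_state_dict(opt_state)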
- Args: + Args: None Returns: state_dict(dict) : dict contains all the Tensor used by optimizer - + Examples: .. code-block:: python @@ -311,11 +311,11 @@ class Optimizer(object): ''' Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be changed. - Args: + Args: state_dict(dict) : Dict contains all the Tensor needed by optimizer Return: None - + Examples: .. code-block:: python @@ -326,7 +326,7 @@ class Optimizer(object): layer_state_dict = emb.state_dict() paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr.NoamDecay( + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, @@ -430,7 +430,7 @@ class Optimizer(object): def set_lr(self, value): """ :api_attr: imperative - + Set the value of the learning rate manually in the optimizer. If the optimizer use LRScheduler, this API cannot be invoked, because it will lead to conflict. @@ -439,7 +439,7 @@ class Optimizer(object): Returns: None - + Examples: .. code-block:: python @@ -495,7 +495,7 @@ class Optimizer(object): def get_lr(self): """ - Get current learning rate of optimizer. + Get current learning rate of optimizer. If 'LRScheduler' is not used, the return value is all the same. If 'LRScheduler' is used, the return value is the current scheduled learing rete. @@ -884,7 +884,7 @@ class Optimizer(object): a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, + adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) out = linear(a) out.backward() @@ -1021,7 +1021,7 @@ class Optimizer(object): def _create_regularization_of_grad(self, param, grad, regularization=None): """ Create and add backward regularization Operators - + Function helper of append_regularization_ops. """ # If no gradient or no regularization is specified, then we don't need to do anything @@ -1069,22 +1069,22 @@ class Optimizer(object): parameters_and_grads, regularization=None): r"""Create and add backward regularization Operators - + Creates and adds backward regularization operators in the BlockDesc. This will add gradients of the regularizer function to the gradients of the parameters and return these modified gradients. This is the same as implementing weight decay in optimizers for regularization. - + Args: parameters_and_grads: A list of (parameters, gradients) pairs that need to be regularized. regularization: A global regularizer. If the parameter is not set. It will be applied with regularizer. - + Returns: list[(Variable, Variable)]: list of (parameters, gradients) \ pair with the regularized gradient - + Raises: Exception: Unknown regularization type """ @@ -1128,13 +1128,13 @@ class Optimizer(object): If not, new gradient will accumulat on previous gradient. There are two method to clear grad: set_to_zero or delete grad. - + Args: set_to_zero (bool, optional): If set grads to zero or not, default is True. - + Returns: None - + Examples: .. code-block:: python @@ -1145,7 +1145,7 @@ class Optimizer(object): a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. 
- adam = paddle.optimizer.Adam(learning_rate = 0.01, + adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) out = linear(a) out.backward() @@ -1195,13 +1195,13 @@ class Optimizer(object): tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) tensor pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to - indicate program pruning. If so, the program will be pruned by ``feed`` and + In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. Examples: .. code-block:: python - + import paddle linear = paddle.nn.Linear(10, 10) input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1) @@ -1240,7 +1240,7 @@ class Optimizer(object): def step(self): """ Execute the optimizer and update parameters once. - + Returns: None @@ -1254,7 +1254,7 @@ class Optimizer(object): a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, + adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) out = linear(a) out.backward() @@ -1359,7 +1359,7 @@ class Optimizer(object): @framework.dygraph_only def _append_optimize_multi_tensor_op(self, target_block, parameters_and_grads): - """ + """ For Multi Tensor, append optimize merged_operator to block. """ pass diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 7205a434d38..f0ca03a61a6 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -80,24 +80,24 @@ class RMSProp(Optimizer): the gradient; if False, by the uncentered second moment. Setting this to True may help with training, but is slightly more expensive in terms of computation and memory. Defaults to False. - parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. - This parameter is required in dygraph mode. And you can specify different options for - different parameter groups such as the learning rate, weight decay, etc, - then the parameters are list of dict. Note that the learning_rate in paramter groups - represents the scale of base learning_rate. + parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. + This parameter is required in dygraph mode. And you can specify different options for + different parameter groups such as the learning rate, weight decay, etc, + then the parameters are list of dict. Note that the learning_rate in paramter groups + represents the scale of base learning_rate. The default value is None in static mode, at this time all parameters will be updated. - weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. It canbe a float value as coeff of L2 regularization or \ :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. - If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, - the regularization setting here in optimizer will be ignored for this parameter. - Otherwise, the regularization setting here in optimizer will take effect. 
+ If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, + the regularization setting here in optimizer will be ignored for this parameter. + Otherwise, the regularization setting here in optimizer will take effect. Default None, meaning there is no regularization. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. + name (str, optional): This parameter is used by developers to print debugging information. For details, please refer to :ref:`api_guide_Name`. Default is None. Raises: @@ -136,7 +136,7 @@ class RMSProp(Optimizer): 'weight_decay': 0.001, 'learning_rate': 0.1 }], - weight_decay=0.01) + weight_decay=0.01) out.backward() rmsprop.step() rmsprop.clear_grad() diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index 1b5b9f4c4f1..5d601422e44 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -54,8 +54,8 @@ class SGD(Optimizer): :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to - :ref:`api_guide_Name` . - + :ref:`api_guide_Name` . + Examples: .. code-block:: python diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index c44d2f0f611..785e3bc1c2d 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -202,7 +202,7 @@ def export_chrome_tracing(dir_name: str, Args: dir_name(str): Directory to save profiling data. worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid]. - + Returns: A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file. @@ -376,27 +376,27 @@ class Profiler: import paddle import paddle.profiler as profiler - + class RandomDataset(paddle.io.Dataset): def __init__(self, num_samples): self.num_samples = num_samples - + def __getitem__(self, idx): image = paddle.rand(shape=[100], dtype='float32') label = paddle.randint(0, 10, shape=[1], dtype='int64') return image, label - + def __len__(self): return self.num_samples - + class SimpleNet(paddle.nn.Layer): def __init__(self): super(SimpleNet, self).__init__() self.fc = paddle.nn.Linear(100, 10) - + def forward(self, image, label=None): return self.fc(image) - + dataset = RandomDataset(20 * 4) simple_net = SimpleNet() opt = paddle.optimizer.SGD(learning_rate=1e-3, diff --git a/python/paddle/profiler/timer.py b/python/paddle/profiler/timer.py index 35689feb56c..5fcc251510d 100644 --- a/python/paddle/profiler/timer.py +++ b/python/paddle/profiler/timer.py @@ -169,7 +169,7 @@ class Hook(object): class TimerHook(Hook): """ A hook for recording real-time performance and the summary - performance of total steps. + performance of total steps. 
""" def __init__(self): diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index c0146fe9276..6eeea876a9c 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -37,7 +37,7 @@ class RecordEvent(ContextDecorator): Args: name(str): Name of the record event - event_type(TracerEventType, optional): Optional, default value is TracerEventType.PythonUserDefined. It is reserved for internal purpose, and it is better not to specify this parameter. + event_type(TracerEventType, optional): Optional, default value is TracerEventType.PythonUserDefined. It is reserved for internal purpose, and it is better not to specify this parameter. Examples: .. code-block:: python diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py index 9002cd0676e..b810a7b8c43 100644 --- a/python/paddle/reader/__init__.py +++ b/python/paddle/reader/__init__.py @@ -38,7 +38,7 @@ items. It can be any function with no parameter that creates a iterable Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. -Item should be of supported type (e.g., numpy array or list/tuple of float +Item should be of supported type (e.g., numpy array or list/tuple of float or int). An example implementation for single item data reader creator: diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 981f6e9253c..a01628e184a 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -51,31 +51,31 @@ else: def cache(reader): """ - Cache the reader data into memory. + Cache the reader data into memory. - Be careful that this method may take long time to process, - and consume lots of memory. :code:`reader()` would only - call once. + Be careful that this method may take long time to process, + and consume lots of memory. :code:`reader()` would only + call once. Args: - reader (generator): a reader object which yields + reader (generator): a reader object which yields data each time. Returns: generator: a decorated reader object which yields data from cached memory. - + Examples: .. code-block:: python import paddle - + def reader(): for i in range(3): yield i - + # All data is cached into memory cached_reader = paddle.io.cache(reader) - + # Output: 0 1 2 for i in cached_reader(): print(i) @@ -100,10 +100,10 @@ def map_readers(func, *readers): Args: - func: a function to read data and compute result, the output of this function + func: a function to read data and compute result, the output of this function will be set as the output of the resulted data reader. readers (Reader|list of Reader): list of readers whose outputs will be used as arguments of func. - + Returns: the resulted data reader (Reader) @@ -138,9 +138,9 @@ def shuffle(reader, buf_size): This API creates a decorated reader that outputs the shuffled data. - The output data from the origin reader will be saved into a buffer, + The output data from the origin reader will be saved into a buffer, and then shuffle the data. The size of buffer is determined by argument buf_size. - + Args: reader(callable): the original reader whose data will be shuffled. buf_size(int): the size of shuffled buffer. @@ -255,18 +255,18 @@ def compose(*readers, **kwargs): (1, 2, 3, 4, 5) Args: - readers (Reader|list of Reader): readers that will be composed together. + readers (Reader|list of Reader): readers that will be composed together. 
check_alignment(bool, optional): Indicates whether the input readers are checked for alignment. If True, whether input readers are aligned correctly will be checked, else alignment will not be checkout and trailing outputs will be discarded. Defaults to True. - Returns: + Returns: the new data reader (Reader). Raises: ComposeNotAligned: outputs of readers are not aligned. This will not raise if check_alignment is set to False. - + Examples: .. code-block:: python @@ -319,19 +319,19 @@ def buffered(reader, size): Returns: generator: the buffered data reader. - + Examples: .. code-block:: python import paddle - + def reader(): for i in range(3): yield i - + # Create a buffered reader, and the buffer size is 2. buffered_reader = paddle.io.buffered(reader, 2) - + # Output: 0 1 2 for i in buffered_reader(): print(i) @@ -368,8 +368,8 @@ def firstn(reader, n): """ paddle.fluid.io.firstn ( :ref:`api_fluid_io_firstn` ) is recommended to use, and paddle.reader.firstn is an alias. - - This API creates a decorated reader, and limits the max number of + + This API creates a decorated reader, and limits the max number of samples that reader could return. Args: @@ -390,7 +390,7 @@ def firstn(reader, n): firstn_reader = fluid.io.firstn(reader, 5) for e in firstn_reader(): print(e) - # the outputs are: 0 1 2 3 4 + # the outputs are: 0 1 2 3 4 """ # TODO(yuyang18): Check if just drop the reader, could clean the opened @@ -415,14 +415,14 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): Args: mapper (callable): a function to map the data from reader. - reader (callable): a data reader which yields the data. + reader (callable): a data reader which yields the data. process_num (int): thread number to handle original sample. - buffer_size (int): size of the queue to read data in. - order (bool): whether to keep the data order from original reader. + buffer_size (int): size of the queue to read data in. + order (bool): whether to keep the data order from original reader. Default False. Returns: - callable: a decorated reader with data mapping. + callable: a decorated reader with data mapping. """ end = XmapEndSignal() @@ -505,17 +505,17 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): def multiprocess_reader(readers, use_pipe=True, queue_size=1000): """ This API use python ``multiprocessing`` to read data from ``readers`` parallelly, - and then ``multiprocess.Queue`` or ``multiprocess.Pipe`` is used to merge - these data. A separate process will be created for each reader in the - ``readers`` list, please guarantee every reader can work independently + and then ``multiprocess.Queue`` or ``multiprocess.Pipe`` is used to merge + these data. A separate process will be created for each reader in the + ``readers`` list, please guarantee every reader can work independently to avoid conflicts in parallel environment. - - ``Multiprocess.Queue`` require the rw access right to /dev/shm, and it's not supported + + ``Multiprocess.Queue`` require the rw access right to /dev/shm, and it's not supported in some platforms. 
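A compact sketch of ``multiprocess_reader`` wrapping two in-memory generators, in addition to the full file-based example further below. It assumes a platform where ``multiprocessing`` can fork worker processes and that the decorated reader is iterated directly (the documented example instead feeds it to a ``PyReader``):

.. code-block:: python

    from paddle.fluid.io import multiprocess_reader

    def make_reader(offset):
        # returns a no-argument generator function, one per worker process
        def impl():
            for i in range(3):
                yield offset + i
        return impl

    if __name__ == '__main__':
        # one worker process per reader; samples from both readers are
        # merged through a multiprocess queue (use_pipe=False)
        reader = multiprocess_reader(
            [make_reader(0), make_reader(100)], use_pipe=False)

        for sample in reader():
            print(sample)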
Parameters: - readers (list( ``generator`` ) | tuple( ``generator`` )): a python ``generator`` list + readers (list( ``generator`` ) | tuple( ``generator`` )): a python ``generator`` list used to read input data use_pipe (bool, optional): control the inner API used to implement the multi-processing, default True - use ``multiprocess.Pipe`` which is recommended @@ -534,16 +534,16 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000): import paddle.fluid as fluid from paddle.fluid.io import multiprocess_reader import numpy as np - + sample_files = ['sample_file_1', 'sample_file_2'] - + def fake_input_files(): with open(sample_files[0], 'w') as f: np.savez(f, a=np.array([1, 2]), b=np.array([3, 4]), c=np.array([5, 6]), d=np.array([7, 8])) with open(sample_files[1], 'w') as f: np.savez(f, a=np.array([9, 10]), b=np.array([11, 12]), c=np.array([13, 14])) - - + + def generate_reader(file_name): # load data file def _impl(): @@ -551,28 +551,28 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000): for item in sorted(data.files): yield data[item], return _impl - + if __name__ == '__main__': # generate sample input files fake_input_files() - + with fluid.program_guard(fluid.Program(), fluid.Program()): place = fluid.CPUPlace() # the 1st 2 is batch size - image = fluid.data(name='image', dtype='int64', shape=[2, 1, 2]) + image = fluid.data(name='image', dtype='int64', shape=[2, 1, 2]) fluid.layers.Print(image) # print detailed tensor info of image variable - + reader = fluid.io.PyReader(feed_list=[image], capacity=2) - + decorated_reader = multiprocess_reader( [generate_reader(sample_files[0]), generate_reader(sample_files[1])], False) - + reader.decorate_sample_generator(decorated_reader, batch_size=2, places=[place]) - + exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - + for data in reader(): res = exe.run(feed=data, fetch_list=[image]) print(res[0]) diff --git a/python/paddle/regularizer.py b/python/paddle/regularizer.py index 586ae0f988c..89444a1357d 100644 --- a/python/paddle/regularizer.py +++ b/python/paddle/regularizer.py @@ -20,23 +20,23 @@ import paddle.fluid as fluid class L1Decay(fluid.regularizer.L1Decay): r""" Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse. - - It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). - When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in - ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has - higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined + + It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). + When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in + ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has + higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined in its ParamAttr, then the regularizer in Optimizer will be ignored. Otherwise the regularizer in Optimizer will be used. - + In the implementation, the loss function of L1 Weight Decay Regularization is as follows: - + .. math:: loss = coeff * reduce\_sum(abs(x)) Args: coeff(float, optional): regularization coeff. Default:0.0. - + Examples: .. 
code-block:: python @@ -82,14 +82,14 @@ class L1Decay(fluid.regularizer.L1Decay): class L2Decay(fluid.regularizer.L2Decay): r""" Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting. - - It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). - When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in - ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has - higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined + + It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). + When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in + ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has + higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined in its ParamAttr, then the regularizer in Optimizer will be ignored. Otherwise the regularizer in Optimizer will be used. - + In the implementation, the loss function of L2 Weight Decay Regularization is as follows: .. math:: @@ -98,7 +98,7 @@ class L2Decay(fluid.regularizer.L2Decay): Args: regularization_coeff(float, optional): regularization coeff. Default:0.0 - + Examples: .. code-block:: python diff --git a/python/paddle/signal.py b/python/paddle/signal.py index 656605f1bf2..cc4e16b7dd0 100644 --- a/python/paddle/signal.py +++ b/python/paddle/signal.py @@ -39,15 +39,15 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): with shape `[..., seq_length]` or `[seq_length, ...]`. frame_length (int): Length of the frame and `0 < frame_length <= x.shape[axis]`. hop_length (int): Number of steps to advance between adjacent frames - and `0 < hop_length`. + and `0 < hop_length`. axis (int, optional): Specify the axis to operate on the input Tensors. Its value should be 0(the first dimension) or -1(the last dimension). If not - specified, the last axis is used by default. + specified, the last axis is used by default. Returns: The output frames tensor with shape `[..., frame_length, num_frames]` if `axis==-1`, otherwise `[num_frames, frame_length, ...]` where - + `num_framse = 1 + (x.shape[axis] - frame_length) // hop_length` Examples: @@ -56,7 +56,7 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): import paddle from paddle.signal import frame - + # 1D x = paddle.arange(8) y0 = frame(x, frame_length=4, hop_length=2, axis=-1) # [4, 3] @@ -163,10 +163,10 @@ def overlap_add(x, hop_length, axis=-1, name=None): with shape `[..., frame_length, num_frames]` or `[num_frames, frame_length ...]`. hop_length (int): Number of steps to advance between adjacent frames and - `0 < hop_length <= frame_length`. + `0 < hop_length <= frame_length`. axis (int, optional): Specify the axis to operate on the input Tensors. Its value should be 0(the first dimension) or -1(the last dimension). If not - specified, the last axis is used by default. + specified, the last axis is used by default. 
Returns: The output frames tensor with shape `[..., seq_length]` if `axis==-1`, @@ -180,7 +180,7 @@ def overlap_add(x, hop_length, axis=-1, name=None): import paddle from paddle.signal import overlap_add - + # 2D x0 = paddle.arange(16).reshape([8, 2]) # [[0 , 1 ], @@ -205,7 +205,7 @@ def overlap_add(x, hop_length, axis=-1, name=None): y0 = overlap_add(x0, hop_length=2, axis=-1) # [2, 1, 10] x1 = paddle.arange(32).reshape([2, 8, 1, 2]) - y1 = overlap_add(x1, hop_length=2, axis=0) # [10, 1, 2] + y1 = overlap_add(x1, hop_length=2, axis=0) # [10, 1, 2] """ if axis not in [0, -1]: raise ValueError(f'Unexpected axis: {axis}. It should be 0 or -1.') @@ -255,19 +255,19 @@ def stft(x, The STFT computes the discrete Fourier transforms (DFT) of short overlapping windows of the input using this formula: - + .. math:: X_t[\omega] = \sum_{n = 0}^{N-1}% \text{window}[n]\ x[t \times H + n]\ % e^{-{2 \pi j \omega n}/{N}} - + Where: - :math:`t`: The :math:`t`-th input window. - :math:`\omega`: Frequency :math:`0 \leq \omega < \text{n\_fft}` for `onesided=False`, - or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`. + or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`. - :math:`N`: Value of `n_fft`. - - :math:`H`: Value of `hop_length`. - + - :math:`H`: Value of `hop_length`. + Args: x (Tensor): The input data which is a 1-dimensional or 2-dimensional Tensor with shape `[..., seq_length]`. It can be a real-valued or a complex Tensor. @@ -290,23 +290,23 @@ def stft(x, tensor. It can not be `True` if input is a complex tensor. Default: `True` name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]`( real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`( `onesided` is `False`) - + Examples: .. code-block:: python - + import paddle from paddle.signal import stft - + # real-valued input x = paddle.randn([8, 48000], dtype=paddle.float64) y1 = stft(x, n_fft=512) # [8, 257, 376] y2 = stft(x, n_fft=512, onesided=False) # [8, 512, 376] - + # complex input x = paddle.randn([8, 48000], dtype=paddle.float64) + \ paddle.randn([8, 48000], dtype=paddle.float64)*1j # [8, 48000] complex128 @@ -413,7 +413,7 @@ def istft(x, Inverse short-time Fourier transform (ISTFT). Reconstruct time-domain signal from the giving complex input and window tensor when - nonzero overlap-add (NOLA) condition is met: + nonzero overlap-add (NOLA) condition is met: .. math:: \sum_{t = -\infty}^{\infty}% @@ -432,7 +432,7 @@ def istft(x, Args: x (Tensor): The input data which is a 2-dimensional or 3-dimensional **complesx** - Tensor with shape `[..., n_fft, num_frames]`. + Tensor with shape `[..., n_fft, num_frames]`. n_fft (int): The size of Fourier transform. hop_length (int, optional): Number of steps to advance between adjacent windows from time-domain signal and `0 < hop_length < win_length`. Default: `None`( @@ -452,10 +452,10 @@ def istft(x, and `istft` will return a real-valued tensor when it is set to `True`. Default: `True`. length (int, optional): Specify the length of time-domain signal. Default: `None`( - treated as the whole length of signal). + treated as the whole length of signal). return_complex (bool, optional): It means that whether the time-domain signal is real-valued. 
If `return_complex` is set to `True`, `onesided` should be set to - `False` cause the output is complex. + `False` cause the output is complex. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index b8133872aa9..af2322895f8 100755 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -107,12 +107,12 @@ def fc(x, The default value is None, and the weight will be initialized to zero. For detailed information, please refer to :attr:`paddle.ParamAttr`. Warning, if x is a list of tensor, weight_attr should also be a list of same length. - bias_attr (ParamAttr|bool, optional): The attribute of the learnable bias. + bias_attr (ParamAttr|bool, optional): The attribute of the learnable bias. If it is set to False, no bias will be added to the output. If it is set to None or one kind of ParamAttr, a bias parameter will be created according to ParamAttr. For detailed information, please refer to :attr:`paddle.ParamAttr`. The default value is None and the bias will be - initialized to zero. + initialized to zero. activation (str, optional): Activation to be applied to the output of this layer, such as tanh, softmax, sigmoid, relu. For more information, please refer to :ref:`api_guide_activations_en` . Default: None. diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index f575092153b..d480f24a68b 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -251,7 +251,7 @@ def real(x, name=None): x (Tensor): the input Tensor, its data type could be complex64 or complex128. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - + Returns: Tensor: a Tensor containing real values of the input Tensor. diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index c8d43300238..df5ad281443 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -71,7 +71,7 @@ def linspace(start, stop, num, dtype=None, name=None): Returns: Tensor: the output data type will be float32, float64. The 1-D tensor with fixed number of evenly spaced values, \ the data shape of this tensor is :math:`[num]` . If the :attr:`num` is set 1, the output tensor just has \ - the value with input :attr:`start`. + the value with input :attr:`start`. Examples: .. code-block:: python @@ -153,10 +153,10 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None): r""" Return fixed number of logarithmical-evenly spaced values within the interval \ :math:`[base^{start}, base^{stop}]`. - + Notes: This API does not compute the gradient. - + Args: start(int|float|Tensor): The input :attr:`start` is exponent of first entry in \ the sequence. It is a scalar, or a Tensor of shape [1] with input data \ @@ -177,7 +177,7 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None): Tensor: The output data type will be float32, float64. The 1-D tensor with \ fixed number of logarithmical-evenly spaced values, the data shape of this \ tensor is :math:`[num]`. If the :attr:`num` is set 1, the output tensor \ - just has the value with exponential of :attr:`start` with base :attr:`base`. + just has the value with exponential of :attr:`start` with base :attr:`base`. Examples: .. 
code-block:: python @@ -404,7 +404,7 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None): def to_tensor(data, dtype=None, place=None, stop_gradient=True): r""" - Constructs a ``paddle.Tensor`` from ``data`` , + Constructs a ``paddle.Tensor`` from ``data`` , which can be scalar, tuple, list, numpy\.ndarray, paddle\.Tensor. If the ``data`` is already a Tensor, copy will be performed and return a new tensor. @@ -413,13 +413,13 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): Args: data(scalar|tuple|list|ndarray|Tensor): Initial data for the tensor. Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor. - dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , + dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8', - 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` + 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` except for python float number which gets dtype from ``get_default_type`` . - place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be - CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is - string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. + place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be + CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is + string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. Returns: @@ -430,7 +430,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): .. code-block:: python import paddle - + type(paddle.to_tensor(1)) # @@ -445,7 +445,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): paddle.to_tensor(x) # A new tensor will be created with default stop_gradient=True # Tensor(shape=[1], dtype=int64, place=CPUPlace, stop_gradient=True, - # [1]) + # [1]) paddle.to_tensor([[0.1, 0.2], [0.3, 0.4]], place=paddle.CPUPlace(), stop_gradient=False) # Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=False, @@ -486,18 +486,18 @@ def full_like(x, fill_value, dtype=None, name=None): x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64. fill_value(bool|float|int): The value to fill the tensor with. Note: this value shouldn't exceed the range of the output data type. dtype(np.dtype|str, optional): The data type of output. The data type can be one - of bool, float16, float32, float64, int32, int64. The default value is None, which means the output + of bool, float16, float32, float64, int32, int64. The default value is None, which means the output data type is the same as input. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: Tensor which is created according to ``x``, ``fill_value`` and ``dtype``. - + Examples: .. code-block:: python import paddle - + input = paddle.full(shape=[2, 3], fill_value=0.0, dtype='float32', name='input') output = paddle.full_like(input, 2.0) # [[2. 2. 2.] 
@@ -548,28 +548,28 @@ def ones(shape, dtype=None, name=None): dtype (np.dtype|str, optional): Data type of output Tensor, it should be one of bool, float16, float32, float64, int32 and int64. If it is set to None, the data type will be float32. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: A Tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements are 1. Examples: .. code-block:: python - import paddle + import paddle # default dtype for ones OP - data1 = paddle.ones(shape=[3, 2]) + data1 = paddle.ones(shape=[3, 2]) # [[1. 1.] # [1. 1.] # [1. 1.]] - data2 = paddle.ones(shape=[2, 2], dtype='int32') + data2 = paddle.ones(shape=[2, 2], dtype='int32') # [[1 1] # [1 1]] # shape is a Tensor shape = paddle.full(shape=[2], dtype='int32', fill_value=2) - data3 = paddle.ones(shape=shape, dtype='int32') + data3 = paddle.ones(shape=shape, dtype='int32') # [[1 1] # [1 1]] """ @@ -627,18 +627,18 @@ def zeros(shape, dtype=None, name=None): .. code-block:: python import paddle - - data = paddle.zeros(shape=[3, 2], dtype='float32') + + data = paddle.zeros(shape=[3, 2], dtype='float32') # [[0. 0.] # [0. 0.] # [0. 0.]] - data = paddle.zeros(shape=[2, 2]) + data = paddle.zeros(shape=[2, 2]) # [[0. 0.] # [0. 0.]] - + # shape is a Tensor shape = paddle.full(shape=[2], dtype='int32', fill_value=2) - data3 = paddle.zeros(shape=shape, dtype='int32') + data3 = paddle.zeros(shape=shape, dtype='int32') # [[0 0] # [0 0]] """ @@ -681,7 +681,7 @@ def zeros_like(x, dtype=None, name=None): def eye(num_rows, num_columns=None, dtype=None, name=None): """ - + This function constructs 2-D Tensor with ones on the diagonal and zeros elsewhere. Args: @@ -698,7 +698,7 @@ def eye(num_rows, num_columns=None, dtype=None, name=None): Examples: .. code-block:: python - + import paddle data = paddle.eye(3, dtype='int32') @@ -758,7 +758,7 @@ def full(shape, fill_value, dtype=None, name=None): """ Return a Tensor with the ``fill_value`` which size is same as ``shape``. - + Args: shape(list|tuple|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . If ``shape`` is a list or tuple, @@ -770,7 +770,7 @@ def full(shape, fill_value, dtype=None, name=None): which can be float16, float32, float64, int32, int64, if dytpe is `None`, the data type of created Tensor is `float32`. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: Tensor which is created according to ``shape``, ``fill_value`` and ``dtype``. @@ -779,7 +779,7 @@ def full(shape, fill_value, dtype=None, name=None): import paddle - data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') + data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') #[[0] # [0]] @@ -790,14 +790,14 @@ def full(shape, fill_value, dtype=None, name=None): # attr shape is a Tensor. shape = paddle.full([2], 2, "int32") - data4 = paddle.full(shape=shape, dtype='bool', fill_value=True) - # [[True True] + data4 = paddle.full(shape=shape, dtype='bool', fill_value=True) + # [[True True] # [True True]] - + # attr fill_value is a Tensor. val = paddle.full([1], 2.0, "float32") data5 = paddle.full(shape=[2,1], fill_value=val, dtype='float32') - # [[2.0] + # [[2.0] # [2.0]] """ @@ -835,7 +835,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): If ``dytpe`` is None, the data type is float32. Default is None. 
name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - Returns: + Returns: Tensor: A 1-D Tensor with values from the interval [``start``, ``end``) taken with common difference ``step`` beginning from ``start``. Its data type is set by ``dtype``. @@ -858,7 +858,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): start_var = paddle.to_tensor([3]) out4 = paddle.arange(start_var, 7) # [3, 4, 5, 6] - + """ if dtype is None: dtype = 'int64' @@ -956,8 +956,8 @@ def _tril_triu_op(helper): def tril(x, diagonal=0, name=None): r""" Returns the lower triangular part of a matrix (2-D tensor) or batch - of matrices :attr:`x`, the other elements of the result tensor are set - to 0. The lower triangular part of the matrix is defined as the elements + of matrices :attr:`x`, the other elements of the result tensor are set + to 0. The lower triangular part of the matrix is defined as the elements on and below the diagonal. Args: @@ -1085,14 +1085,14 @@ def triu(x, diagonal=0, name=None): def meshgrid(*args, **kwargs): """ Takes a list of N tensors as input *args, each of which is 1-dimensional vector, and creates N-dimensional grids. - + Args: - *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,), + *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,), (N2,),..., (Nk,). Support data types: ``float64``, ``float32``, ``int32``, ``int64``. - **kwargs (optional): Currently, only accept name in **kwargs + **kwargs (optional): Currently, only accept name in **kwargs The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: k tensors. The shape of each tensor is (N1, N2, ..., Nk) @@ -1304,7 +1304,7 @@ def diag(x, offset=0, padding_value=0, name=None): offset (int, optional): The diagonal offset. A positive value represents superdiagonal, 0 represents the main diagonal, and a negative value represents subdiagonal. padding_value (int|float, optional): Use this value to fill the area outside the specified diagonal band. Only takes effect when the input is a 1-D Tensor. The default value is 0. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor, a square matrix or a vector. The output data type is the same as input data type. @@ -1390,7 +1390,7 @@ def diag(x, offset=0, padding_value=0, name=None): def empty(shape, dtype=None, name=None): """ Returns a Tensor with uninitialized data which size is same as ``shape``. - + Args: shape(list|tuple|Tensor): Shape of the Tensor to be created. The data type of dimension of shape is ``int32`` or ``int64`` . If ``shape`` is a list or tuple, @@ -1401,7 +1401,7 @@ def empty(shape, dtype=None, name=None): type of created Tensor use global default dtype (see ``get_default_dtype`` for details). name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: Tensor which is created according to ``shape`` and ``dtype``, and is uninitialized. @@ -1487,14 +1487,14 @@ def empty_like(x, dtype=None, name=None): """ Returns a Tensor with uninitialized data which has identical shape of ``x`` and ``dtype``. If the ``dtype`` is None, the data type of Tensor is same with ``x``. 
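A minimal sketch of the ``empty_like`` call documented above; since the returned values are uninitialized, only the shape and dtype are meaningful:

.. code-block:: python

    import paddle

    x = paddle.ones([2, 3], dtype='float32')
    # allocates a tensor with the same shape and dtype as x,
    # without initializing its values
    out = paddle.empty_like(x)
    print(out.shape)   # [2, 3]
    print(out.dtype)   # paddle.float32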
- + Args: x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64. dtype(np.dtype|str, optional): The data type of output. The data type can be one - of bool, float16, float32, float64, int32, int64. The default value is None, which means the output + of bool, float16, float32, float64, int32, int64. The default value is None, which means the output data type is the same as input. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: Tensor which is created according to ``x`` and ``dtype``, and is uninitialized. @@ -1558,16 +1558,16 @@ def assign(x, output=None): """ Copy value of the :attr:`x` to the :attr:`output`. - + Parameters: x (Tensor|np.ndarray|list|tuple|scalar): A Tensor, numpy ndarray, tuple/list of scalar, or scalar. Its data type can be float16, float32, float64, int32, int64 or bool. Note: the float64 data will be converted to float32 because of current platform protobuf data limitation. output (Tensor, optional): A Tensor. If :attr:`output` is None, a new Tensor will be created as :attr:`output`. Default: None. - + Returns: Tensor: A Tensor with the same shape, data type and value as :attr:`x`. - + Examples: .. code-block:: python @@ -1706,15 +1706,15 @@ def assign(x, output=None): def clone(x, name=None): """ - Returns a copy of input Tensor. It will always have a Tensor copy. - + Returns a copy of input Tensor. It will always have a Tensor copy. + In addition, This function is derivable, so gradients will flow back from the output to input. Parameters: x (Tensor): The input Tensor. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - Returns: + Returns: Tensor, A Tensor copied from ``input``. Examples: @@ -1842,20 +1842,20 @@ def complex(real, imag, name=None): def tril_indices(row, col, offset=0, dtype='int64'): """ - Return the indices of the lower triangular part of the 2-D matrix - whose row and col is knowed.Indices are ordered based on row and then columns. + Return the indices of the lower triangular part of the 2-D matrix + whose row and col is knowed.Indices are ordered based on row and then columns. The lower triangular part of the matrix is defined as the elements on and below the diagonal. - + Args: row (int): The input x which is a int number describe the number of row of the matrix. col (int): The input x which is a int number describe the number of col of the matrix. offset (int, optional): The offset to consider, default value is 0. - - If offset = 0, all elements on and below the main diagonal are retained. - - If offset > 0, include just as many diagonals above the main diagonal. - - If offset < 0, excludes just as many diagonals below the main diagonal. - + - If offset = 0, all elements on and below the main diagonal are retained. + - If offset > 0, include just as many diagonals above the main diagonal. + - If offset < 0, excludes just as many diagonals below the main diagonal. + dtype (int, optional): the data type of the output tensor, can be int32, int64. Returns: @@ -1866,17 +1866,17 @@ def tril_indices(row, col, offset=0, dtype='int64'): .. 
code-block:: python import paddle - + # example 1, default offset value data1 = paddle.tril_indices(4,4,0) print(data1) - # [[0, 1, 1, 2, 2, 2, 3, 3, 3, 3], + # [[0, 1, 1, 2, 2, 2, 3, 3, 3, 3], # [0, 0, 1, 0, 1, 2, 0, 1, 2, 3]] # example 2, positive offset value data2 = paddle.tril_indices(4,4,2) print(data2) - # [[0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], + # [[0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], # [0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]] # example 3, negative offset value diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 55726831d2e..99ab768e476 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -47,8 +47,8 @@ def parse_op_labels(labelstr, operand): Returns ------- - the input operand's full label string in which all anonymous dimensions are - labeled in dots. + the input operand's full label string in which all anonymous dimensions are + labeled in dots. ''' # Sanity checks for c in labelstr.replace('.', ''): @@ -74,14 +74,14 @@ def parse_op_labels(labelstr, operand): def parse_labels(labelstr, operands): ''' Parse label strings for all input operands. - + Parameters ---------- labelstr: The equation's label string operands: The input operands - + Returns ------- list of full label strings for all input operands @@ -97,7 +97,7 @@ def parse_labels(labelstr, operands): def validate_rhs(rhs, input_labels, n_bcast_dims): ''' - Check whether the equation's right hand side is valid + Check whether the equation's right hand side is valid ''' # Sanity check. if n_bcast_dims > 0: @@ -122,8 +122,8 @@ def validate_rhs(rhs, input_labels, n_bcast_dims): def build_view(in_labels, out_labels): ''' - Build an inverse map of dimension indices. Three conditions must hold for - the result to be meaningful. + Build an inverse map of dimension indices. Three conditions must hold for + the result to be meaningful. First, no duplicate letter labels in each label string. Second, the number of dots in dimout_labels >= that in in_labels. Third, dots are contiguous in each label string. @@ -134,7 +134,7 @@ def build_view(in_labels, out_labels): The dimension labels to map to out_labels: The dimension labels to map from - + Returns ------- The inverse map from out_labels to in_labels. The length of the inverse map equals that of @@ -181,7 +181,7 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): plus an index table that maps from the layout to the dimensions in each operand. In the global view, the dimensions are arranged such that output ones are put on the left and contraction ones - are put on the right. + are put on the right. Parameters ---------- @@ -191,7 +191,7 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): The equation right hand side n_bcast_dims: The maxium number of broadcast dimensions - + Returns ------- A tuple of g_labels, g_view, g_nout, g_count @@ -237,7 +237,7 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): def build_global_shape(g_view, g_labels, op_shapes): ''' - The global shape is the shape of all dimensions rearranged and broadcasting + The global shape is the shape of all dimensions rearranged and broadcasting to the global view. It's a reference data structure for einsum planning. Parameters @@ -287,14 +287,14 @@ def has_duplicated_labels(labels): def diagonalize(labels, operand): ''' - Merges dimensions with duplicate labels. - + Merges dimensions with duplicate labels. 
+ For those dimensions with duplicate labels, merge them into one dimension which represents the diagonal elements. This requires the dimensions with duplicate labels are equal sized. - + Examples - -------- + -------- 'ijj...i' would be merged into 'ij...' ''' assert not has_duplicated_labels(labels), ( @@ -707,9 +707,9 @@ def preprocess(equation, *operands): def parse_fake_shape(equation, operands, labels): - """ + """ this shape is just used for operands planning. may differ with the original shape. - for example: + for example: ... is replaced by 1 -1 is replaced by 1 Results @@ -745,7 +745,7 @@ def rhs_inference(lhs): def gen_equation_for_opteinsum(lhs, rhs): - """ + """ 1. gen rhs if rhs is None 2. '...' -> 'A' """ @@ -768,7 +768,7 @@ def gen_equation_for_opteinsum(lhs, rhs): def einsum_v2(equation, *operands): - """ + """ einsum v2 implementation. 1. Implement C++ EinsumOp. 2. V2 create the EinsumOp to calculate, so just a little verifty work in python. @@ -798,8 +798,8 @@ def einsum_v2(equation, *operands): def gen_einsum_op(equation, *operands): - """ - EinsumOp Python Interface: + """ + EinsumOp Python Interface: """ assert len(operands) <= 2, "Only support two operands in EinsumOp." if in_dygraph_mode(): @@ -862,7 +862,7 @@ def einsum(equation, *operands): - for many operads - broadcasting multiply - chained matrix multiply - + **The summation notation** - The tensor dimensions are labeled using uncased English letters. E.g., `ijk` @@ -870,7 +870,7 @@ def einsum(equation, *operands): - The equation is `,` separated into terms, each being a distinct input's dimension label string. - Ellipsis `...` enables broadcasting by automatically converting the unlabeled - dimensions into broadcasting dimensions. + dimensions into broadcasting dimensions. - Singular labels are called free labels, duplicate are dummy labels. Dummy labeled dimensions will be reduced and removed in the output. - Output labels can be explicitly specified on the right hand side of `->` or omitted. @@ -891,7 +891,7 @@ def einsum(equation, *operands): - Examples - '...ij, ...jk', where i and k are free labels, j is dummy. The output label string is '...ik' - - 'ij -> i', where i is a free label and j is a dummy label. + - 'ij -> i', where i is a free label and j is a dummy label. - '...ij, ...jk -> ...ijk', where i, j and k are all free labels. - '...ij, ...jk -> ij', an invalid equation since `...` is not present for the output. @@ -910,7 +910,7 @@ def einsum(equation, *operands): **On trace and diagonal** - The trace and diagonal are planned yet unimplemented features. + The trace and diagonal are planned yet unimplemented features. Args: equation (`str`): @@ -918,10 +918,10 @@ def einsum(equation, *operands): operands (`list|Tensor`): The input tensors over which to compute the Einstein summation. The number of operands should equal the number of input terms in the equation. - + Returns: result (`Tensor`): the result tensor. - + Examples: .. 
code-block:: python @@ -939,7 +939,7 @@ def einsum(equation, *operands): print(paddle.einsum('i,i->', x, x)) # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # [1.45936954]) - + # outer print(paddle.einsum("i,j->ij", x, y)) # Tensor(shape=[4, 5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, @@ -947,10 +947,10 @@ def einsum(equation, *operands): # [0.23455200, 0.35519385, 0.40186870, 0.54970956, 0.56441545], # [0.11773264, 0.17828843, 0.20171674, 0.27592498, 0.28330654], # [0.32897076, 0.49817693, 0.56364071, 0.77099484, 0.79162055]]) - + A = paddle.rand([2, 3, 2]) B = paddle.rand([2, 2, 3]) - + # transpose print(paddle.einsum('ijk->kji', A)) # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, @@ -961,7 +961,7 @@ def einsum(equation, *operands): # [[0.07637714, 0.29374704], # [0.51470858, 0.51907635], # [0.99066722, 0.55802226]]]) - + # batch matrix multiplication print(paddle.einsum('ijk, ikl->ijl', A,B)) # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, @@ -972,7 +972,7 @@ def einsum(equation, *operands): # [[0.32043904, 0.18164253, 0.27810261], # [0.50226176, 0.24512935, 0.39881429], # [0.51476848, 0.23367381, 0.39229113]]]) - + # Ellipsis transpose print(paddle.einsum('...jk->...kj', A)) # Tensor(shape=[2, 2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, @@ -981,7 +981,7 @@ def einsum(equation, *operands): # # [[0.49684682, 0.46258664, 0.33383518], # [0.29374704, 0.51907635, 0.55802226]]]) - + # Ellipsis batch matrix multiplication print(paddle.einsum('...jk, ...kl->...jl', A,B)) # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index f9c32c3254b..7ef08f81872 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -384,7 +384,7 @@ def templatedoc(op_type=None): def add_sample_code(func, sample_code): """ - Append sample code for dynamically generated functions. + Append sample code for dynamically generated functions. Args: func: The function of the function to be append sample code to. diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 700c6c340dc..b7dd412fb08 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1093,7 +1093,7 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None): Estimate the covariance matrix of the input variables, given data and weights. A covariance matrix is a square matrix, indicate the covariance of each pair variables in the input matrix. - For example, for an N-dimensional samples X=[x1,x2,…xN]T, then the covariance matrix + For example, for an N-dimensional samples X=[x1,x2,…xN]T, then the covariance matrix element Cij is the covariance of xi and xj. The element Cii is the variance of xi itself. Parameters: @@ -1219,11 +1219,11 @@ def t(input, name=None): .. code-block:: python :name: code-example import paddle - + # Example 1 (0-D tensor) x = paddle.to_tensor([0.79]) paddle.t(x) # [0.79] - + # Example 2 (1-D tensor) x = paddle.to_tensor([0.79, 0.84, 0.32]) paddle.t(x) # [0.79000002, 0.83999997, 0.31999999] @@ -1621,7 +1621,7 @@ def histogram(input, bins=100, min=0, max=0, name=None): def bincount(x, weights=None, minlength=0, name=None): """ - Computes frequency of each value in the input tensor. + Computes frequency of each value in the input tensor. 
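A minimal sketch of ``bincount`` on a 1-D non-negative integer tensor, counting how many times each value occurs:

.. code-block:: python

    import paddle

    x = paddle.to_tensor([1, 2, 1, 4, 5])
    # index i of the result holds the number of occurrences of value i
    result = paddle.bincount(x)
    print(result)  # [0, 2, 1, 0, 1, 1]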
Args: x (Tensor): A Tensor with non-negative integer. Should be 1-D tensor. @@ -2003,26 +2003,26 @@ def qr(x, mode="reduced", name=None): Args: x (Tensor): The input tensor. Its shape should be `[..., M, N]`, where ... is zero or more batch dimensions. M and N can be arbitrary - positive number. The data type of x should be float32 or float64. - mode (str, optional): A flag to control the behavior of qr, the default is "reduced". + positive number. The data type of x should be float32 or float64. + mode (str, optional): A flag to control the behavior of qr, the default is "reduced". Suppose x's shape is `[..., M, N]` and denoting `K = min(M, N)`: - If mode = "reduced", qr op will return reduced Q and R matrices, + If mode = "reduced", qr op will return reduced Q and R matrices, which means Q's shape is `[..., M, K]` and R's shape is `[..., K, N]`. - If mode = "complete", qr op will return complete Q and R matrices, + If mode = "complete", qr op will return complete Q and R matrices, which means Q's shape is `[..., M, M]` and R's shape is `[..., M, N]`. If mode = "r", qr op will only return reduced R matrix, which means R's shape is `[..., K, N]`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: - If mode = "reduced" or mode = "complete", qr will return a two tensor-tuple, which represents Q and R. + If mode = "reduced" or mode = "complete", qr will return a two tensor-tuple, which represents Q and R. If mode = "r", qr will return a tensor which represents R. - - Examples: + + Examples: .. code-block:: python - import paddle + import paddle x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') q, r = paddle.linalg.qr(x) @@ -2035,8 +2035,8 @@ def qr(x, mode="reduced", name=None): # R = [[-5.91607978, -7.43735744], # [ 0. , 0.82807867]]) - - # one can verify : X = Q * R ; + + # one can verify : X = Q * R ; """ if in_dygraph_mode(): q, r = _C_ops.qr(x, mode) @@ -2072,9 +2072,9 @@ def qr(x, mode="reduced", name=None): def lu(x, pivot=True, get_infos=False, name=None): r""" - Computes the LU factorization of an N-D(N>=2) matrix x. + Computes the LU factorization of an N-D(N>=2) matrix x. - Returns the LU factorization(inplace x) and Pivots. low triangular matrix L and + Returns the LU factorization(inplace x) and Pivots. low triangular matrix L and upper triangular matrix U are combined to a single LU matrix. Pivoting is done if pivot is set to True. @@ -2094,23 +2094,23 @@ def lu(x, pivot=True, get_infos=False, name=None): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: factorization (Tensor): LU matrix, the factorization of input X. - pivots (IntTensor): the pivots of size(∗(N-2), min(m,n)). `pivots` stores all the - intermediate transpositions of rows. The final permutation `perm` could be + pivots (IntTensor): the pivots of size(∗(N-2), min(m,n)). `pivots` stores all the + intermediate transpositions of rows. The final permutation `perm` could be reconstructed by this, details refer to upper example. 
- infos (IntTensor, optional): if `get_infos` is `True`, this is a tensor of size (∗(N-2)) - where non-zero values indicate whether factorization for the matrix or each minibatch + infos (IntTensor, optional): if `get_infos` is `True`, this is a tensor of size (∗(N-2)) + where non-zero values indicate whether factorization for the matrix or each minibatch has succeeded or failed. - - Examples: + + Examples: .. code-block:: python - import paddle + import paddle x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') lu,p,info = paddle.linalg.lu(x, get_infos=True) @@ -2126,26 +2126,26 @@ def lu(x, pivot=True, get_infos=False, name=None): # >>> info # Tensor(shape=[], dtype=int32, place=CUDAPlace(0), stop_gradient=True, # 0) - + P,L,U = paddle.linalg.lu_unpack(lu,p) # >>> P # (Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[0., 1., 0.], # [0., 0., 1.], - # [1., 0., 0.]]), + # [1., 0., 0.]]), # >>> L # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[1. , 0. ], # [0.20000000, 1. ], - # [0.60000000, 0.50000000]]), + # [0.60000000, 0.50000000]]), # >>> U # Tensor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[5. , 6. ], # [0. , 0.80000000]])) - - # one can verify : X = P @ L @ U ; + + # one can verify : X = P @ L @ U ; """ if in_dygraph_mode(): @@ -2176,7 +2176,7 @@ def lu(x, pivot=True, get_infos=False, name=None): def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): r""" - Unpack L U and P to single matrix tensor . + Unpack L U and P to single matrix tensor . unpack L and U matrix from LU, unpack permutation matrix P from Pivtos . P mat can be get by pivots: @@ -2196,7 +2196,7 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: P (Tensor): Permutation matrix P of lu factorization. @@ -2204,11 +2204,11 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): U (Tensor): The upper triangular matrix tensor of lu factorization. - - Examples: + + Examples: .. code-block:: python - import paddle + import paddle x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') lu,p,info = paddle.linalg.lu(x, get_infos=True) @@ -2224,25 +2224,25 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): # >>> info # Tensor(shape=[], dtype=int32, place=CUDAPlace(0), stop_gradient=True, # 0) - + P,L,U = paddle.linalg.lu_unpack(lu,p) # >>> P # (Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[0., 1., 0.], # [0., 0., 1.], - # [1., 0., 0.]]), + # [1., 0., 0.]]), # >>> L # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[1. , 0. ], # [0.20000000, 1. ], - # [0.60000000, 0.50000000]]), + # [0.60000000, 0.50000000]]), # >>> U # Tensor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[5. , 6. ], # [0. , 0.80000000]])) - # one can verify : X = P @ L @ U ; + # one can verify : X = P @ L @ U ; """ if in_dygraph_mode(): @@ -2291,7 +2291,7 @@ def eig(x, name=None): Args: x (Tensor): A tensor with shape math:`[*, N, N]`, The data type of the x should be one of ``float32``, ``float64``, ``compplex64`` or ``complex128``. - name (str, optional): The default value is `None`. Normally there is no need for user to set + name (str, optional): The default value is `None`. 
Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -2360,7 +2360,7 @@ def eigvals(x, name=None): Its data type should be float32, float64, complex64, or complex128. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: A tensor containing the unsorted eigenvalues which has the same batch dimensions with `x`. The eigenvalues are complex-valued even when `x` is real. @@ -2995,12 +2995,12 @@ def triangular_solve(x, Args: x (Tensor): The input triangular coefficient matrix. Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. - y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is + y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. - upper (bool, optional): Whether to solve the upper-triangular system of equations (default) or the lower-triangular + upper (bool, optional): Whether to solve the upper-triangular system of equations (default) or the lower-triangular system of equations. Default: True. transpose (bool, optional): whether `x` should be transposed before calculation. Default: False. - unitriangular (bool, optional): whether `x` is unit triangular. If True, the diagonal elements of `x` are assumed + unitriangular (bool, optional): whether `x` is unit triangular. If True, the diagonal elements of `x` are assumed to be 1 and not referenced from `x` . Default: False. name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -3019,7 +3019,7 @@ def triangular_solve(x, import paddle import numpy as np - x = paddle.to_tensor([[1, 1, 1], + x = paddle.to_tensor([[1, 1, 1], [0, 2, 1], [0, 0,-1]], dtype="float64") y = paddle.to_tensor([[0], [-9], [5]], dtype="float64") @@ -3066,7 +3066,7 @@ def cholesky_solve(x, y, upper=False, name=None): Args: x (Tensor): The input matrix which is upper or lower triangular Cholesky factor of square matrix A. Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. - y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is + y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. upper (bool, optional): whether to consider the Cholesky factor as a lower or upper triangular matrix. Default: False. name(str, optional): Name for the operation (optional, default is None). @@ -3080,7 +3080,7 @@ def cholesky_solve(x, y, upper=False, name=None): import paddle - u = paddle.to_tensor([[1, 1, 1], + u = paddle.to_tensor([[1, 1, 1], [0, 2, 1], [0, 0,-1]], dtype="float64") b = paddle.to_tensor([[0], [-9], [5]], dtype="float64") @@ -3112,7 +3112,7 @@ def cholesky_solve(x, y, upper=False, name=None): def eigvalsh(x, UPLO='L', name=None): """ - Computes the eigenvalues of a + Computes the eigenvalues of a complex Hermitian (conjugate symmetric) or a real symmetric matrix. 
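A small sketch of ``eigvalsh`` on a real symmetric matrix, where the returned eigenvalues are real-valued:

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[1.0, -2.0], [-2.0, 5.0]], dtype='float64')
    # x is real symmetric, so only its lower triangle (UPLO='L') is read
    out = paddle.linalg.eigvalsh(x, UPLO='L')
    print(out)
    # approximately [0.17157288, 5.82842712]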
Args: @@ -3192,26 +3192,26 @@ def lstsq(x, y, rcond=None, driver=None, name=None): Args: x (Tensor): A tensor with shape ``(*, M, N)`` , the data type of the input Tensor ``x`` should be one of float32, float64. - y (Tensor): A tensor with shape ``(*, M, K)`` , the data type of the input Tensor ``y`` + y (Tensor): A tensor with shape ``(*, M, K)`` , the data type of the input Tensor ``y`` should be one of float32, float64. - rcond(float, optional): The default value is None. A float pointing number used to determine - the effective rank of ``x``. If ``rcond`` is None, it will be set to max(M, N) times the + rcond(float, optional): The default value is None. A float pointing number used to determine + the effective rank of ``x``. If ``rcond`` is None, it will be set to max(M, N) times the machine precision of x_dtype. - driver(str, optional): The default value is None. The name of LAPACK method to be used. For - CPU inputs the valid values are ‘gels’, ‘gelsy’, ‘gelsd, ‘gelss’. For CUDA input, the only - valid driver is ‘gels’. If ``driver`` is None, ‘gelsy’ is used for CPU inputs and ‘gels’ + driver(str, optional): The default value is None. The name of LAPACK method to be used. For + CPU inputs the valid values are ‘gels’, ‘gelsy’, ‘gelsd, ‘gelss’. For CUDA input, the only + valid driver is ‘gels’. If ``driver`` is None, ‘gelsy’ is used for CPU inputs and ‘gels’ for CUDA inputs. - name(str, optional): The default value is None. Normally there is no need for user to set + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Tuple: A tuple of 4 Tensors which is (``solution``, ``residuals``, ``rank``, ``singular_values``). - ``solution`` is a tensor with shape ``(*, N, K)``, meaning the least squares solution. ``residuals`` - is a tensor with shape ``(*, K)``, meaning the squared residuals of the solutions, which is computed - when M > N and every matrix in ``x`` is full-rank, otherwise return an empty tensor. ``rank`` is a tensor - with shape ``(*)``, meaning the ranks of the matrices in ``x``, which is computed when ``driver`` in - (‘gelsy’, ‘gelsd’, ‘gelss’), otherwise return an empty tensor. ``singular_values`` is a tensor with - shape ``(*, min(M, N))``, meaning singular values of the matrices in ``x``, which is computed when + Tuple: A tuple of 4 Tensors which is (``solution``, ``residuals``, ``rank``, ``singular_values``). + ``solution`` is a tensor with shape ``(*, N, K)``, meaning the least squares solution. ``residuals`` + is a tensor with shape ``(*, K)``, meaning the squared residuals of the solutions, which is computed + when M > N and every matrix in ``x`` is full-rank, otherwise return an empty tensor. ``rank`` is a tensor + with shape ``(*)``, meaning the ranks of the matrices in ``x``, which is computed when ``driver`` in + (‘gelsy’, ‘gelsd’, ‘gelss’), otherwise return an empty tensor. ``singular_values`` is a tensor with + shape ``(*, min(M, N))``, meaning singular values of the matrices in ``x``, which is computed when ``driver`` in (‘gelsd’, ‘gelss’), otherwise return an empty tensor. Examples: @@ -3328,7 +3328,7 @@ def lstsq(x, y, rcond=None, driver=None, name=None): def corrcoef(x, rowvar=True, name=None): """ - + A correlation coefficient matrix indicate the correlation of each pair variables in the input matrix. 
For example, for an N-dimensional samples X=[x1,x2,…xN]T, then the correlation coefficient matrix element Rij is the correlation of xi and xj. The element Rii is the covariance of xi itself. diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 63a89327505..3eb142f13cf 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -136,7 +136,7 @@ def logical_or(x, y, out=None, name=None): .. note:: ``paddle.logical_or`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. - + Args: x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. @@ -305,7 +305,7 @@ def equal_all(x, y, name=None): """ Returns the truth value of :math:`x == y`. True if two inputs have the same elements, False otherwise. - Note: + Note: The output has no gradient. Args: @@ -428,7 +428,7 @@ def equal(x, y, name=None): This layer returns the truth value of :math:`x == y` elementwise. - Note: + Note: The output has no gradient. Args: @@ -439,7 +439,7 @@ def equal(x, y, name=None): Returns: Tensor: output Tensor, it's shape is the same as the input's Tensor, - and the data type is bool. The result of this op is stop_gradient. + and the data type is bool. The result of this op is stop_gradient. Examples: .. code-block:: python @@ -489,7 +489,7 @@ def greater_equal(x, y, name=None): """ Returns the truth value of :math:`x >= y` elementwise, which is equivalent function to the overloaded operator `>=`. - Note: + Note: The output has no gradient. Args: @@ -541,7 +541,7 @@ def greater_than(x, y, name=None): """ Returns the truth value of :math:`x > y` elementwise, which is equivalent function to the overloaded operator `>`. - Note: + Note: The output has no gradient. Args: @@ -592,7 +592,7 @@ def less_equal(x, y, name=None): """ Returns the truth value of :math:`x <= y` elementwise, which is equivalent function to the overloaded operator `<=`. - Note: + Note: The output has no gradient. Args: @@ -645,7 +645,7 @@ def less_than(x, y, name=None): """ Returns the truth value of :math:`x < y` elementwise, which is equivalent function to the overloaded operator `<`. - Note: + Note: The output has no gradient. Args: @@ -697,8 +697,8 @@ def less_than(x, y, name=None): def not_equal(x, y, name=None): """ Returns the truth value of :math:`x != y` elementwise, which is equivalent function to the overloaded operator `!=`. - - Note: + + Note: The output has no gradient. Args: @@ -769,7 +769,7 @@ def is_tensor(x): input3 = [1, 4] check = paddle.is_tensor(input3) print(check) #False - + """ return isinstance(x, (Tensor, paddle.fluid.core.eager.Tensor)) @@ -821,7 +821,7 @@ def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True): def bitwise_and(x, y, out=None, name=None): """ ${comment} - + Args: x (Tensor): ${x_comment} y (Tensor): ${y_comment} @@ -829,7 +829,7 @@ def bitwise_and(x, y, out=None, name=None): Returns: Tensor: ${out_comment} - + Examples: .. 
code-block:: python @@ -853,7 +853,7 @@ def bitwise_and(x, y, out=None, name=None): def bitwise_or(x, y, out=None, name=None): """ ${comment} - + Args: x (Tensor): ${x_comment} y (Tensor): ${y_comment} @@ -922,7 +922,7 @@ def bitwise_not(x, out=None, name=None): Args: x(Tensor): ${x_comment} out(Tensor): ${out_comment} - + Returns: Tensor: ${out_comment} diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index bc0e7877aea..126a42173a1 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -128,7 +128,7 @@ def slice(input, axes, starts, ends): ends = [-1, 1000] # -1 denotes the reverse 0th position of dimension 0. Then: result = [ [2, 3, 4], ] # result = data[0:1, 1:4] - + Args: input (Tensor): A ``Tensor`` . The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``. axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to . @@ -505,7 +505,7 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): For each value `v` in `input`, we reset it to a new value according to the following formula: :: - + v = v - shard_id * shard_size if shard_id * shard_size <= v < (shard_id+1) * shard_size else ignore_value That is, the value `v` is set to the new offset within the range represented by the shard `shard_id` @@ -820,16 +820,16 @@ def fill_diagonal_(x, value, offset=0, wrap=False, name=None): """ Note: This API is ONLY available in Dygraph mode. - + This function fill the value into the x Tensor's diagonal inplace. - + Args: x(Tensor): ``x`` is the original Tensor value(Scale): ``value`` is the value to filled in x offset(int,optional): the offset to the main diagonal. Default: 0 (main diagonal). wrap(bool,optional): the diagonal 'wrapped' after N columns for tall matrices. name(str,optional): Name for the operation (optional, default is None) - + Returns: Tensor: Tensor with diagonal filled with value. @@ -1017,7 +1017,7 @@ def concat(x, axis=0, name=None): x (list|tuple): ``x`` is a Tensor list or Tensor tuple which is with data type bool, float16, float32, float64, int32, int64, int8, uint8. All the Tensors in ``x`` must have same data type. axis (int|Tensor, optional): Specify the axis to operate on the input Tensors. - It's a scalar with data type int or a Tensor with shape [1] and data type int32 + It's a scalar with data type int or a Tensor with shape [1] and data type int32 or int64. The effective range is [-R, R), where R is Rank(x). When ``axis < 0``, it works the same way as ``axis+R``. Default is 0. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -1027,9 +1027,9 @@ def concat(x, axis=0, name=None): Examples: .. code-block:: python - + import paddle - + x1 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) x2 = paddle.to_tensor([[11, 12, 13], @@ -1303,23 +1303,23 @@ def rot90(x, k=1, axes=[0, 1], name=None): data = paddle.arange(4) data = paddle.reshape(data, (2, 2)) - print(data) + print(data) #[[0, 1], # [2, 3]] y = paddle.rot90(data, 1, [0, 1]) - print(y) + print(y) #[[1, 3], # [0, 2]] y= paddle.rot90(data, -1, [0, 1]) - print(y) + print(y) #[[2, 0], # [3, 1]] data2 = paddle.arange(8) data2 = paddle.reshape(data2, (2,2,2)) - print(data2) + print(data2) #[[[0, 1], # [2, 3]], # [[4, 5], @@ -1385,7 +1385,7 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): Flattens a contiguous range of axes in a tensor according to start_axis and stop_axis. 
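(Illustrative aside, not part of the diff: a worked example of the ``shard_index`` formula quoted above, assuming the top-level ``paddle.shard_index`` entry point; the label values are made up.)

.. code-block:: python

    # Sketch only: index_num=20 split across nshards=2 gives shard_size=10.
    # On shard 1, labels in [10, 20) are rebased to [0, 10); all other
    # labels become ignore_value (-1 by default).
    import paddle

    labels = paddle.to_tensor([[4], [16]], dtype="int64")
    shard1 = paddle.shard_index(labels, index_num=20, nshards=2, shard_id=1)
    print(shard1)  # expected: [[-1], [6]]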
Note: - The output Tensor will share data with origin Tensor and doesn't have a Tensor copy in ``dygraph`` mode. + The output Tensor will share data with origin Tensor and doesn't have a Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, please use `Tensor.clone` like ``flatten_clone_x = x.flatten().clone()``. For Example: @@ -1535,9 +1535,9 @@ def flatten_(x, start_axis=0, stop_axis=-1, name=None): def roll(x, shifts, axis=None, name=None): """ - Roll the `x` tensor along the given axis(axes). With specific 'shifts', Elements that - roll beyond the last position are re-introduced at the first according to 'shifts'. - If a axis is not specified, + Roll the `x` tensor along the given axis(axes). With specific 'shifts', Elements that + roll beyond the last position are re-introduced at the first according to 'shifts'. + If a axis is not specified, the tensor will be flattened before rolling and then restored to the original shape. Args: @@ -1554,7 +1554,7 @@ def roll(x, shifts, axis=None, name=None): Examples: .. code-block:: python - + import paddle x = paddle.to_tensor([[1.0, 2.0, 3.0], @@ -1625,13 +1625,13 @@ def roll(x, shifts, axis=None, name=None): def stack(x, axis=0, name=None): """ - Stacks all the input tensors ``x`` along ``axis`` dimemsion. + Stacks all the input tensors ``x`` along ``axis`` dimemsion. All tensors must be of the same shape and same dtype. - - For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of stacked - tensor is [N, A, B]; if ``axis == 1``, the shape of stacked + + For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of stacked + tensor is [N, A, B]; if ``axis == 1``, the shape of stacked tensor is [A, N, B], etc. - + .. code-block:: text @@ -1679,29 +1679,29 @@ def stack(x, axis=0, name=None): x (list[Tensor]|tuple[Tensor]): Input ``x`` can be a ``list`` or ``tuple`` of tensors, the Tensors in ``x`` must be of the same shape and dtype. Supported data types: float32, float64, int32, int64. axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``, - where ``R`` is the number of dimensions of the first input tensor ``x[0]``. + where ``R`` is the number of dimensions of the first input tensor ``x[0]``. If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: The stacked tensor with same data type as input. - Example: + Example: .. code-block:: python import paddle - + x1 = paddle.to_tensor([[1.0, 2.0]]) x2 = paddle.to_tensor([[3.0, 4.0]]) x3 = paddle.to_tensor([[5.0, 6.0]]) - + out = paddle.stack([x1, x2, x3], axis=0) print(out.shape) # [3, 1, 2] print(out) # [[[1., 2.]], # [[3., 4.]], # [[5., 6.]]] - + out = paddle.stack([x1, x2, x3], axis=-2) print(out.shape) # [1, 3, 2] print(out) @@ -1763,27 +1763,27 @@ def stack(x, axis=0, name=None): def split(x, num_or_sections, axis=0, name=None): """ Split the input tensor into multiple sub-Tensors. - + Args: x (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, uint8, int8, int32 or int64. - num_or_sections (int|list|tuple): If ``num_or_sections`` is an int, then ``num_or_sections`` + num_or_sections (int|list|tuple): If ``num_or_sections`` is an int, then ``num_or_sections`` indicates the number of equal sized sub-Tensors that the ``x`` will be divided into. 
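(Illustrative aside, not part of the diff: a minimal sketch of the ``roll`` behaviour described above when ``axis`` is not given; the values are illustrative only.)

.. code-block:: python

    # Sketch only: with axis unspecified, the tensor is flattened,
    # shifted by `shifts`, then restored to its original shape.
    import paddle

    x = paddle.to_tensor([[1.0, 2.0, 3.0],
                          [4.0, 5.0, 6.0]])
    print(paddle.roll(x, shifts=1))
    # expected:
    # [[6., 1., 2.],
    #  [3., 4., 5.]]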
If ``num_or_sections`` is a list or tuple, the length of it indicates the number of sub-Tensors and the elements in it indicate the sizes of sub-Tensors' dimension orderly. The length of the list must not be larger than the ``x`` 's size of specified ``axis``. - axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type + axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type ``int`` or a ``Tensor`` with shape [1] and data type ``int32`` or ``int64``. If :math::`axis < 0`, the axis to split along is :math:`rank(x) + axis`. Default is 0. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Returns: list(Tensor): The list of segmented Tensors. - + Example: .. code-block:: python - + import paddle - + # x is a Tensor of shape [3, 9, 5] x = paddle.rand([3, 9, 5]) @@ -1801,7 +1801,7 @@ def split(x, num_or_sections, axis=0, name=None): print(out0.shape) # [3, 2, 5] print(out1.shape) # [3, 3, 5] print(out2.shape) # [3, 4, 5] - + # axis is negative, the real axis is (rank(x) + axis)=1 out0, out1, out2 = paddle.split(x, num_or_sections=3, axis=-2) print(out0.shape) # [3, 3, 5] @@ -1929,10 +1929,10 @@ def split(x, num_or_sections, axis=0, name=None): def vsplit(x, num_or_sections, name=None): """ Split the input tensor into multiple sub-Tensors along the vertical axis, which is equivalent to ``paddle.split`` with ``axis=0``. - + Args: x (Tensor): A Tensor whose dimension must be greater than 1. The data type is bool, float16, float32, float64, uint8, int8, int32 or int64. - num_or_sections (int|list|tuple): If ``num_or_sections`` is an int, then ``num_or_sections`` + num_or_sections (int|list|tuple): If ``num_or_sections`` is an int, then ``num_or_sections`` indicates the number of equal sized sub-Tensors that the ``x`` will be divided into. If ``num_or_sections`` is a list or tuple, the length of it indicates the number of sub-Tensors and the elements in it indicate the sizes of sub-Tensors' dimension orderly. @@ -1941,12 +1941,12 @@ def vsplit(x, num_or_sections, name=None): For more information, please refer to :ref:`api_guide_Name` . Returns: list[Tensor], The list of segmented Tensors. - + Example: .. code-block:: python - + import paddle - + # x is a Tensor of shape [8, 6, 7] x = paddle.rand([8, 6, 7]) out0, out1, out2 = paddle.vsplit(x, num_or_sections=2) @@ -1970,14 +1970,14 @@ def vsplit(x, num_or_sections, name=None): def squeeze(x, axis=None, name=None): """ - Squeeze the dimension(s) of size 1 of input tensor x's shape. - - Note that the output Tensor will share data with origin Tensor and doesn't have a - Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, + Squeeze the dimension(s) of size 1 of input tensor x's shape. + + Note that the output Tensor will share data with origin Tensor and doesn't have a + Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, please use `Tensor.clone` like ``squeeze_clone_x = x.squeeze().clone()``. - If axis is provided, it will remove the dimension(s) by given axis that of size 1. - If the dimension of given axis is not of size 1, the dimension remain unchanged. + If axis is provided, it will remove the dimension(s) by given axis that of size 1. + If the dimension of given axis is not of size 1, the dimension remain unchanged. If axis is not provided, all dims equal of size 1 will be removed. .. 
code-block:: text @@ -1997,11 +1997,11 @@ def squeeze(x, axis=None, name=None): axis = 0 Output: out.shape = [3, 1, 5] - + Case4: Input: - x.shape = [1, 3, 1, 5] # If the dimension of one given axis (3) is not of size 1, the dimension remain unchanged. + x.shape = [1, 3, 1, 5] # If the dimension of one given axis (3) is not of size 1, the dimension remain unchanged. axis = [0, 2, 3] Output: out.shape = [3, 5] @@ -2009,7 +2009,7 @@ def squeeze(x, axis=None, name=None): Case4: Input: - x.shape = [1, 3, 1, 5] # If axis is negative, axis = axis + ndim (number of dimensions in x). + x.shape = [1, 3, 1, 5] # If axis is negative, axis = axis + ndim (number of dimensions in x). axis = [-2] Output: out.shape = [1, 3, 5] @@ -2029,7 +2029,7 @@ def squeeze(x, axis=None, name=None): .. code-block:: python import paddle - + x = paddle.rand([5, 1, 10]) output = paddle.squeeze(x, axis=1) @@ -2139,21 +2139,21 @@ def unique_consecutive(x, Example: .. code-block:: python - import paddle + import paddle x = paddle.to_tensor([1, 1, 2, 2, 3, 1, 1, 2]) - output = paddle.unique_consecutive(x) # + output = paddle.unique_consecutive(x) # np_output = output.numpy() # [1 2 3 1 2] _, inverse, counts = paddle.unique_consecutive(x, return_inverse=True, return_counts=True) np_inverse = inverse.numpy() # [0 0 1 1 2 3 3 4] np_counts = inverse.numpy() # [2 2 1 2 1] x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3], [2, 1, 3]]) - output = paddle.unique_consecutive(x, axis=0) # + output = paddle.unique_consecutive(x, axis=0) # np_output = output.numpy() # [2 1 3 0 1 2 1 3 2 1 3] x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3], [2, 1, 3]]) - output = paddle.unique_consecutive(x, axis=0) # + output = paddle.unique_consecutive(x, axis=0) # np_output = output.numpy() # [[2 1 3] # [3 0 1] @@ -2248,7 +2248,7 @@ def unique(x, name(str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default: None. - Returns: + Returns: tuple (out, indices, inverse, counts). `out` is the unique tensor for `x`. `indices` is \ provided only if `return_index` is True. `inverse` is provided only if `return_inverse` \ is True. `counts` is provided only if `return_counts` is True. @@ -2271,7 +2271,7 @@ def unique(x, np_unique = unique.numpy() # [0 1 2 3] unique = paddle.unique(x, axis=0) - np_unique = unique.numpy() + np_unique = unique.numpy() # [[2 1 3] # [3 0 1]] """ @@ -2360,14 +2360,14 @@ def unsqueeze(x, axis, name=None): required argument axis, a dimension or list of dimensions that will be inserted. Dimension indices in axis are as seen in the output tensor. - Note that the output Tensor will share data with origin Tensor and doesn't have a - Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, + Note that the output Tensor will share data with origin Tensor and doesn't have a + Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, please use `Tensor.clone` like ``unsqueeze_clone_x = x.unsqueeze(-1).clone()``. Args: x (Tensor): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64. - axis (int|list|tuple|Tensor): Indicates the dimensions to be inserted. The data type is ``int32`` . - If ``axis`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. + axis (int|list|tuple|Tensor): Indicates the dimensions to be inserted. The data type is ``int32`` . + If ``axis`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. 
If ``axis`` is a Tensor, it should be an 1-D Tensor . If ``axis`` is negative, ``axis = axis + ndim(x) + 1``. name (str|None): Name for this layer. Please refer to :ref:`api_guide_Name`, Default None. @@ -2382,15 +2382,15 @@ def unsqueeze(x, axis, name=None): x = paddle.rand([5, 10]) print(x.shape) # [5, 10] - + out1 = paddle.unsqueeze(x, axis=0) print(out1.shape) # [1, 5, 10] - - out2 = paddle.unsqueeze(x, axis=[0, 2]) + + out2 = paddle.unsqueeze(x, axis=[0, 2]) print(out2.shape) # [1, 5, 1, 10] axis = paddle.to_tensor([0, 1, 2]) - out3 = paddle.unsqueeze(x, axis=axis) + out3 = paddle.unsqueeze(x, axis=axis) print(out3.shape) # [1, 1, 1, 5, 10] # out1, out2, out3 share data with x in dygraph mode @@ -2398,7 +2398,7 @@ def unsqueeze(x, axis, name=None): print(out1[0, 0, 0]) # [10.] print(out2[0, 0, 0, 0]) # [10.] print(out3[0, 0, 0, 0, 0]) # [10.] - + """ input = x axes = axis @@ -2501,7 +2501,7 @@ def gather(x, index, axis=None, name=None): Then: out = [[3, 4], - [5, 6]] + [5, 6]] Args: x (Tensor): The source input tensor with rank>=1. Supported data type is @@ -2514,7 +2514,7 @@ def gather(x, index, axis=None, name=None): Returns: output (Tensor): The output is a tensor with the same rank as ``x``. - + Examples: .. code-block:: python @@ -2579,7 +2579,7 @@ def unbind(input, axis=0): Args: input (Tensor): The input variable which is an N-D Tensor, data type being float32, float64, int32 or int64. - axis (int32|int64, optional): A scalar with type ``int32|int64`` shape [1]. The dimension along which to unbind. + axis (int32|int64, optional): A scalar with type ``int32|int64`` shape [1]. The dimension along which to unbind. If :math:`axis < 0`, the dimension to unbind along is :math:`rank(input) + axis`. Default is 0. Returns: list(Tensor): The list of segmented Tensor variables. @@ -2591,7 +2591,7 @@ def unbind(input, axis=0): # input is a Tensor which shape is [3, 4, 5] input = paddle.rand([3, 4, 5]) - + [x0, x1, x2] = paddle.unbind(input, axis=0) # x0.shape [4, 5] # x1.shape [4, 5] @@ -2637,9 +2637,9 @@ def scatter(x, index, updates, overwrite=True, name=None): """ **Scatter Layer** Output is obtained by updating the input on selected indices based on updates. - + .. code-block:: python - + import numpy as np #input: x = np.array([[1, 1], [2, 2], [3, 3]]) @@ -2661,32 +2661,32 @@ def scatter(x, index, updates, overwrite=True, name=None): out = np.array([[3, 3], [6, 6], [1, 1]]) out.shape # [3, 2] - **NOTICE**: The order in which updates are applied is nondeterministic, + **NOTICE**: The order in which updates are applied is nondeterministic, so the output will be nondeterministic if index contains duplicates. Args: x (Tensor): The input N-D Tensor with ndim>=1. Data type can be float32, float64. index (Tensor): The index 1-D Tensor. Data type can be int32, int64. The length of index cannot exceed updates's length, and the value in index cannot exceed input's length. updates (Tensor): update input with updates parameter based on index. shape should be the same as input, and dim value with dim > 1 should be the same as input. - overwrite (bool): The mode that updating the output when there are same indices. - + overwrite (bool): The mode that updating the output when there are same indices. + If True, use the overwrite mode to update the output of the same index, if False, use the accumulate mode to update the output of the same index.Default value is True. - + name(str, optional): The default value is None. Normally there is no need for user to set this property. 
For more information, please refer to :ref:`api_guide_Name` . - + Returns: Tensor: The output is a Tensor with the same shape as x. Examples: .. code-block:: python - + import paddle x = paddle.to_tensor([[1, 1], [2, 2], [3, 3]], dtype='float32') index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') updates = paddle.to_tensor([[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') - + output1 = paddle.scatter(x, index, updates, overwrite=False) # [[3., 3.], # [6., 6.], @@ -2806,7 +2806,7 @@ def scatter_nd_add(x, index, updates, name=None): index = paddle.to_tensor([[1, 1], [0, 1], [1, 3]], dtype='int64') - + output = paddle.scatter_nd_add(x, index, updates) print(output.shape) # [3, 5, 9, 10] @@ -2880,24 +2880,24 @@ def scatter_nd(index, updates, shape, name=None): def chunk(x, chunks, axis=0, name=None): """ Split the input tensor into multiple sub-Tensors. - + Args: x (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, int32 or int64. chunks(int): The number of tensor to be split along the certain axis. - axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type + axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type ``int`` or a ``Tensor`` with shape [1] and data type ``int32`` or ``int64``. If :math::`axis < 0`, the axis to split along is :math:`rank(x) + axis`. Default is 0. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Returns: list(Tensor): The list of segmented Tensors. - + Example: .. code-block:: python - + import numpy as np import paddle - + # x is a Tensor which shape is [3, 9, 5] x_np = np.random.random([3, 9, 5]).astype("int32") x = paddle.to_tensor(x_np) @@ -2907,7 +2907,7 @@ def chunk(x, chunks, axis=0, name=None): # out1.shape [3, 3, 5] # out2.shape [3, 3, 5] - + # axis is negative, the real axis is (rank(x) + axis) which real # value is 1. out0, out1, out2 = paddle.chunk(x, chunks=3, axis=-2) @@ -3091,7 +3091,7 @@ def broadcast_to(x, shape, name=None): Args: x (Tensor): The input tensor, its data type is bool, float32, float64, int32 or int64. shape (list|tuple|Tensor): The result shape after broadcasting. The data type is int32. If shape is a list or tuple, all its elements - should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. + should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. The value -1 in shape means keeping the corresponding dimension unchanged. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -3181,7 +3181,7 @@ def expand(x, shape, name=None): Args: x (Tensor): The input Tensor, its data type is bool, float32, float64, int32 or int64. shape (list|tuple|Tensor): The result shape after expanding. The data type is int32. If shape is a list or tuple, all its elements - should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. + should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. The value -1 in shape means keeping the corresponding dimension unchanged. name (str, optional): The default value is None. 
Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . @@ -3265,8 +3265,8 @@ def reshape(x, shape, name=None): Changes the shape of ``x`` without changing its data. Note that the output Tensor will share data with origin Tensor and doesn't - have a Tensor copy in ``dygraph`` mode. - If you want to use the Tensor copy version, please use `Tensor.clone` like + have a Tensor copy in ``dygraph`` mode. + If you want to use the Tensor copy version, please use `Tensor.clone` like ``reshape_clone_x = x.reshape([-1]).clone()``. Some tricks exist when specifying the target shape. @@ -3541,17 +3541,17 @@ def gather_nd(x, index, name=None): Returns: output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:] - + Examples: .. code-block:: python - + import paddle - + x = paddle.to_tensor([[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]) index = paddle.to_tensor([[0, 1]]) - + output = paddle.gather_nd(x, index) #[[3, 4]] """ @@ -3651,7 +3651,7 @@ def strided_slice(x, axes, starts, ends, strides, name=None): strides_1 = [1, 1, 1] strides_2 = [1, 1, 2] sliced_1 = paddle.strided_slice(x, axes=axes, starts=starts, ends=ends, strides=strides_1) - # sliced_1 is x[:, 1:3:1, 0:2:1, 2:4:1]. + # sliced_1 is x[:, 1:3:1, 0:2:1, 2:4:1]. # example 2: # attr starts is a list which contain tensor Tensor. minus_3 = paddle.full(shape=[1], fill_value=-3, dtype='int32') @@ -3776,44 +3776,44 @@ def strided_slice(x, axes, starts, ends, strides, name=None): def tensordot(x, y, axes=2, name=None): r""" - This function computes a contraction, which sum the product of elements from two tensors along the given axes. + This function computes a contraction, which sum the product of elements from two tensors along the given axes. Args: x (Tensor): The left tensor for contraction with data type ``float32`` or ``float64``. y (Tensor): The right tensor for contraction with the same data type as ``x``. axes (int|tuple|list|Tensor, optional): The axes to contract for ``x`` and ``y``, defaulted to integer ``2``. - 1. It could be a non-negative integer ``n``, + 1. It could be a non-negative integer ``n``, in which the function will sum over the last ``n`` axes of ``x`` and the first ``n`` axes of ``y`` in order. - - 2. It could be a 1-d tuple or list with data type ``int``, in which ``x`` and ``y`` will be contracted along the same given axes. + + 2. It could be a 1-d tuple or list with data type ``int``, in which ``x`` and ``y`` will be contracted along the same given axes. For example, ``axes`` =[0, 1] applies contraction along the first two axes for ``x`` and the first two axes for ``y``. - - 3. It could be a tuple or list containing one or two 1-d tuple|list|Tensor with data type ``int``. - When containing one tuple|list|Tensor, the data in tuple|list|Tensor specified the same axes for ``x`` and ``y`` to contract. - When containing two tuple|list|Tensor, the first will be applied to ``x`` and the second to ``y``. + + 3. It could be a tuple or list containing one or two 1-d tuple|list|Tensor with data type ``int``. + When containing one tuple|list|Tensor, the data in tuple|list|Tensor specified the same axes for ``x`` and ``y`` to contract. + When containing two tuple|list|Tensor, the first will be applied to ``x`` and the second to ``y``. When containing more than two tuple|list|Tensor, only the first two axis sequences will be used while the others will be ignored. - - 4. 
It could be a tensor, in which the ``axes`` tensor will be translated to a python list - and applied the same rules described above to determine the contraction axes. + + 4. It could be a tensor, in which the ``axes`` tensor will be translated to a python list + and applied the same rules described above to determine the contraction axes. Note that the ``axes`` with Tensor type is ONLY available in Dygraph mode. - name(str, optional): The default value is None. Normally there is no need for user to set this property. + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - Return: - Output (Tensor): The contraction result with the same data type as ``x`` and ``y``. + Return: + Output (Tensor): The contraction result with the same data type as ``x`` and ``y``. In general, :math:`output.ndim = x.ndim + y.ndim - 2 \times n_{axes}`, where :math:`n_{axes}` denotes the number of axes to be contracted. - + NOTES: - 1. This function supports tensor broadcast, + 1. This function supports tensor broadcast, the size in the corresponding dimensions of ``x`` and ``y`` should be equal, or applies to the broadcast rules. - 2. This function also supports axes expansion, - when the two given axis sequences for ``x`` and ``y`` are of different lengths, - the shorter sequence will expand the same axes as the longer one at the end. - For example, if ``axes`` =[[0, 1, 2, 3], [1, 0]], - the axis sequence for ``x`` is [0, 1, 2, 3], + 2. This function also supports axes expansion, + when the two given axis sequences for ``x`` and ``y`` are of different lengths, + the shorter sequence will expand the same axes as the longer one at the end. + For example, if ``axes`` =[[0, 1, 2, 3], [1, 0]], + the axis sequence for ``x`` is [0, 1, 2, 3], while the corresponding axis sequences for ``y`` will be expanded from [1, 0] to [1, 0, 2, 3]. - + Examples: .. code-block:: python @@ -3822,7 +3822,7 @@ def tensordot(x, y, axes=2, name=None): data_type = 'float64' # For two 2-d tensor x and y, the case axes=0 is equivalent to outer product. - # Note that tensordot supports empty axis sequence, so all the axes=0, axes=[], axes=[[]], and axes=[[],[]] are equivalent cases. + # Note that tensordot supports empty axis sequence, so all the axes=0, axes=[], axes=[[]], and axes=[[],[]] are equivalent cases. x = paddle.arange(4, dtype=data_type).reshape([2, 2]) y = paddle.arange(4, dtype=data_type).reshape([2, 2]) z = paddle.tensordot(x, y, axes=0) @@ -3884,7 +3884,7 @@ def tensordot(x, y, axes=2, name=None): # z = [[23217330., 24915630., 26613930., 28312230.], # [24915630., 26775930., 28636230., 30496530.], # [26613930., 28636230., 30658530., 32680830.], - # [28312230., 30496530., 32680830., 34865130.]] + # [28312230., 30496530., 32680830., 34865130.]] """ op_type = 'tensordot' input_dtype = ['float32', 'float64'] @@ -3983,12 +3983,12 @@ def tensordot(x, y, axes=2, name=None): def as_complex(x, name=None): - """Transform a real tensor to a complex tensor. - + """Transform a real tensor to a complex tensor. + The data type of the input tensor is 'float32' or 'float64', and the data type of the returned tensor is 'complex64' or 'complex128', respectively. - The shape of the input tensor is ``(* ,2)``, (``*`` means arbitary shape), i.e. + The shape of the input tensor is ``(* ,2)``, (``*`` means arbitary shape), i.e. the size of the last axis shoule be 2, which represent the real and imag part of a complex number. 
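(Illustrative aside, not part of the diff: a minimal ``as_complex`` sketch for the last-axis-of-size-2 layout described above; the values are illustrative only.)

.. code-block:: python

    # Sketch only: pack (real, imag) pairs along the last axis into a
    # complex tensor; float32 input yields complex64 output.
    import paddle

    x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])  # shape (2, 2), float32
    z = paddle.as_complex(x)                        # shape (2,),  complex64
    print(z)  # expected: [(1+2j), (3+4j)]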
The shape of the returned tensor is ``(*,)``. @@ -3998,7 +3998,7 @@ def as_complex(x, name=None): Returns: Tensor: The output. Data type is 'complex64' or 'complex128', with the same precision as the input. - + Examples: .. code-block:: python @@ -4028,9 +4028,9 @@ def as_complex(x, name=None): def as_real(x, name=None): - """Transform a complex tensor to a real tensor. - - The data type of the input tensor is 'complex64' or 'complex128', and the data + """Transform a complex tensor to a real tensor. + + The data type of the input tensor is 'complex64' or 'complex128', and the data type of the returned tensor is 'float32' or 'float64', respectively. When the shape of the input tensor is ``(*, )``, (``*`` means arbitary shape), @@ -4043,7 +4043,7 @@ def as_real(x, name=None): Returns: Tensor: The output. Data type is 'float32' or 'float64', with the same precision as the input. - + Examples: .. code-block:: python @@ -4161,7 +4161,7 @@ def moveaxis(x, source, destination, name=None): Examples: .. code-block:: python - + import paddle x = paddle.ones([3, 2, 4]) @@ -4170,7 +4170,7 @@ def moveaxis(x, source, destination, name=None): x = paddle.ones([2, 3]) paddle.moveaxis(x, 0, 1).shape # equivalent to paddle.t(x) - # [3, 2] + # [3, 2] """ src = [source] if isinstance(source, int) else source dst = [destination] if isinstance(destination, int) else destination @@ -4282,11 +4282,11 @@ def take_along_axis(arr, indices, axis): arr (Tensor) : The input Tensor. Supported data types are float32 and float64. indices (Tensor) : Indices to take along each 1d slice of arr. This must match the dimension of arr, and need to broadcast against arr. Supported data type are int and int64. - axis (int) : The axis to take 1d slices along. + axis (int) : The axis to take 1d slices along. - Returns: + Returns: Tensor: The indexed element, same dtype with arr - + Examples: .. code-block:: python @@ -4347,11 +4347,11 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'): arr (Tensor) : The Destination Tensor. Supported data types are float32 and float64. indices (Tensor) : Indices to put along each 1d slice of arr. This must match the dimension of arr, and need to broadcast against arr. Supported data type are int and int64. - axis (int) : The axis to put 1d slices along. + axis (int) : The axis to put 1d slices along. reduce (string | optinal) : The reduce operation, default is 'assign', support 'add', 'assign', 'mul' and 'multiply'. - Returns : + Returns : Tensor: The indexed element, same dtype with arr - + Examples: .. code-block:: python @@ -4438,7 +4438,7 @@ def index_add(x, index, axis, value, name=None): x (Tensor) : The Destination Tensor. Supported data types are int32, int64, float16, float32, float64. index (Tensor): The 1-D Tensor containing the indices to index. The data type of ``index`` must be int32 or int64. - axis (int): The dimension in which we index. + axis (int): The dimension in which we index. value (Tensor): The tensor used to add the elements along the target axis. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. @@ -4491,7 +4491,7 @@ def index_add_(x, index, axis, value, name=None): """ Inplace version of ``index_add`` API, the output Tensor will be inplaced with input ``x``. Please refer to :ref:`api_paddle_tensor_index_add`. - + Examples: .. 
code-block:: python diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index c5b995454ae..e9369b5da38 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -158,7 +158,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): Examples: .. code-block:: python - + # scale as a float32 number import paddle @@ -291,7 +291,7 @@ def multiplex(inputs, index, name=None): .. code-block:: python import paddle - + img1 = paddle.to_tensor([[1, 2], [3, 4]], dtype=paddle.float32) img2 = paddle.to_tensor([[5, 6], [7, 8]], dtype=paddle.float32) inputs = [img1, img2] @@ -345,7 +345,7 @@ def pow(x, y, name=None): Compute the power of Tensor elements. The equation is: .. math:: - out = x^{y} + out = x^{y} Note: ``paddle.pow`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` . @@ -355,7 +355,7 @@ def pow(x, y, name=None): x (Tensor): An N-D Tensor, the data type is float16, float32, float64, int32 or int64. y (float|int|Tensor): If it is an N-D Tensor, its data type should be the same as `x`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: N-D Tensor. A location into which the result is stored. Its dimension and data type are the same as `x`. @@ -1138,7 +1138,7 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): Returns: Tensor: Results of summation operation on the specified axis of input Tensor `x`, - if `x.dtype='bool'`, `x.dtype='int32'`, it's data type is `'int64'`, + if `x.dtype='bool'`, `x.dtype='int32'`, it's data type is `'int64'`, otherwise it's data type is the same as `x`. Examples: @@ -1161,11 +1161,11 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): # [[[1, 2], [3, 4]], # [[5, 6], [7, 8]]] # Each example is followed by the corresponding output tensor. - y = paddle.to_tensor([[[1, 2], [3, 4]], + y = paddle.to_tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) out5 = paddle.sum(y, axis=[1, 2]) # [10, 26] out6 = paddle.sum(y, axis=[0, 1]) # [16, 20] - + # x is a Tensor with following elements: # [[True, True, True, True] # [False, False, False, False]] @@ -1293,7 +1293,7 @@ def nansum(x, axis=None, dtype=None, keepdim=False, name=None): # [[[1, nan], [3, 4]], # [[5, 6], [-nan, 8]]] # Each example is followed by the corresponding output tensor. - y = np.array([[[1, float('nan')], [3, 4]], + y = np.array([[[1, float('nan')], [3, 4]], [[5, 6], [float('-nan'), 8]]]) y = paddle.to_tensor(y) out5 = paddle.nansum(y, axis=[1, 2]) # [8, 19] @@ -1444,11 +1444,11 @@ def count_nonzero(x, axis=None, keepdim=False, name=None): def add_n(inputs, name=None): """ Sum one or more Tensor of the input. - + For example: .. code-block:: text - + Case 1: Input: @@ -1462,7 +1462,7 @@ def add_n(inputs, name=None): [4, 5, 6]] Case 2: - + Input: First input: input1.shape = [2, 3] @@ -1495,7 +1495,7 @@ def add_n(inputs, name=None): input0 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype='float32') input1 = paddle.to_tensor([[7, 8, 9], [10, 11, 12]], dtype='float32') output = paddle.add_n([input0, input1]) - # [[8., 10., 12.], + # [[8., 10., 12.], # [14., 16., 18.]] """ if in_dygraph_mode(): @@ -1536,14 +1536,14 @@ def add_n(inputs, name=None): def trunc(input, name=None): ''' This API is used to returns a new tensor with the truncated integer values of input. - + Args: input (Tensor): The input tensor, it's data type should be int32, int64, float32, float64. 
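(Illustrative aside, not part of the diff: a minimal sketch of the dtype note in the ``sum`` hunk above; the input values are illustrative only.)

.. code-block:: python

    # Sketch only: summing a bool (or int32) tensor promotes the result
    # dtype to int64, as stated in the Returns section above.
    import paddle

    x = paddle.to_tensor([[True, True], [False, False]])
    out = paddle.sum(x)
    print(out)        # expected: 2
    print(out.dtype)  # expected: paddle.int64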
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: The output Tensor of trunc. - + Examples: .. code-block:: python @@ -1721,7 +1721,7 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): Examples: .. code-block:: python - + import paddle x = paddle.ones([2,2]) @@ -1785,7 +1785,7 @@ def renorm(x, p, axis, max_norm): This operator is used to calculate the p-norm along the axis, suppose the input-shape on axis dimension has the value of T, then the tensor is split into T parts, the p-norm should be calculated for each - part, if the p-norm for part i is larger than max-norm, then each element + part, if the p-norm for part i is larger than max-norm, then each element in part i should be re-normalized at the same scale so that part-i' p-norm equals max-norm exactly, otherwise part-i stays unchanged. @@ -1800,17 +1800,17 @@ def renorm(x, p, axis, max_norm): Examples: .. code-block:: python - + import paddle input = [[[2.0,2,-2],[3,0.3,3]],[[2,-8,2],[3.1,3.7,3]]] x = paddle.to_tensor(input,dtype='float32') y = paddle.renorm(x, 1.0, 2, 2.05) - print(y) + print(y) # [[[ 0.40594056, 0.29285714, -0.41000000], # [ 0.60891086, 0.04392857, 0.61500001]], # [[ 0.40594056, -1.17142856, 0.41000000], # [ 0.62920785, 0.54178572, 0.61500001]]]) - + """ input_shape = x.shape check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'renorm') @@ -1843,7 +1843,7 @@ def inner(x, y, name=None): """ Inner product of two input Tensor. - + Ordinary inner product for 1-D Tensors, in higher dimensions a sum product over the last axes. Args: @@ -1918,8 +1918,8 @@ def outer(x, y, name=None): Input is flattened if not already 1-dimensional. Args: - x (Tensor): An N-D Tensor or a Scalar Tensor. - y (Tensor): An N-D Tensor or a Scalar Tensor. + x (Tensor): An N-D Tensor or a Scalar Tensor. + y (Tensor): An N-D Tensor or a Scalar Tensor. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1971,7 +1971,7 @@ def logsumexp(x, axis=None, keepdim=False, name=None): logsumexp(x) = \log\sum exp(x) Args: - x (Tensor): The input Tensor with data type float32 or float64, which + x (Tensor): The input Tensor with data type float32 or float64, which have no more than 4 dimensions. axis (int|list|tuple, optional): The axis along which to perform logsumexp calculations. ``axis`` should be int, list(int) or @@ -2081,7 +2081,7 @@ def inverse(x, name=None): def _get_reduce_axis(axis): """ - Internal function for max, min, amax and amin. + Internal function for max, min, amax and amin. It computes the attribute reduce_all value based on axis. """ if axis is not None and not isinstance(axis, list): @@ -2104,7 +2104,7 @@ def _get_reduce_axis_with_tensor(axis): def _get_reduce_all_value(axis): """ - Internal function for max, min, amax and amin. + Internal function for max, min, amax and amin. It computes the attribute reduce_all value based on axis. """ if axis is not None and not isinstance(axis, list): @@ -2127,7 +2127,7 @@ def max(x, axis=None, keepdim=False, name=None): Note: The difference between max and amax is: If there are multiple maximum elements, - amax evenly distributes gradient between these equal values, + amax evenly distributes gradient between these equal values, while max propagates gradient to all of them. 
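(Illustrative aside, not part of the diff: a compact sketch of the max/amax note above, reusing the tied-maximum values from the ``amax`` example later in this file.)

.. code-block:: python

    # Sketch only: with five tied maxima (0.9), amax splits the gradient
    # evenly (0.2 each), while max propagates a full gradient (1.0 each).
    import paddle

    x = paddle.to_tensor([[0.1, 0.9, 0.9, 0.9],
                          [0.9, 0.9, 0.6, 0.7]],
                         dtype='float64', stop_gradient=False)
    paddle.amax(x).backward()
    print(x.grad)  # expected: 0.2 at every 0.9 entry

    x.clear_grad()
    paddle.max(x).backward()
    print(x.grad)  # expected: 1.0 at every 0.9 entry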
@@ -2156,45 +2156,45 @@ def max(x, axis=None, keepdim=False, name=None): # data_x is a Tensor with shape [2, 4] # the axis is a int element x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], - [0.1, 0.2, 0.6, 0.7]], + [0.1, 0.2, 0.6, 0.7]], dtype='float64', stop_gradient=False) result1 = paddle.max(x) result1.backward() - print(result1, x.grad) + print(result1, x.grad) #[0.9], [[0., 0., 0., 1.], [0., 0., 0., 0.]] x.clear_grad() result2 = paddle.max(x, axis=0) result2.backward() - print(result2, x.grad) + print(result2, x.grad) #[0.2, 0.3, 0.6, 0.9], [[1., 1., 0., 1.], [0., 0., 1., 0.]] x.clear_grad() result3 = paddle.max(x, axis=-1) result3.backward() - print(result3, x.grad) + print(result3, x.grad) #[0.9, 0.7], [[0., 0., 0., 1.], [0., 0., 0., 1.]] x.clear_grad() result4 = paddle.max(x, axis=1, keepdim=True) result4.backward() - print(result4, x.grad) + print(result4, x.grad) #[[0.9], [0.7]], [[0., 0., 0., 1.], [0., 0., 0., 1.]] # data_y is a Tensor with shape [2, 2, 2] - # the axis is list + # the axis is list y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0], [7.0, 8.0]]], dtype='float64', stop_gradient=False) result5 = paddle.max(y, axis=[1, 2]) result5.backward() - print(result5, y.grad) + print(result5, y.grad) #[4., 8.], [[[0., 0.], [0., 1.]], [[0., 0.], [0., 1.]]] y.clear_grad() result6 = paddle.max(y, axis=[0, 1]) result6.backward() - print(result6, y.grad) + print(result6, y.grad) #[7., 8.], [[[0., 0.], [0., 0.]], [[0., 0.], [1., 1.]]] """ @@ -2231,7 +2231,7 @@ def min(x, axis=None, keepdim=False, name=None): Note: The difference between min and amin is: If there are multiple minimum elements, - amin evenly distributes gradient between these equal values, + amin evenly distributes gradient between these equal values, while min propagates gradient to all of them. 
Args: @@ -2259,45 +2259,45 @@ def min(x, axis=None, keepdim=False, name=None): # data_x is a Tensor with shape [2, 4] # the axis is a int element x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], - [0.1, 0.2, 0.6, 0.7]], + [0.1, 0.2, 0.6, 0.7]], dtype='float64', stop_gradient=False) result1 = paddle.min(x) result1.backward() - print(result1, x.grad) + print(result1, x.grad) #[0.1], [[0., 0., 0., 0.], [1., 0., 0., 0.]] x.clear_grad() result2 = paddle.min(x, axis=0) result2.backward() - print(result2, x.grad) + print(result2, x.grad) #[0.1, 0.2, 0.5, 0.7], [[0., 0., 1., 0.], [1., 1., 0., 1.]] x.clear_grad() result3 = paddle.min(x, axis=-1) result3.backward() - print(result3, x.grad) + print(result3, x.grad) #[0.2, 0.1], [[1., 0., 0., 0.], [1., 0., 0., 0.]] x.clear_grad() result4 = paddle.min(x, axis=1, keepdim=True) result4.backward() - print(result4, x.grad) + print(result4, x.grad) #[[0.2], [0.1]], [[1., 0., 0., 0.], [1., 0., 0., 0.]] # data_y is a Tensor with shape [2, 2, 2] - # the axis is list + # the axis is list y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0], [7.0, 8.0]]], dtype='float64', stop_gradient=False) result5 = paddle.min(y, axis=[1, 2]) result5.backward() - print(result5, y.grad) + print(result5, y.grad) #[1., 5.], [[[1., 0.], [0., 0.]], [[1., 0.], [0., 0.]]] y.clear_grad() result6 = paddle.min(y, axis=[0, 1]) result6.backward() - print(result6, y.grad) + print(result6, y.grad) #[1., 2.], [[[1., 1.], [0., 0.]], [[0., 0.], [0., 0.]]] """ @@ -2334,7 +2334,7 @@ def amax(x, axis=None, keepdim=False, name=None): Note: The difference between max and amax is: If there are multiple maximum elements, - amax evenly distributes gradient between these equal values, + amax evenly distributes gradient between these equal values, while max propagates gradient to all of them. Args: @@ -2363,22 +2363,22 @@ def amax(x, axis=None, keepdim=False, name=None): # the axis is a int element x = paddle.to_tensor([[0.1, 0.9, 0.9, 0.9], - [0.9, 0.9, 0.6, 0.7]], + [0.9, 0.9, 0.6, 0.7]], dtype='float64', stop_gradient=False) - # There are 5 maximum elements: - # 1) amax evenly distributes gradient between these equal values, + # There are 5 maximum elements: + # 1) amax evenly distributes gradient between these equal values, # thus the corresponding gradients are 1/5=0.2; - # 2) while max propagates gradient to all of them, + # 2) while max propagates gradient to all of them, # thus the corresponding gradient are 1. 
result1 = paddle.amax(x) result1.backward() - print(result1, x.grad) + print(result1, x.grad) #[0.9], [[0., 0.2, 0.2, 0.2], [0.2, 0.2, 0., 0.]] x.clear_grad() result1_max = paddle.max(x) result1_max.backward() - print(result1_max, x.grad) + print(result1_max, x.grad) #[0.9], [[0., 1.0, 1.0, 1.0], [1.0, 1.0, 0., 0.]] ############################### @@ -2386,35 +2386,35 @@ def amax(x, axis=None, keepdim=False, name=None): x.clear_grad() result2 = paddle.amax(x, axis=0) result2.backward() - print(result2, x.grad) + print(result2, x.grad) #[0.9, 0.9, 0.9, 0.9], [[0., 0.5, 1., 1.], [1., 0.5, 0., 0.]] x.clear_grad() result3 = paddle.amax(x, axis=-1) result3.backward() - print(result3, x.grad) + print(result3, x.grad) #[0.9, 0.9], [[0., 0.3333, 0.3333, 0.3333], [0.5, 0.5, 0., 0.]] x.clear_grad() result4 = paddle.amax(x, axis=1, keepdim=True) result4.backward() - print(result4, x.grad) + print(result4, x.grad) #[[0.9], [0.9]], [[0., 0.3333, 0.3333, 0.3333.], [0.5, 0.5, 0., 0.]] # data_y is a Tensor with shape [2, 2, 2] - # the axis is list + # the axis is list y = paddle.to_tensor([[[0.1, 0.9], [0.9, 0.9]], [[0.9, 0.9], [0.6, 0.7]]], dtype='float64', stop_gradient=False) result5 = paddle.amax(y, axis=[1, 2]) result5.backward() - print(result5, y.grad) + print(result5, y.grad) #[0.9., 0.9], [[[0., 0.3333], [0.3333, 0.3333]], [[0.5, 0.5], [0., 1.]]] y.clear_grad() result6 = paddle.amax(y, axis=[0, 1]) result6.backward() - print(result6, y.grad) + print(result6, y.grad) #[0.9., 0.9], [[[0., 0.3333], [0.5, 0.3333]], [[0.5, 0.3333], [1., 1.]]] """ @@ -2448,11 +2448,11 @@ def amin(x, axis=None, keepdim=False, name=None): Note: The difference between min and amin is: If there are multiple minimum elements, - amin evenly distributes gradient between these equal values, + amin evenly distributes gradient between these equal values, while min propagates gradient to all of them. Args: - x (Tensor): A tensor, the data type is float32, float64, int32, int64, + x (Tensor): A tensor, the data type is float32, float64, int32, int64, the dimension is no more than 4. axis (int|list|tuple, optional): The axis along which the minimum is computed. If :attr:`None`, compute the minimum over all elements of @@ -2477,22 +2477,22 @@ def amin(x, axis=None, keepdim=False, name=None): # the axis is a int element x = paddle.to_tensor([[0.2, 0.1, 0.1, 0.1], - [0.1, 0.1, 0.6, 0.7]], + [0.1, 0.1, 0.6, 0.7]], dtype='float64', stop_gradient=False) - # There are 5 minimum elements: - # 1) amin evenly distributes gradient between these equal values, + # There are 5 minimum elements: + # 1) amin evenly distributes gradient between these equal values, # thus the corresponding gradients are 1/5=0.2; - # 2) while min propagates gradient to all of them, + # 2) while min propagates gradient to all of them, # thus the corresponding gradient are 1. 
result1 = paddle.amin(x) result1.backward() - print(result1, x.grad) + print(result1, x.grad) #[0.1], [[0., 0.2, 0.2, 0.2], [0.2, 0.2, 0., 0.]] x.clear_grad() result1_min = paddle.min(x) result1_min.backward() - print(result1_min, x.grad) + print(result1_min, x.grad) #[0.1], [[0., 1.0, 1.0, 1.0], [1.0, 1.0, 0., 0.]] ############################### @@ -2500,35 +2500,35 @@ def amin(x, axis=None, keepdim=False, name=None): x.clear_grad() result2 = paddle.amin(x, axis=0) result2.backward() - print(result2, x.grad) + print(result2, x.grad) #[0.1, 0.1, 0.1, 0.1], [[0., 0.5, 1., 1.], [1., 0.5, 0., 0.]] x.clear_grad() result3 = paddle.amin(x, axis=-1) result3.backward() - print(result3, x.grad) + print(result3, x.grad) #[0.1, 0.1], [[0., 0.3333, 0.3333, 0.3333], [0.5, 0.5, 0., 0.]] x.clear_grad() result4 = paddle.amin(x, axis=1, keepdim=True) result4.backward() - print(result4, x.grad) + print(result4, x.grad) #[[0.1], [0.1]], [[0., 0.3333, 0.3333, 0.3333.], [0.5, 0.5, 0., 0.]] # data_y is a Tensor with shape [2, 2, 2] - # the axis is list + # the axis is list y = paddle.to_tensor([[[0.2, 0.1], [0.1, 0.1]], [[0.1, 0.1], [0.6, 0.7]]], dtype='float64', stop_gradient=False) result5 = paddle.amin(y, axis=[1, 2]) result5.backward() - print(result5, y.grad) + print(result5, y.grad) #[0.1., 0.1], [[[0., 0.3333], [0.3333, 0.3333]], [[0.5, 0.5], [0., 1.]]] y.clear_grad() result6 = paddle.amin(y, axis=[0, 1]) result6.backward() - print(result6, y.grad) + print(result6, y.grad) #[0.1., 0.1], [[[0., 0.3333], [0.5, 0.3333]], [[0.5, 0.3333], [1., 1.]]] """ @@ -2564,7 +2564,7 @@ def log1p(x, name=None): Args: x (Tensor): Input Tensor. Must be one of the following types: float32, float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor, the natural log of the input Tensor computed element-wise. @@ -2610,7 +2610,7 @@ def log2(x, name=None): Examples: .. code-block:: python - + import paddle # example 1: x is a float @@ -2662,7 +2662,7 @@ def log10(x, name=None): Examples: .. code-block:: python - + import paddle # example 1: x is a float @@ -2912,7 +2912,7 @@ def diagonal(x, offset=0, axis1=0, axis2=1, name=None): This OP computes the diagonals of the input tensor x. If ``x`` is 2D, returns the diagonal. - If ``x`` has larger dimensions, diagonals be taken from the 2D planes specified by axis1 and axis2. + If ``x`` has larger dimensions, diagonals be taken from the 2D planes specified by axis1 and axis2. By default, the 2D planes formed by the first and second axis of the input tensor x. The argument ``offset`` determines where diagonals are taken from input tensor x: @@ -2920,7 +2920,7 @@ def diagonal(x, offset=0, axis1=0, axis2=1, name=None): - If offset = 0, it is the main diagonal. - If offset > 0, it is above the main diagonal. - If offset < 0, it is below the main diagonal. - + Args: x (Tensor): The input tensor x. Must be at least 2-dimensional. The input data type should be bool, int32, int64, float16, float32, float64. offset (int, optional): Which diagonals in input tensor x will be taken. Default: 0 (main diagonals). 
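(Illustrative aside, not part of the diff: a minimal sketch of the ``offset`` argument documented in the ``diagonal`` hunk above; the matrix values are illustrative only.)

.. code-block:: python

    # Sketch only: offset 0 is the main diagonal, positive offsets sit
    # above it, negative offsets sit below it.
    import paddle

    x = paddle.to_tensor([[0.0, 1.0, 2.0],
                          [3.0, 4.0, 5.0],
                          [6.0, 7.0, 8.0]])
    print(paddle.diagonal(x))             # expected: [0., 4., 8.]
    print(paddle.diagonal(x, offset=1))   # expected: [1., 5.]
    print(paddle.diagonal(x, offset=-1))  # expected: [3., 7.]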
@@ -2970,7 +2970,7 @@ def diagonal(x, offset=0, axis1=0, axis2=1, name=None): #Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # [[0.45661032, 0.86177313], # [0.17020577, 0.27325270]]) - + """ if in_dygraph_mode(): return _C_ops.diagonal(x, offset, axis1, axis2) @@ -3062,25 +3062,25 @@ def kron(x, y, name=None): def cumsum(x, axis=None, dtype=None, name=None): """ - The cumulative sum of the elements along a given axis. - + The cumulative sum of the elements along a given axis. + Note: - The first element of the result is the same as the first element of the input. + The first element of the result is the same as the first element of the input. Args: x (Tensor): The input tensor needed to be cumsumed. axis (int, optional): The dimension to accumulate along. -1 means the last dimension. The default (None) is to compute the cumsum over the flattened array. - dtype (str, optional): The data type of the output tensor, can be float32, float64, int32, int64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None. + dtype (str, optional): The data type of the output tensor, can be float32, float64, int32, int64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor, the result of cumsum operator. + Tensor, the result of cumsum operator. Examples: .. code-block:: python - + import paddle - + data = paddle.arange(12) data = paddle.reshape(data, (3, 4)) @@ -3091,7 +3091,7 @@ def cumsum(x, axis=None, dtype=None, name=None): # [[ 0 1 2 3] # [ 4 6 8 10] # [12 15 18 21]] - + y = paddle.cumsum(data, axis=-1) # [[ 0 1 3 6] # [ 4 9 15 22] @@ -3129,31 +3129,31 @@ def cumsum(x, axis=None, dtype=None, name=None): def logcumsumexp(x, axis=None, dtype=None, name=None): r""" - The logarithm of the cumulative summation of the exponentiation of the elements along a given axis. + The logarithm of the cumulative summation of the exponentiation of the elements along a given axis. For summation index j given by `axis` and other indices i, the result is .. math:: logcumsumexp(x)_{ij} = log \sum_{i=0}^{j}exp(x_{ij}) - + Note: The first element of the result is the same as the first element of the input. Args: x (Tensor): The input tensor. axis (int, optional): The dimension to do the operation along. -1 means the last dimension. The default (None) is to compute the cumsum over the flattened array. - dtype (str, optional): The data type of the output tensor, can be float32, float64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None. + dtype (str, optional): The data type of the output tensor, can be float32, float64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor, the result of logcumsumexp operator. + Tensor, the result of logcumsumexp operator. Examples: .. 
code-block:: python - + import paddle - + data = paddle.arange(12, dtype='float64') data = paddle.reshape(data, (3, 4)) @@ -3165,7 +3165,7 @@ def logcumsumexp(x, axis=None, dtype=None, name=None): # [[ 0. 1. 2. 3. ] # [ 4.01815 5.01815 6.01815 7.01815 ] # [ 8.018479 9.018479 10.018479 11.018479]] - + y = paddle.logcumsumexp(data, axis=-1) # [[ 0. 1.3132617 2.4076061 3.4401898] # [ 4. 5.3132615 6.407606 7.44019 ] @@ -3340,7 +3340,7 @@ def isnan(x, name=None): .. code-block:: python import paddle - + x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) out = paddle.isnan(x) print(out) # [False False False False False True True] @@ -3363,21 +3363,21 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): Args: x (Tensor): The input tensor, its data type should be float32, float64, int32, int64. - axis (int|list|tuple, optional): The axis along which the product is computed. If :attr:`None`, - multiply all elements of `x` and return a Tensor with a single element, - otherwise must be in the range :math:`[-x.ndim, x.ndim)`. If :math:`axis[i]<0`, + axis (int|list|tuple, optional): The axis along which the product is computed. If :attr:`None`, + multiply all elements of `x` and return a Tensor with a single element, + otherwise must be in the range :math:`[-x.ndim, x.ndim)`. If :math:`axis[i]<0`, the axis to reduce is :math:`x.ndim + axis[i]`. Default is None. - keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result + keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the input unless `keepdim` is true. Default is False. - dtype (str|np.dtype, optional): The desired date type of returned tensor, can be float32, float64, - int32, int64. If specified, the input tensor is casted to dtype before operator performed. - This is very useful for avoiding data type overflows. The default value is None, the dtype + dtype (str|np.dtype, optional): The desired date type of returned tensor, can be float32, float64, + int32, int64. If specified, the input tensor is casted to dtype before operator performed. + This is very useful for avoiding data type overflows. The default value is None, the dtype of output is the same as input Tensor `x`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor, result of product on the specified dim of input tensor. - + Examples: .. code-block:: python @@ -3629,7 +3629,7 @@ def all(x, axis=None, keepdim=False, name=None): # keepdim=True, out4 should be [[False], [True]], out.shape should be (2,1) out4 = paddle.all(x, axis=1, keepdim=True) # [[False], [True]] print(out4) - + """ if axis is not None and not isinstance(axis, (list, tuple)): axis = [axis] @@ -3719,8 +3719,8 @@ def any(x, axis=None, keepdim=False, name=None): # keepdim=True, result should be [[True], [True]], out.shape should be (2,1) out4 = paddle.any(x, axis=1, keepdim=True) # [[True], [True]] - print(out4) - + print(out4) + """ if axis is not None and not isinstance(axis, (list, tuple)): axis = [axis] @@ -3770,7 +3770,7 @@ def broadcast_shape(x_shape, y_shape): Args: x_shape (list[int]|tuple[int]): A shape of tensor. y_shape (list[int]|tuple[int]): A shape of tensor. - + Returns: list[int], the result shape. 
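(Illustrative aside, not part of the diff: a small consistency check for the ``logcumsumexp`` formula above, assuming the top-level ``paddle.logcumsumexp`` entry point defined in this file.)

.. code-block:: python

    # Sketch only: logcumsumexp(x) should match log(cumsum(exp(x))),
    # while being evaluated in a numerically safer way.
    import paddle

    x = paddle.to_tensor([0.0, 1.0, 2.0, 3.0], dtype='float64')
    a = paddle.logcumsumexp(x)
    b = paddle.log(paddle.cumsum(paddle.exp(x)))
    print(paddle.allclose(a, b))  # expected: True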
@@ -3782,7 +3782,7 @@ def broadcast_shape(x_shape, y_shape): shape = paddle.broadcast_shape([2, 1, 3], [1, 3, 1]) # [2, 3, 3] - + # shape = paddle.broadcast_shape([2, 1, 3], [3, 3, 1]) # ValueError (terminated with error message). @@ -3795,7 +3795,7 @@ def conj(x, name=None): This function computes the conjugate of the Tensor elementwisely. Args: - x (Tensor): The input Tensor which hold the complex numbers. + x (Tensor): The input Tensor which hold the complex numbers. Optional data types are: complex64, complex128, float32, float64, int32 or int64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -3806,7 +3806,7 @@ def conj(x, name=None): .. code-block:: python import paddle - + data=paddle.to_tensor([[1+1j, 2+2j, 3+3j], [4+4j, 5+5j, 6+6j]]) #Tensor(shape=[2, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, # [[(1+1j), (2+2j), (3+3j)], @@ -3996,7 +3996,7 @@ def logit(x, eps=None, name=None): This function generates a new tensor with the logit of the elements of input x. x is clamped to [eps, 1-eps] when eps is not zero. When eps is zero and x < 0 or x > 1, the function will yields NaN. .. math:: - + logit(x) = ln(\frac{x}{1 - x}) where @@ -4028,7 +4028,7 @@ def logit(x, eps=None, name=None): x = paddle.to_tensor([0.2635, 0.0106, 0.2780, 0.2097, 0.8095]) out1 = paddle.logit(x) print(out1) - # [-1.0277, -4.5365, -0.9544, -1.3269, 1.4468] + # [-1.0277, -4.5365, -0.9544, -1.3269, 1.4468] """ @@ -4070,7 +4070,7 @@ def lerp(x, y, weight, name=None): .. code-block:: python import paddle - + x = paddle.arange(1., 5., dtype='float32') y = paddle.empty([4], dtype='float32') y.fill_(10.) @@ -4140,7 +4140,7 @@ def erfinv(x, name=None): .. code-block:: python import paddle - + x = paddle.to_tensor([0, 0.5, -1.], dtype="float32") out = paddle.erfinv(x) # out: [0, 0.4769, -inf] @@ -4173,7 +4173,7 @@ def erfinv_(x, name=None): def rad2deg(x, name=None): r""" Convert each of the elements of input x from angles in radians to degrees. - + Equation: .. math:: @@ -4191,7 +4191,7 @@ def rad2deg(x, name=None): import paddle import numpy as np - + x1 = paddle.to_tensor([3.142, -3.142, 6.283, -6.283, 1.570, -1.570]) result1 = paddle.rad2deg(x1) print(result1) @@ -4204,7 +4204,7 @@ def rad2deg(x, name=None): print(result2) # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # [90.]) - + x3 = paddle.to_tensor(1) result3 = paddle.rad2deg(x3) print(result3) @@ -4236,7 +4236,7 @@ def rad2deg(x, name=None): def deg2rad(x, name=None): r""" Convert each of the elements of input x from degrees to angles in radians. - + Equation: .. math:: @@ -4254,7 +4254,7 @@ def deg2rad(x, name=None): import paddle import numpy as np - + x1 = paddle.to_tensor([180.0, -180.0, 360.0, -360.0, 90.0, -90.0]) result1 = paddle.deg2rad(x1) print(result1) @@ -4294,15 +4294,15 @@ def gcd(x, y, name=None): """ Computes the element-wise greatest common divisor (GCD) of input |x| and |y|. Both x and y must have integer types. - + Note: gcd(0,0)=0, gcd(0, y)=|y| If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output). Args: - x (Tensor): An N-D Tensor, the data type is int32,int64. - y (Tensor): An N-D Tensor, the data type is int32,int64. + x (Tensor): An N-D Tensor, the data type is int32,int64. + y (Tensor): An N-D Tensor, the data type is int32,int64. name (str, optional): Name for the operation (optional, default is None). 
For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -4312,7 +4312,7 @@ def gcd(x, y, name=None): .. code-block:: python import paddle - + x1 = paddle.to_tensor(12) x2 = paddle.to_tensor(20) paddle.gcd(x1, x2) @@ -4332,7 +4332,7 @@ def gcd(x, y, name=None): paddle.gcd(x4, x4) # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, # [0]) - + x5 = paddle.to_tensor(-20) paddle.gcd(x1, x5) # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, @@ -4372,15 +4372,15 @@ def lcm(x, y, name=None): """ Computes the element-wise least common multiple (LCM) of input |x| and |y|. Both x and y must have integer types. - + Note: lcm(0,0)=0, lcm(0, y)=0 If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output). Args: - x (Tensor): An N-D Tensor, the data type is int32,int64. - y (Tensor): An N-D Tensor, the data type is int32,int64. + x (Tensor): An N-D Tensor, the data type is int32,int64. + y (Tensor): An N-D Tensor, the data type is int32,int64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -4390,7 +4390,7 @@ def lcm(x, y, name=None): .. code-block:: python import paddle - + x1 = paddle.to_tensor(12) x2 = paddle.to_tensor(20) paddle.lcm(x1, x2) @@ -4410,7 +4410,7 @@ def lcm(x, y, name=None): paddle.lcm(x4, x4) # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, # [0]) - + x5 = paddle.to_tensor(-20) paddle.lcm(x1, x5) # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, @@ -4428,28 +4428,28 @@ def lcm(x, y, name=None): def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): r""" Computes the n-th forward difference along the given axis. - The first-order differences is computed by using the following formula: + The first-order differences is computed by using the following formula: .. math:: out[i] = x[i+1] - x[i] - - Higher-order differences are computed by using paddle.diff() recursively. + + Higher-order differences are computed by using paddle.diff() recursively. Only n=1 is currently supported. Args: x (Tensor): The input tensor to compute the forward difference on - n (int, optional): The number of times to recursively compute the difference. + n (int, optional): The number of times to recursively compute the difference. Only support n=1. Default:1 axis (int, optional): The axis to compute the difference along. Default:-1 prepend (Tensor, optional): The tensor to prepend to input along axis before computing the difference. - It's dimensions must be equivalent to that of x, + It's dimensions must be equivalent to that of x, and its shapes must match x's shape except on axis. - append (Tensor, optional): The tensor to append to input along axis before computing the difference, - It's dimensions must be equivalent to that of x, + append (Tensor, optional): The tensor to append to input along axis before computing the difference, + It's dimensions must be equivalent to that of x, and its shapes must match x's shape except on axis. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: The output tensor with same dtype with x. 
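The ``diff`` docstring above notes that ``prepend``/``append`` must match ``x`` everywhere except along ``axis``. A minimal sketch of that usage (assuming a recent Paddle build; the sample values are made up):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([1, 4, 5, 2])
    # prepend is concatenated along `axis` before the first-order difference.
    prev = paddle.to_tensor([0])
    out = paddle.diff(x, prepend=prev)
    print(out)  # [1, 3, 1, -3]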
@@ -4467,7 +4467,7 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): y = paddle.to_tensor([7, 9]) out = paddle.diff(x, append=y) print(out) - # out: + # out: # [3, 1, -3, 5, 2] z = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) @@ -4625,7 +4625,7 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): def angle(x, name=None): r""" - Element-wise angle of complex numbers. For non-negative real numbers, the angle is 0 while + Element-wise angle of complex numbers. For non-negative real numbers, the angle is 0 while for negative real numbers, the angle is :math:`\pi`. Equation: diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py index 4c3f7c55c49..84b54e93df0 100644 --- a/python/paddle/tensor/ops.py +++ b/python/paddle/tensor/ops.py @@ -218,7 +218,7 @@ Examples: import paddle.nn.functional as F x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.tanhshrink(x) + out = F.tanhshrink(x) print(out) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] @@ -485,7 +485,7 @@ Examples: import paddle.nn.functional as F x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.softplus(x) + out = F.softplus(x) print(out) # [0.513015, 0.598139, 0.744397, 0.854355] @@ -500,7 +500,7 @@ Examples: import paddle.nn.functional as F x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.softsign(x) + out = F.softsign(x) print(out) # [-0.285714, -0.166667, 0.0909091, 0.230769] @@ -541,11 +541,11 @@ Returns: Tensor: The output of Erf, dtype: float32 or float64, the same as the input, shape: the same as the input. Examples: - + .. code-block:: python - + import paddle - + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.erf(x) print(out) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 25c825cda34..10e0f6f775f 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -42,7 +42,7 @@ def bernoulli(x, name=None): x (Tensor): The input Tensor, it's data type should be float32, float64. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - Returns: + Returns: Tensor: A Tensor filled samples from Bernoulli distribution, whose shape and dtype are same as ``x``. Examples: @@ -51,7 +51,7 @@ def bernoulli(x, name=None): import paddle paddle.set_device('cpu') # on CPU device - paddle.seed(100) + paddle.seed(100) x = paddle.rand([2,3]) print(x) @@ -93,12 +93,12 @@ def poisson(x, name=None): out_i \sim Poisson (lambda = x_i) Args: - x(Tensor): A tensor with rate parameter of poisson Distribution. The data type + x(Tensor): A tensor with rate parameter of poisson Distribution. The data type should be float32, float64. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Returns: + Returns: Tensor: A Tensor filled with random number with the same shape and dtype as ``x``. Examples: @@ -228,7 +228,7 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): Returns: Tensor: A Tensor filled with random values sampled from a Gaussian - distribution, with ``shape`` and ``dtype``. + distribution, with ``shape`` and ``dtype``. """ op_type_for_check = 'gaussian/standard_normal/randn/normal' seed = 0 @@ -507,7 +507,7 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): max(float|int, optional): The upper bound on the range of random values to generate, ``max`` is excluded in the range. Default is 1.0. 
seed(int, optional): Random seed used for generating samples. If seed is 0, - it will use the seed of the global default generator (which can be set by paddle.seed). + it will use the seed of the global default generator (which can be set by paddle.seed). Note that if seed is not 0, this operator will always generate the same random numbers every time. Default is 0. name(str, optional): Name for the operation (optional, default is None). @@ -520,7 +520,7 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): Examples: .. code-block:: python :name: code-example1 - + import paddle # example 1: @@ -592,18 +592,18 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): @dygraph_only def uniform_(x, min=-1.0, max=1.0, seed=0, name=None): """ - This is the inplace version of OP ``uniform``, which returns a Tensor filled + This is the inplace version of OP ``uniform``, which returns a Tensor filled with random values sampled from a uniform distribution. The output Tensor will be inplaced with input ``x``. Please refer to :ref:`api_tensor_uniform`. - + Args: x(Tensor): The input tensor to be filled with random values. min(float|int, optional): The lower bound on the range of random values to generate, ``min`` is included in the range. Default is -1.0. max(float|int, optional): The upper bound on the range of random values to generate, ``max`` is excluded in the range. Default is 1.0. - seed(int, optional): Random seed used for generating samples. If seed is 0, - it will use the seed of the global default generator (which can be set by paddle.seed). + seed(int, optional): Random seed used for generating samples. If seed is 0, + it will use the seed of the global default generator (which can be set by paddle.seed). Note that if seed is not 0, this operator will always generate the same random numbers every time. Default is 0. name(str, optional): The default value is None. Normally there is no @@ -614,7 +614,7 @@ def uniform_(x, min=-1.0, max=1.0, seed=0, name=None): distribution in the range [``min``, ``max``). Examples: .. code-block:: python - + import paddle # example: x = paddle.ones(shape=[3, 4]) @@ -656,7 +656,7 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Returns: + Returns: Tensor: A Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``. @@ -745,11 +745,11 @@ def randint_like(x, low=0, high=None, dtype=None, name=None): """ Returns a Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with the same shape as ``x``. - (use ``dtype`` if ``dtype`` is not None) + (use ``dtype`` if ``dtype`` is not None) If ``high`` is None (the default), the range is [0, ``low``). Args: - x (Tensor): The input tensor which specifies shape. The dtype of ``x`` + x (Tensor): The input tensor which specifies shape. The dtype of ``x`` can be bool, int32, int64, float16, float32, float64. low (int): The lower bound on the range of random values to generate. The ``low`` is included in the range. If ``high`` is None, the @@ -758,14 +758,14 @@ def randint_like(x, low=0, high=None, dtype=None, name=None): generate, the ``high`` is excluded in the range. Default is None (see above for behavior if high = None). Default is None. dtype (str|np.dtype, optional): The data type of the - output tensor. 
Supported data types: bool, int32, int64, float16, + output tensor. Supported data types: bool, int32, int64, float16, float32, float64. If ``dytpe`` is None, the data type is the same as x's data type. Default is None. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Returns: + Returns: Tensor: A Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``. @@ -944,7 +944,7 @@ def randperm(n, dtype="int64", name=None): out2 = paddle.randperm(7, 'int32') # [1, 6, 2, 0, 4, 3, 5] # random - + """ if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) @@ -1028,8 +1028,8 @@ def exponential_(x, lam=1.0, name=None): r""" This inplace OP fill input Tensor ``x`` with random number from a Exponential Distribution. - ``lam`` is :math:`\lambda` parameter of Exponential Distribution. - + ``lam`` is :math:`\lambda` parameter of Exponential Distribution. + .. math:: f(x) = \lambda e^{-\lambda x} @@ -1040,7 +1040,7 @@ def exponential_(x, lam=1.0, name=None): name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Returns: + Returns: Tensor: Input Tensor ``x``. Examples: diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index e4458048edc..ce950b7eb05 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -56,18 +56,18 @@ def argsort(x, axis=-1, descending=False, name=None): .. code-block:: python import paddle - + x = paddle.to_tensor([[[5,8,9,5], [0,0,1,7], [6,9,2,4]], [[5,2,4,2], [4,7,7,9], - [1,7,0,6]]], + [1,7,0,6]]], dtype='float32') out1 = paddle.argsort(x, axis=-1) out2 = paddle.argsort(x, axis=0) out3 = paddle.argsort(x, axis=1) - + print(out1) #[[[0 3 1 2] # [0 1 2 3] @@ -75,7 +75,7 @@ def argsort(x, axis=-1, descending=False, name=None): # [[1 3 2 0] # [0 1 2 3] # [2 0 3 1]]] - + print(out2) #[[[0 1 1 1] # [0 0 0 0] @@ -83,7 +83,7 @@ def argsort(x, axis=-1, descending=False, name=None): # [[1 0 0 0] # [1 1 1 1] # [0 0 0 1]]] - + print(out3) #[[[1 1 1 2] # [0 0 2 0] @@ -153,10 +153,10 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): out1 = paddle.argmax(x) print(out1) # 2 out2 = paddle.argmax(x, axis=0) - print(out2) + print(out2) # [2, 2, 0, 1] out3 = paddle.argmax(x, axis=-1) - print(out3) + print(out3) # [2, 3, 1] out4 = paddle.argmax(x, axis=0, keepdim=True) print(out4) @@ -220,7 +220,7 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): be int32, int64. The default value is 'int64', and it will return the int64 indices. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor, return the tensor of `int32` if set :attr:`dtype` is `int32`, otherwise return the tensor of `int64`. 
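For the ``argmax``/``argmin`` signatures shown above, a small sketch of ``keepdim`` and the index ``dtype`` option (illustrative values, assuming a recent Paddle build):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[5., 2., 9.],
                          [1., 8., 3.]])
    # keepdim keeps the reduced axis with size 1; dtype selects int32/int64 indices.
    idx_max = paddle.argmax(x, axis=1, keepdim=True, dtype='int32')
    idx_min = paddle.argmin(x, axis=1, keepdim=True)
    print(idx_max)  # [[2], [1]]
    print(idx_min)  # [[1], [0]]

Per the docstrings, the returned indices default to ``int64`` unless ``dtype='int32'`` is requested.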
@@ -235,10 +235,10 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): out1 = paddle.argmin(x) print(out1) # 4 out2 = paddle.argmin(x, axis=0) - print(out2) + print(out2) # [1, 1, 1, 2] out3 = paddle.argmin(x, axis=-1) - print(out3) + print(out3) # [0, 0, 2] out4 = paddle.argmin(x, axis=0, keepdim=True) print(out4) @@ -289,10 +289,10 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): def index_select(x, index, axis=0, name=None): """ - Returns a new tensor which indexes the ``input`` tensor along dimension ``axis`` using - the entries in ``index`` which is a Tensor. The returned tensor has the same number - of dimensions as the original ``x`` tensor. The dim-th dimension has the same - size as the length of ``index``; other dimensions have the same size as in the ``x`` tensor. + Returns a new tensor which indexes the ``input`` tensor along dimension ``axis`` using + the entries in ``index`` which is a Tensor. The returned tensor has the same number + of dimensions as the original ``x`` tensor. The dim-th dimension has the same + size as the length of ``index``; other dimensions have the same size as in the ``x`` tensor. Args: x (Tensor): The input Tensor to be operated. The data of ``x`` can be one of float32, float64, int32, int64. @@ -302,10 +302,10 @@ def index_select(x, index, axis=0, name=None): Returns: Tensor: A Tensor with same data type as ``x``. - + Examples: .. code-block:: python - + import paddle x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], @@ -348,12 +348,12 @@ def index_select(x, index, axis=0, name=None): def nonzero(x, as_tuple=False): """ - Return a tensor containing the indices of all non-zero elements of the `input` - tensor. If as_tuple is True, return a tuple of 1-D tensors, one for each dimension - in `input`, each containing the indices (in that dimension) of all non-zero elements - of `input`. Given a n-Dimensional `input` tensor with shape [x_1, x_2, ..., x_n], If - as_tuple is False, we can get a output tensor with shape [z, n], where `z` is the - number of all non-zero elements in the `input` tensor. If as_tuple is True, we can get + Return a tensor containing the indices of all non-zero elements of the `input` + tensor. If as_tuple is True, return a tuple of 1-D tensors, one for each dimension + in `input`, each containing the indices (in that dimension) of all non-zero elements + of `input`. Given a n-Dimensional `input` tensor with shape [x_1, x_2, ..., x_n], If + as_tuple is False, we can get a output tensor with shape [z, n], where `z` is the + number of all non-zero elements in the `input` tensor. If as_tuple is True, we can get a 1-D tensor tuple of length `n`, and the shape of each 1-D tensor is [z, 1]. Args: @@ -442,7 +442,7 @@ def sort(x, axis=-1, descending=False, name=None): algorithm will sort by descending order, else sort by ascending order. Default is false. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: sorted tensor(with the same shape and data type as ``x``). Examples: @@ -456,7 +456,7 @@ def sort(x, axis=-1, descending=False, name=None): [6,9,2,4]], [[5,2,4,2], [4,7,7,9], - [1,7,0,6]]], + [1,7,0,6]]], dtype='float32') out1 = paddle.sort(x=x, axis=-1) out2 = paddle.sort(x=x, axis=0) @@ -529,7 +529,7 @@ def mode(x, axis=-1, keepdim=False, name=None): .. 
code-block:: python import paddle - + tensor = paddle.to_tensor([[[1,2,2],[2,3,3]],[[0,5,5],[9,9,0]]], dtype=paddle.float32) res = paddle.mode(tensor, 2) print(res) @@ -538,7 +538,7 @@ def mode(x, axis=-1, keepdim=False, name=None): # [5., 9.]]), Tensor(shape=[2, 2], dtype=int64, place=CUDAPlace(0), stop_gradient=True, # [[1, 1], # [1, 0]])) - + """ if in_dygraph_mode(): return _C_ops.mode(x, axis, keepdim) @@ -590,18 +590,18 @@ def where(condition, x=None, y=None, name=None): Tensor: A Tensor with the same shape as :attr:`condition` and same data type as :attr:`x` and :attr:`y`. Examples: - + .. code-block:: python import paddle - + x = paddle.to_tensor([0.9383, 0.1983, 3.2, 1.2]) y = paddle.to_tensor([1.0, 1.0, 1.0, 1.0]) - + out = paddle.where(x>1, x, y) print(out) #out: [1.0, 1.0, 3.2, 1.2] - + out = paddle.where(x>1) print(out) #out: (Tensor(shape=[2, 1], dtype=int64, place=CPUPlace, stop_gradient=True, @@ -676,8 +676,8 @@ def index_sample(x, index): """ **IndexSample Layer** - IndexSample OP returns the element of the specified location of X, - and the location is specified by Index. + IndexSample OP returns the element of the specified location of X, + and the location is specified by Index. .. code-block:: text @@ -696,9 +696,9 @@ def index_sample(x, index): [6, 8, 10]] Args: - x (Tensor): The source input tensor with 2-D shape. Supported data type is + x (Tensor): The source input tensor with 2-D shape. Supported data type is int32, int64, float32, float64. - index (Tensor): The index input tensor with 2-D shape, first dimension should be same with X. + index (Tensor): The index input tensor with 2-D shape, first dimension should be same with X. Data type is int32 or int64. Returns: @@ -774,13 +774,13 @@ def masked_select(x, mask, name=None): which is a tensor with data type of bool. Args: - x (Tensor): The input Tensor, the data type can be int32, int64, float32, float64. + x (Tensor): The input Tensor, the data type can be int32, int64, float32, float64. mask (Tensor): The Tensor containing the binary mask to index with, it's data type is bool. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - Returns: + Returns: A 1-D Tensor which is the same data type as ``x``. - + Examples: .. code-block:: python @@ -833,7 +833,7 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None): largest(bool, optional) : largest is a flag, if set to true, algorithm will sort by descending order, otherwise sort by ascending order. Default is True. - sorted(bool, optional): controls whether to return the elements in sorted order, default value is True. In gpu device, it always return the sorted value. + sorted(bool, optional): controls whether to return the elements in sorted order, default value is True. In gpu device, it always return the sorted value. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -914,19 +914,19 @@ def bucketize(x, sorted_sequence, out_int32=False, right=False, name=None): Args: x(Tensor): An input N-D tensor value with type int32, int64, float32, float64. - sorted_sequence(Tensor): An input 1-D tensor with type int32, int64, float32, float64. The value of the tensor monotonically increases in the innermost dimension. + sorted_sequence(Tensor): An input 1-D tensor with type int32, int64, float32, float64. The value of the tensor monotonically increases in the innermost dimension. 
out_int32(bool, optional): Data type of the output tensor which can be int32, int64. The default value is False, and it indicates that the output data type is int64. right(bool, optional): Find the upper or lower bounds of the sorted_sequence range in the innermost dimension based on the given `x`. If the value of the sorted_sequence is nan or inf, return the size of the innermost dimension. - The default value is False and it shows the lower bounds. + The default value is False and it shows the lower bounds. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: - Tensor(the same sizes of the `x`), return the tensor of int32 if set :attr:`out_int32` is True, otherwise return the tensor of int64. - + Tensor(the same sizes of the `x`), return the tensor of int32 if set :attr:`out_int32` is True, otherwise return the tensor of int64. + Examples: .. code-block:: python - + import paddle sorted_sequence = paddle.to_tensor([2, 4, 8, 16], dtype='int32') @@ -951,7 +951,7 @@ def bucketize(x, sorted_sequence, out_int32=False, right=False, name=None): # Tensor(shape=[2, 4], dtype=int64, place=CPUPlace, stop_gradient=True, # [[0, 3, 2, 4], # [0, 1, 3, 2]]) - + """ check_variable_and_dtype(sorted_sequence, 'SortedSequence', ['float32', 'float64', 'int32', 'int64'], @@ -972,20 +972,20 @@ def searchsorted(sorted_sequence, Find the index of the corresponding `sorted_sequence` in the innermost dimension based on the given `values`. Args: - sorted_sequence(Tensor): An input N-D or 1-D tensor with type int32, int64, float32, float64. The value of the tensor monotonically increases in the innermost dimension. + sorted_sequence(Tensor): An input N-D or 1-D tensor with type int32, int64, float32, float64. The value of the tensor monotonically increases in the innermost dimension. values(Tensor): An input N-D tensor value with type int32, int64, float32, float64. out_int32(bool, optional): Data type of the output tensor which can be int32, int64. The default value is False, and it indicates that the output data type is int64. right(bool, optional): Find the upper or lower bounds of the sorted_sequence range in the innermost dimension based on the given `values`. If the value of the sorted_sequence is nan or inf, return the size of the innermost dimension. - The default value is False and it shows the lower bounds. + The default value is False and it shows the lower bounds. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: - Tensor(the same sizes of the `values`), return the tensor of int32 if set :attr:`out_int32` is True, otherwise return the tensor of int64. - + Tensor(the same sizes of the `values`), return the tensor of int32 if set :attr:`out_int32` is True, otherwise return the tensor of int64. + Examples: .. 
code-block:: python - + import paddle sorted_sequence = paddle.to_tensor([[1, 3, 5, 7, 9, 11], @@ -1002,12 +1002,12 @@ def searchsorted(sorted_sequence, # [[2, 3, 5, 5], # [1, 3, 4, 5]]) sorted_sequence_1d = paddle.to_tensor([1, 3, 5, 7, 9, 11, 13]) - out3 = paddle.searchsorted(sorted_sequence_1d, values) + out3 = paddle.searchsorted(sorted_sequence_1d, values) print(out3) # Tensor(shape=[2, 4], dtype=int64, place=CUDAPlace(0), stop_gradient=True, # [[1, 3, 4, 5], # [1, 3, 4, 5]]) - + """ if in_dygraph_mode(): return _C_ops.searchsorted(sorted_sequence, values, out_int32, right) @@ -1055,13 +1055,13 @@ def kthvalue(x, k, axis=None, keepdim=False, name=None): Returns: tuple(Tensor), return the values and indices. The value data type is the same as the input `x`. The indices data type is int64. - + Examples: .. code-block:: python - + import paddle - + x = paddle.randn((2,3,2)) # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # [[[ 0.22954939, -0.01296274], @@ -1070,8 +1070,8 @@ def kthvalue(x, k, axis=None, keepdim=False, name=None): # # [[ 0.15104349, -0.93965352], # [ 0.14745511, 0.98209465], - # [ 0.10732264, -0.55859774]]]) - y = paddle.kthvalue(x, 2, 1) + # [ 0.10732264, -0.55859774]]]) + y = paddle.kthvalue(x, 2, 1) # (Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # [[ 0.22954939, -0.17573971], # [ 0.14745511, -0.55859774]]), Tensor(shape=[2, 2], dtype=int64, place=CUDAPlace(0), stop_gradient=True, diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index b3e14784c3d..b5946459d34 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -129,10 +129,10 @@ def var(x, axis=None, unbiased=True, keepdim=False, name=None): Args: x (Tensor): The input Tensor with data type float32, float64. - axis (int|list|tuple, optional): The axis along which to perform variance calculations. ``axis`` should be int, list(int) or tuple(int). - - - If ``axis`` is a list/tuple of dimension(s), variance is calculated along all element(s) of ``axis`` . ``axis`` or element(s) of ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . - - If ``axis`` or element(s) of ``axis`` is less than 0, it works the same way as :math:`axis + D` . + axis (int|list|tuple, optional): The axis along which to perform variance calculations. ``axis`` should be int, list(int) or tuple(int). + + - If ``axis`` is a list/tuple of dimension(s), variance is calculated along all element(s) of ``axis`` . ``axis`` or element(s) of ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . + - If ``axis`` or element(s) of ``axis`` is less than 0, it works the same way as :math:`axis + D` . - If ``axis`` is None, variance is calculated over all elements of ``x``. Default is None. unbiased (bool, optional): Whether to use the unbiased estimation. If ``unbiased`` is True, the divisor used in the computation is :math:`N - 1`, where :math:`N` represents the number of elements along ``axis`` , otherwise the divisor is :math:`N`. Default is True. @@ -236,7 +236,7 @@ def numel(x, name=None): .. 
code-block:: python import paddle - + x = paddle.full(shape=[4, 5, 7], fill_value=0, dtype='int32') numel = paddle.numel(x) # 140 diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index fb21c793f42..5dcaf785929 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -44,8 +44,8 @@ def set_printoptions(precision=None, edgeitems (int, optional): Number of elements in summary at the beginning and ending of each dimension, default 3. sci_mode (bool, optional): Format the floating number with scientific notation or not, default False. linewidth (int, optional): Number of characters each line, default 80. - - + + Returns: None. @@ -58,7 +58,7 @@ def set_printoptions(precision=None, a = paddle.rand([10, 20]) paddle.set_printoptions(4, 100, 3) print(a) - + ''' Tensor(shape=[10, 20], dtype=float32, place=CUDAPlace(0), stop_gradient=True, [[0.0002, 0.8503, 0.0135, ..., 0.9508, 0.2621, 0.6661], diff --git a/python/paddle/text/datasets/conll05.py b/python/paddle/text/datasets/conll05.py index 09f54d674fd..119ce9fea51 100644 --- a/python/paddle/text/datasets/conll05.py +++ b/python/paddle/text/datasets/conll05.py @@ -299,9 +299,9 @@ class Conll05st(Dataset): Get the word, verb and label dictionary of Wikipedia corpus. Examples: - + .. code-block:: python - + from paddle.text.datasets import Conll05st conll05st = Conll05st() @@ -314,9 +314,9 @@ class Conll05st(Dataset): Get the embedding dictionary file. Examples: - + .. code-block:: python - + from paddle.text.datasets import Conll05st conll05st = Conll05st() diff --git a/python/paddle/text/datasets/uci_housing.py b/python/paddle/text/datasets/uci_housing.py index c283aeaf733..97fa8e6acfa 100644 --- a/python/paddle/text/datasets/uci_housing.py +++ b/python/paddle/text/datasets/uci_housing.py @@ -47,7 +47,7 @@ class UCIHousing(Dataset): Dataset: instance of UCI housing dataset. Examples: - + .. code-block:: python import paddle diff --git a/python/paddle/text/datasets/wmt14.py b/python/paddle/text/datasets/wmt14.py index 133c304a02a..82d11459268 100644 --- a/python/paddle/text/datasets/wmt14.py +++ b/python/paddle/text/datasets/wmt14.py @@ -182,14 +182,14 @@ class WMT14(Dataset): Args: reverse (bool): wether to reverse key and value in dictionary, i.e. key: value to value: key. - + Returns: Two dictionaries, the source and target dictionary. - + Examples: - + .. code-block:: python - + from paddle.text.datasets import WMT14 wmt14 = WMT14(mode='train', dict_size=50) src_dict, trg_dict = wmt14.get_dict() diff --git a/python/paddle/text/datasets/wmt16.py b/python/paddle/text/datasets/wmt16.py index ee2245ae4fe..f031ee68e32 100644 --- a/python/paddle/text/datasets/wmt16.py +++ b/python/paddle/text/datasets/wmt16.py @@ -240,9 +240,9 @@ class WMT16(Dataset): dict: The word dictionary for the specific language. Examples: - + .. code-block:: python - + from paddle.text.datasets import WMT16 wmt16 = WMT16(mode='train', src_dict_size=50, trg_dict_size=50) en_dict = wmt16.get_dict('en') diff --git a/python/paddle/text/viterbi_decode.py b/python/paddle/text/viterbi_decode.py index a3c81b9c8e6..aa289a270c2 100644 --- a/python/paddle/text/viterbi_decode.py +++ b/python/paddle/text/viterbi_decode.py @@ -28,13 +28,13 @@ def viterbi_decode(potentials, name=None): """ Decode the highest scoring sequence of tags computed by transitions and potentials and get the viterbi path. - + Args: potentials (Tensor): The input tensor of unary emission. 
This is a 3-D - tensor with shape of [batch_size, sequence_length, num_tags]. The data type is float32 or float64. + tensor with shape of [batch_size, sequence_length, num_tags]. The data type is float32 or float64. transition_params (Tensor): The input tensor of transition matrix. This is a 2-D - tensor with shape of [num_tags, num_tags]. The data type is float32 or float64. - lengths (Tensor): The input tensor of length of each sequence. This is a 1-D tensor with shape of [batch_size]. The data type is int64. + tensor with shape of [num_tags, num_tags]. The data type is float32 or float64. + lengths (Tensor): The input tensor of length of each sequence. This is a 1-D tensor with shape of [batch_size]. The data type is int64. include_bos_eos_tag (`bool`, optional): If set to True, the last row and the last column of transitions will be considered as start tag, the second to last row and the second to last column of transitions will be considered as stop tag. Defaults to ``True``. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please @@ -91,8 +91,8 @@ def viterbi_decode(potentials, class ViterbiDecoder(Layer): - """ - Decode the highest scoring sequence of tags computed by transitions and potentials and get the viterbi path. + """ + Decode the highest scoring sequence of tags computed by transitions and potentials and get the viterbi path. Args: transitions (`Tensor`): The transition matrix. Its dtype is float32 and has a shape of `[num_tags, num_tags]`. @@ -102,10 +102,10 @@ class ViterbiDecoder(Layer): refer to :ref:`api_guide_Name`. Shape: - potentials (Tensor): The input tensor of unary emission. This is a 3-D tensor with shape of - [batch_size, sequence_length, num_tags]. The data type is float32 or float64. + potentials (Tensor): The input tensor of unary emission. This is a 3-D tensor with shape of + [batch_size, sequence_length, num_tags]. The data type is float32 or float64. lengths (Tensor): The input tensor of length of each sequence. This is a 1-D tensor with shape of - [batch_size]. The data type is int64. + [batch_size]. The data type is int64. Returns: scores(Tensor): The output tensor containing the score for the Viterbi sequence. The shape is [batch_size] diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 0367e9ed3e3..220246ef615 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -51,33 +51,33 @@ if core.is_compiled_with_rocm(): def setup(**attr): """ The interface is used to config the process of compiling customized operators, - mainly includes how to compile shared library, automatically generate python API + mainly includes how to compile shared library, automatically generate python API and install it into site-package. It supports using customized operators directly with ``import`` statement. It encapsulates the python built-in ``setuptools.setup`` function and keeps arguments and usage same as the native interface. Meanwhile, it hiddens Paddle inner framework concepts, such as necessary compiling flags, included paths of head files, and linking - flags. It also will automatically search and valid local environment and versions of - ``cc(Linux)`` , ``cl.exe(Windows)`` and ``nvcc`` , then compiles customized operators + flags. 
It also will automatically search and valid local environment and versions of + ``cc(Linux)`` , ``cl.exe(Windows)`` and ``nvcc`` , then compiles customized operators supporting CPU or GPU device according to the specified Extension type. - Moreover, `ABI compatibility `_ + Moreover, `ABI compatibility `_ will be checked to ensure that compiler version from ``cc(Linux)`` , ``cl.exe(Windows)`` on local machine is compatible with pre-installed Paddle whl in python site-packages. - For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, - then the version of user's local machine should satisfy GCC >= 8.2. - For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of - PaddlePaddle (Visual Studio 2017). - If the above conditions are not met, the corresponding warning will be printed, and a fatal error may + For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, + then the version of user's local machine should satisfy GCC >= 8.2. + For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of + PaddlePaddle (Visual Studio 2017). + If the above conditions are not met, the corresponding warning will be printed, and a fatal error may occur because of ABI compatibility. .. note:: - + 1. Currently we support Linux, MacOS and Windows platfrom. 2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` . - Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking + Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking GCC version. 3. On Windows platform, we recommend to install `` Visual Studio`` (>=2017). @@ -86,11 +86,11 @@ def setup(**attr): ``python setup.py install`` . Then customized operators API will be available everywhere after importing it. - A simple example of ``setup.py`` as followed: + A simple example of ``setup.py`` as followed: .. code-block:: text - # setup.py + # setup.py # Case 1: Compiling customized operators supporting CPU and GPU devices from paddle.utils.cpp_extension import CUDAExtension, setup @@ -124,11 +124,11 @@ def setup(**attr): x = paddle.randn([4, 10], dtype='float32') relu_out = relu(x) tanh_out = tanh(x) - + Args: name(str): Specify the name of shared library file and installed python package. - ext_modules(Extension): Specify the Extension instance including customized operator source files, compiling flags et.al. + ext_modules(Extension): Specify the Extension instance including customized operator source files, compiling flags et.al. If only compile operator supporting CPU device, please use ``CppExtension`` ; If compile operator supporting CPU and GPU devices, please use ``CUDAExtension`` . include_dirs(list[str], optional): Specify the extra include directories to search head files. The interface will automatically add @@ -139,7 +139,7 @@ def setup(**attr): compiler using dict type with ``{'cxx': [...], 'nvcc': [...]}`` . Default is None. **attr(dict, optional): Specify other arguments same as ``setuptools.setup`` . - Returns: + Returns: None """ @@ -219,7 +219,7 @@ def CppExtension(sources, *args, **kwargs): .. code-block:: text - # setup.py + # setup.py # Compiling customized operators supporting only CPU device from paddle.utils.cpp_extension import CppExtension, setup @@ -269,7 +269,7 @@ def CUDAExtension(sources, *args, **kwargs): .. 
code-block:: text - # setup.py + # setup.py # Compiling customized operators supporting CPU and GPU devices from paddle.utils.cpp_extension import CUDAExtension, setup @@ -750,22 +750,22 @@ def load(name, append user defined custom operators in background while building models. It will perform compiling, linking, Python API generation and module loading - processes under a individual subprocess. It does not require CMake or Ninja - environment. On Linux platform, it requires GCC compiler whose version is - greater than 5.4 and it should be soft linked to ``/usr/bin/cc`` . On Windows + processes under a individual subprocess. It does not require CMake or Ninja + environment. On Linux platform, it requires GCC compiler whose version is + greater than 5.4 and it should be soft linked to ``/usr/bin/cc`` . On Windows platform, it requires Visual Studio whose version is greater than 2017. - On MacOS, clang++ is requited. In addition, if compiling Operators supporting + On MacOS, clang++ is requited. In addition, if compiling Operators supporting GPU device, please make sure ``nvcc`` compiler is installed in local environment. - - Moreover, `ABI compatibility `_ + + Moreover, `ABI compatibility `_ will be checked to ensure that compiler version from ``cc(Linux)`` , ``cl.exe(Windows)`` on local machine is compatible with pre-installed Paddle whl in python site-packages. - For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, - then the version of user's local machine should satisfy GCC >= 8.2. - For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of - PaddlePaddle (Visual Studio 2017). - If the above conditions are not met, the corresponding warning will be printed, and a fatal error may + For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, + then the version of user's local machine should satisfy GCC >= 8.2. + For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of + PaddlePaddle (Visual Studio 2017). + If the above conditions are not met, the corresponding warning will be printed, and a fatal error may occur because of ABI compatibility. Compared with ``setup`` interface, it doesn't need extra ``setup.py`` and excute @@ -776,7 +776,7 @@ def load(name, 1. Currently we support Linux, MacOS and Windows platfrom. 2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` . - Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking + Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking GCC version. 3. On Windows platform, we recommend to install `` Visual Studio`` (>=2017). @@ -784,7 +784,7 @@ def load(name, **A simple example:** .. code-block:: text - + import paddle from paddle.utils.cpp_extension import load @@ -807,7 +807,7 @@ def load(name, extra_cxx_cflags(list[str], optional): Specify additional flags used to compile CPP files. By default all basic and framework related flags have been included. extra_cuda_cflags(list[str], optional): Specify additional flags used to compile CUDA files. By default - all basic and framework related flags have been included. + all basic and framework related flags have been included. See `Cuda Compiler Driver NVCC `_ for details. Default is None. extra_ldflags(list[str], optional): Specify additional flags used to link shared library. 
See diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 62fce336004..6e6047ccda0 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -81,7 +81,7 @@ WRONG_COMPILER_WARNING = ''' !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -Found that your compiler ({user_compiler}) is not compatible with the compiler +Found that your compiler ({user_compiler}) is not compatible with the compiler built Paddle for this platform, which is {paddle_compiler} on {platform}. Please use {paddle_compiler} to compile your custom op. Or you may compile Paddle from source using {user_compiler}, and then also use it compile your custom op. @@ -145,10 +145,10 @@ def custom_write_stub(resource, pyfile): import sys import types import paddle - + cur_dir = os.path.dirname(os.path.abspath(__file__)) so_path = os.path.join(cur_dir, "{resource}") - + def inject_ext_module(module_name, api_names): if module_name in sys.modules: return sys.modules[module_name] @@ -165,7 +165,7 @@ def custom_write_stub(resource, pyfile): # load custom op shared library with abs path new_custom_ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(so_path) m = inject_ext_module(__name__, new_custom_ops) - + __bootstrap__() {custom_api} @@ -281,7 +281,7 @@ def combine_hash(md5, value): def clean_object_if_change_cflags(so_path, extension): """ - If already compiling source before, we should check whether cflags + If already compiling source before, we should check whether cflags have changed and delete the built object to re-compile the source even though source file content keeps unchanaged. """ @@ -925,7 +925,7 @@ def _custom_api_content(op_name): from paddle.fluid.core import VarBase, CustomOpKernelContext from paddle.fluid.framework import _non_static_mode, _dygraph_tracer, _in_legacy_dygraph, in_dygraph_mode from paddle.fluid.layer_helper import LayerHelper - + def {op_name}({inputs}): # prepare inputs and outputs ins = {ins} diff --git a/python/paddle/utils/deprecated.py b/python/paddle/utils/deprecated.py index 5d4a8996936..00a38cb1368 100755 --- a/python/paddle/utils/deprecated.py +++ b/python/paddle/utils/deprecated.py @@ -43,12 +43,12 @@ def deprecated(update_to="", since="", reason="", level=0): since(str, optional): The version at which the decorated method is considered deprecated. update_to(str, optional): The new API users should use. reason(str, optional): The reason why the API is deprecated. - level(int, optional): The deprecated warning log level. It must be - an Integer and must be one of 0, 1, 2. - If `level == 0`, the warning message will not be showed. + level(int, optional): The deprecated warning log level. It must be + an Integer and must be one of 0, 1, 2. + If `level == 0`, the warning message will not be showed. If `level == 1`, the warning message will be showed normally. If `level == 2`, it will raise `RuntimeError`. - + Returns: decorator: decorated function or class. """ diff --git a/python/paddle/utils/dlpack.py b/python/paddle/utils/dlpack.py index 1ece08daa27..43c9e6b1787 100644 --- a/python/paddle/utils/dlpack.py +++ b/python/paddle/utils/dlpack.py @@ -34,7 +34,7 @@ def to_dlpack(x): Returns: dltensor, and the data type is PyCapsule. - + Examples: .. code-block:: python @@ -62,12 +62,12 @@ def to_dlpack(x): def from_dlpack(dlpack): """ Decodes a DLPack to a tensor. 
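A minimal round-trip sketch for the ``to_dlpack``/``from_dlpack`` pair touched above (assuming a recent Paddle build; the tensor values are illustrative):

.. code-block:: python

    import paddle
    from paddle.utils import dlpack

    x = paddle.to_tensor([[0.2, 0.3], [0.6, 0.7]])
    capsule = dlpack.to_dlpack(x)    # encode into a PyCapsule
    y = dlpack.from_dlpack(capsule)  # decode back into a paddle Tensor
    print(paddle.allclose(x, y))     # True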
- + Args: dlpack (PyCapsule): a PyCapsule object with the dltensor. Returns: - out (Tensor): a tensor decoded from DLPack. One thing to be noted, if we get + out (Tensor): a tensor decoded from DLPack. One thing to be noted, if we get an input dltensor with data type as `bool`, we return the decoded tensor as `uint8`. @@ -83,7 +83,7 @@ def from_dlpack(dlpack): print(x) # Tensor(shape=[2, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # [[0.20000000, 0.30000001, 0.50000000, 0.89999998], - # [0.10000000, 0.20000000, 0.60000002, 0.69999999]]) + # [0.10000000, 0.20000000, 0.60000002, 0.69999999]]) """ t = type(dlpack) diff --git a/python/paddle/vision/datasets/flowers.py b/python/paddle/vision/datasets/flowers.py index 722f52acf69..99d20ee3874 100644 --- a/python/paddle/vision/datasets/flowers.py +++ b/python/paddle/vision/datasets/flowers.py @@ -64,7 +64,7 @@ class Flowers(Dataset): :ref:`api_paddle_io_Dataset`. An instance of Flowers dataset. Examples: - + .. code-block:: python import itertools diff --git a/python/paddle/vision/datasets/folder.py b/python/paddle/vision/datasets/folder.py index 0d874765729..4a024fa2779 100644 --- a/python/paddle/vision/datasets/folder.py +++ b/python/paddle/vision/datasets/folder.py @@ -236,7 +236,7 @@ class DatasetFolder(Dataset): dir (string): Root directory path. Returns: - tuple: (classes, class_to_idx) where classes are relative to (dir), + tuple: (classes, class_to_idx) where classes are relative to (dir), and class_to_idx is a dictionary. """ diff --git a/python/paddle/vision/datasets/mnist.py b/python/paddle/vision/datasets/mnist.py index 34049ed2f72..07464b197cc 100644 --- a/python/paddle/vision/datasets/mnist.py +++ b/python/paddle/vision/datasets/mnist.py @@ -44,12 +44,12 @@ class MNIST(Dataset): PIL.Image or numpy.ndarray. Should be one of {'pil', 'cv2'}. If this option is not set, will get backend from :ref:`paddle.vision.get_image_backend `, default backend is 'pil'. Default: None. - + Returns: :ref:`api_paddle_io_Dataset`. An instance of MNIST dataset. Examples: - + .. code-block:: python import itertools @@ -226,12 +226,12 @@ class FashionMNIST(MNIST): PIL.Image or numpy.ndarray. Should be one of {'pil', 'cv2'}. If this option is not set, will get backend from :ref:`paddle.vision.get_image_backend `, default backend is 'pil'. Default: None. - + Returns: :ref:`api_paddle_io_Dataset`. An instance of FashionMNIST dataset. Examples: - + .. code-block:: python import itertools diff --git a/python/paddle/vision/image.py b/python/paddle/vision/image.py index 755c8bcc9cc..c30b99799c0 100644 --- a/python/paddle/vision/image.py +++ b/python/paddle/vision/image.py @@ -22,15 +22,15 @@ _image_backend = 'pil' def set_image_backend(backend): """ - Specifies the backend used to load images in class ``paddle.vision.datasets.ImageFolder`` - and ``paddle.vision.datasets.DatasetFolder`` . Now support backends are pillow and opencv. - If backend not set, will use 'pil' as default. + Specifies the backend used to load images in class ``paddle.vision.datasets.ImageFolder`` + and ``paddle.vision.datasets.DatasetFolder`` . Now support backends are pillow and opencv. + If backend not set, will use 'pil' as default. Args: backend (str): Name of the image load backend, should be one of {'pil', 'cv2'}. Examples: - + .. code-block:: python import os @@ -95,7 +95,7 @@ def get_image_backend(): str: backend of image load. Examples: - + .. 
code-block:: python from paddle.vision import get_image_backend @@ -113,14 +113,14 @@ def image_load(path, backend=None): Args: path (str): Path of the image. backend (str, optional): The image decoding backend type. Options are - `cv2`, `pil`, `None`. If backend is None, the global _imread_backend + `cv2`, `pil`, `None`. If backend is None, the global _imread_backend specified by ``paddle.vision.set_image_backend`` will be used. Default: None. Returns: PIL.Image or np.array: Loaded image. Examples: - + .. code-block:: python import numpy as np @@ -133,7 +133,7 @@ def image_load(path, backend=None): fake_img.save(path) set_image_backend('pil') - + pil_img = image_load(path).convert('RGB') # should be PIL.Image.Image @@ -145,7 +145,7 @@ def image_load(path, backend=None): # np_img = image_load(path) # # should get numpy.ndarray # print(type(np_img)) - + """ if backend is None: diff --git a/python/paddle/vision/models/alexnet.py b/python/paddle/vision/models/alexnet.py index b24afac253f..61fd2334b63 100644 --- a/python/paddle/vision/models/alexnet.py +++ b/python/paddle/vision/models/alexnet.py @@ -77,7 +77,7 @@ class AlexNet(nn.Layer): `_. Args: - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. Returns: @@ -196,7 +196,7 @@ def alexnet(pretrained=False, **kwargs): Returns: :ref:`api_paddle_nn_Layer`. An instance of AlexNet model. - + Examples: .. code-block:: python diff --git a/python/paddle/vision/models/densenet.py b/python/paddle/vision/models/densenet.py index 5580b97a3a8..1178a09afcb 100644 --- a/python/paddle/vision/models/densenet.py +++ b/python/paddle/vision/models/densenet.py @@ -191,7 +191,7 @@ class DenseNet(nn.Layer): layers (int, optional): Layers of DenseNet. Default: 121. bn_size (int, optional): Expansion of growth rate in the middle layer. Default: 4. dropout (float, optional): Dropout rate. Default: :math:`0.0`. - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. diff --git a/python/paddle/vision/models/googlenet.py b/python/paddle/vision/models/googlenet.py index b5fc9ae4ab2..69265bab17c 100644 --- a/python/paddle/vision/models/googlenet.py +++ b/python/paddle/vision/models/googlenet.py @@ -98,9 +98,9 @@ class Inception(nn.Layer): class GoogLeNet(nn.Layer): """GoogLeNet (Inception v1) model architecture from `"Going Deeper with Convolutions" `_. - + Args: - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. @@ -224,7 +224,7 @@ class GoogLeNet(nn.Layer): def googlenet(pretrained=False, **kwargs): """GoogLeNet (Inception v1) model architecture from `"Going Deeper with Convolutions" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. 
diff --git a/python/paddle/vision/models/inceptionv3.py b/python/paddle/vision/models/inceptionv3.py index a2a06de3d85..461959ea62d 100644 --- a/python/paddle/vision/models/inceptionv3.py +++ b/python/paddle/vision/models/inceptionv3.py @@ -417,7 +417,7 @@ class InceptionV3(nn.Layer): `"Rethinking the Inception Architecture for Computer Vision" `_. Args: - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. diff --git a/python/paddle/vision/models/lenet.py b/python/paddle/vision/models/lenet.py index 1d446569a66..44cfed2ef30 100644 --- a/python/paddle/vision/models/lenet.py +++ b/python/paddle/vision/models/lenet.py @@ -23,7 +23,7 @@ class LeNet(nn.Layer): `"Gradient-based learning applied to document recognition" `_. Args: - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 10. Returns: diff --git a/python/paddle/vision/models/mobilenetv1.py b/python/paddle/vision/models/mobilenetv1.py index 6359fcffcdf..22be953f2c1 100644 --- a/python/paddle/vision/models/mobilenetv1.py +++ b/python/paddle/vision/models/mobilenetv1.py @@ -59,7 +59,7 @@ class MobileNetV1(nn.Layer): Args: scale (float, optional): Scale of channels in each layer. Default: 1.0. - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. @@ -222,7 +222,7 @@ def _mobilenet(arch, pretrained=False, **kwargs): def mobilenet_v1(pretrained=False, scale=1.0, **kwargs): """MobileNetV1 from `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py index 88fa8ebf6e6..f67ea54fa63 100644 --- a/python/paddle/vision/models/mobilenetv2.py +++ b/python/paddle/vision/models/mobilenetv2.py @@ -76,7 +76,7 @@ class MobileNetV2(nn.Layer): Args: scale (float, optional): Scale of channels in each layer. Default: 1.0. - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. @@ -186,7 +186,7 @@ def _mobilenet(arch, pretrained=False, **kwargs): def mobilenet_v2(pretrained=False, scale=1.0, **kwargs): """MobileNetV2 from `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. 
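The vision-model hunks above all document the same ``num_classes``/``scale`` convention. A small sketch with ``mobilenet_v2`` (illustrative, assuming a recent Paddle build):

.. code-block:: python

    import paddle
    from paddle.vision.models import mobilenet_v2

    # scale widens/narrows every layer; num_classes sizes the final fc layer.
    model = mobilenet_v2(scale=1.0, num_classes=10)
    x = paddle.rand([1, 3, 224, 224])
    print(model(x).shape)  # [1, 10]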
diff --git a/python/paddle/vision/models/mobilenetv3.py b/python/paddle/vision/models/mobilenetv3.py index 8d3c8520ec6..472443303a3 100644 --- a/python/paddle/vision/models/mobilenetv3.py +++ b/python/paddle/vision/models/mobilenetv3.py @@ -254,7 +254,7 @@ class MobileNetV3Small(MobileNetV3): Args: scale (float, optional): Scale of channels in each layer. Default: 1.0. - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. @@ -305,7 +305,7 @@ class MobileNetV3Large(MobileNetV3): Args: scale (float, optional): Scale of channels in each layer. Default: 1.0. - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py index 4cf479c0572..9cc1d2d12c0 100644 --- a/python/paddle/vision/models/resnet.py +++ b/python/paddle/vision/models/resnet.py @@ -183,7 +183,7 @@ class ResNet(nn.Layer): Block (BasicBlock|BottleneckBlock): Block module of model. depth (int, optional): Layers of ResNet, Default: 50. width (int, optional): Base width per convolution group for each convolution block, Default: 64. - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. groups (int, optional): Number of groups for each convolution block, Default: 1. @@ -495,7 +495,7 @@ def resnet152(pretrained=False, **kwargs): def resnext50_32x4d(pretrained=False, **kwargs): """ResNeXt-50 32x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. @@ -530,7 +530,7 @@ def resnext50_32x4d(pretrained=False, **kwargs): def resnext50_64x4d(pretrained=False, **kwargs): """ResNeXt-50 64x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. @@ -565,7 +565,7 @@ def resnext50_64x4d(pretrained=False, **kwargs): def resnext101_32x4d(pretrained=False, **kwargs): """ResNeXt-101 32x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. @@ -601,7 +601,7 @@ def resnext101_32x4d(pretrained=False, **kwargs): def resnext101_64x4d(pretrained=False, **kwargs): """ResNeXt-101 64x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. 
@@ -637,7 +637,7 @@ def resnext101_64x4d(pretrained=False, **kwargs): def resnext152_32x4d(pretrained=False, **kwargs): """ResNeXt-152 32x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. @@ -673,7 +673,7 @@ def resnext152_32x4d(pretrained=False, **kwargs): def resnext152_64x4d(pretrained=False, **kwargs): """ResNeXt-152 64x4d model from `"Aggregated Residual Transformations for Deep Neural Networks" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. diff --git a/python/paddle/vision/models/shufflenetv2.py b/python/paddle/vision/models/shufflenetv2.py index 900a1928304..f40731e8f1c 100644 --- a/python/paddle/vision/models/shufflenetv2.py +++ b/python/paddle/vision/models/shufflenetv2.py @@ -195,7 +195,7 @@ class ShuffleNetV2(nn.Layer): Args: scale (float, optional): Scale of output channels. Default: True. act (str, optional): Activation function of neural network. Default: "relu". - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. diff --git a/python/paddle/vision/models/squeezenet.py b/python/paddle/vision/models/squeezenet.py index 997d50dca70..39ad8b54a70 100644 --- a/python/paddle/vision/models/squeezenet.py +++ b/python/paddle/vision/models/squeezenet.py @@ -80,7 +80,7 @@ class SqueezeNet(nn.Layer): Args: version (str): Version of SqueezeNet, which can be "1.0" or "1.1". - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. diff --git a/python/paddle/vision/models/vgg.py b/python/paddle/vision/models/vgg.py index 8fb0ea26005..c9558c9c382 100644 --- a/python/paddle/vision/models/vgg.py +++ b/python/paddle/vision/models/vgg.py @@ -33,7 +33,7 @@ class VGG(nn.Layer): Args: features (nn.Layer): Vgg features create by function make_layers. - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer + num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer will not be defined. Default: 1000. with_pool (bool, optional): Use pool before the last three fc layer or not. Default: True. @@ -142,7 +142,7 @@ def _vgg(arch, cfg, batch_norm, pretrained, **kwargs): def vgg11(pretrained=False, batch_norm=False, **kwargs): """VGG 11-layer model from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. @@ -179,7 +179,7 @@ def vgg11(pretrained=False, batch_norm=False, **kwargs): def vgg13(pretrained=False, batch_norm=False, **kwargs): """VGG 13-layer model from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. 
@@ -216,7 +216,7 @@ def vgg13(pretrained=False, batch_norm=False, **kwargs): def vgg16(pretrained=False, batch_norm=False, **kwargs): """VGG 16-layer model from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. @@ -253,7 +253,7 @@ def vgg16(pretrained=False, batch_norm=False, **kwargs): def vgg19(pretrained=False, batch_norm=False, **kwargs): """VGG 19-layer model from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_. - + Args: pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained on ImageNet. Default: False. diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 032fe4bd356..a83647349d9 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -47,14 +47,14 @@ def yolo_loss(x, This operator generates YOLOv3 loss based on given predict result and ground truth boxes. - + The output of previous network is in shape [N, C, H, W], while H and W - should be the same, H and W specify the grid size, each grid point predict + should be the same, H and W specify the grid size, each grid point predict given number bounding boxes, this given number, which following will be represented as S, is specified by the number of anchor clusters in each scale. In the second dimension(the channel - dimension), C should be equal to S * (class_num + 5), class_num is the object - category number of source dataset(such as 80 in coco dataset), so in the - second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + dimension), C should be equal to S * (class_num + 5), class_num is the object + category number of source dataset(such as 80 in coco dataset), so in the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, also includes confidence score of the box and class one-hot key of each anchor box. Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions @@ -77,21 +77,21 @@ def yolo_loss(x, and :math:`p_w, p_h` is specified by anchors. As for confidence score, it is the logistic regression value of IoU between - anchor boxes and ground truth boxes, the score of the anchor box which has - the max IoU should be 1, and if the anchor box has IoU bigger than ignore + anchor boxes and ground truth boxes, the score of the anchor box which has + the max IoU should be 1, and if the anchor box has IoU bigger than ignore thresh, the confidence score loss of this anchor box will be ignored. Therefore, the YOLOv3 loss consists of three major parts: box location loss, - objectness loss and classification loss. The L1 loss is used for - box coordinates (w, h), sigmoid cross entropy loss is used for box + objectness loss and classification loss. The L1 loss is used for + box coordinates (w, h), sigmoid cross entropy loss is used for box coordinates (x, y), objectness loss and classification loss. - Each groud truth box finds a best matching anchor box in all anchors. + Each groud truth box finds a best matching anchor box in all anchors. Prediction of this anchor box will incur all three parts of losses, and prediction of anchor boxes with no GT box matched will only incur objectness loss. 
- In order to trade off box coordinate losses between big boxes and small + In order to trade off box coordinate losses between big boxes and small boxes, box coordinate losses will be mutiplied by scale weight, which is calculated as follows. @@ -106,12 +106,12 @@ def yolo_loss(x, $$ While :attr:`use_label_smooth` is set to be :attr:`True`, the classification - target will be smoothed when calculating classification loss, target of + target will be smoothed when calculating classification loss, target of positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of negetive samples will be smoothed to :math:`1.0 / class\_num`. - While :attr:`gt_score` is given, which means the mixup score of ground truth - boxes, all losses incured by a ground truth box will be multiplied by its + While :attr:`gt_score` is given, which means the mixup score of ground truth + boxes, all losses incured by a ground truth box will be multiplied by its mixup score. Args: @@ -119,16 +119,16 @@ def yolo_loss(x, tensor with shape of [N, C, H, W]. H and W should be same, and the second dimension(C) stores box locations, confidence score and classification one-hot keys of each anchor box. - The data type is float32 or float64. + The data type is float32 or float64. gt_box (Tensor): groud truth boxes, should be in shape of [N, B, 4], - in the third dimension, x, y, w, h should be stored. + in the third dimension, x, y, w, h should be stored. x,y is the center coordinate of boxes, w, h are the - width and height, x, y, w, h should be divided by + width and height, x, y, w, h should be divided by input image height to scale to [0, 1]. - N is the batch number and B is the max box number in - an image.The data type is float32 or float64. + N is the batch number and B is the max box number in + an image.The data type is float32 or float64. gt_label (Tensor): class id of ground truth boxes, should be in shape - of [N, B].The data type is int32. + of [N, B].The data type is int32. anchors (list|tuple): The anchor width and height, it will be parsed pair by pair. anchor_mask (list|tuple): The mask index of anchors used in current @@ -137,13 +137,13 @@ def yolo_loss(x, ignore_thresh (float): The ignore threshold to ignore confidence loss. downsample_ratio (int): The downsample ratio from network input to YOLOv3 loss input, so 32, 16, 8 should be set for the - first, second, and thrid YOLOv3 loss operators. - name (string): The default value is None. Normally there is no need - for user to set this property. For more information, + first, second, and thrid YOLOv3 loss operators. + name (string): The default value is None. Normally there is no need + for user to set this property. For more information, please refer to :ref:`api_guide_Name` gt_score (Tensor): mixup score of ground truth boxes, should be in shape of [N, B]. Default None. - use_label_smooth (bool): Whether to use label smooth. Default True. + use_label_smooth (bool): Whether to use label smooth. Default True. scale_x_y (float): Scale the center point of decoded bounding box. 
Default 1.0 @@ -152,9 +152,9 @@ def yolo_loss(x, Raises: TypeError: Input x of yolov3_loss must be Tensor - TypeError: Input gtbox of yolov3_loss must be Tensor - TypeError: Input gtlabel of yolov3_loss must be Tensor - TypeError: Input gtscore of yolov3_loss must be None or Tensor + TypeError: Input gtbox of yolov3_loss must be Tensor + TypeError: Input gtlabel of yolov3_loss must be Tensor + TypeError: Input gtscore of yolov3_loss must be None or Tensor TypeError: Attr anchors of yolov3_loss must be list or tuple TypeError: Attr class_num of yolov3_loss must be an integer TypeError: Attr ignore_thresh of yolov3_loss must be a float number @@ -261,19 +261,19 @@ def yolo_box(x, r""" This operator generates YOLO detection boxes from output of YOLOv3 network. - + The output of previous network is in shape [N, C, H, W], while H and W - should be the same, H and W specify the grid size, each grid point predict + should be the same, H and W specify the grid size, each grid point predict given number boxes, this given number, which following will be represented as S, is specified by the number of anchors. In the second dimension(the channel dimension), C should be equal to S * (5 + class_num) if :attr:`iou_aware` is false, otherwise C should be equal to S * (6 + class_num). class_num is the object - category number of source dataset(such as 80 in coco dataset), so the - second(channel) dimension, apart from 4 box location coordinates x, y, w, h, - also includes confidence score of the box and class one-hot key of each anchor + category number of source dataset(such as 80 in coco dataset), so the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + also includes confidence score of the box and class one-hot key of each anchor box. - Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box + Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions should be as follows: $$ @@ -294,9 +294,9 @@ def yolo_box(x, The logistic regression value of the 5th channel of each anchor prediction boxes represents the confidence score of each prediction box, and the logistic - regression value of the last :attr:`class_num` channels of each anchor prediction + regression value of the last :attr:`class_num` channels of each anchor prediction boxes represents the classifcation scores. Boxes with confidence scores less than - :attr:`conf_thresh` should be ignored, and box final scores is the product of + :attr:`conf_thresh` should be ignored, and box final scores is the product of confidence scores and classification scores. $$ @@ -317,11 +317,11 @@ def yolo_box(x, shape of [N, C, H, W]. The second dimension(C) stores box locations, confidence score and classification one-hot keys of each anchor box. Generally, X should be the output of - YOLOv3 network. The data type is float32 or float64. + YOLOv3 network. The data type is float32 or float64. img_size (Tensor): The image size tensor of YoloBox operator, This is a 2-D tensor with shape of [N, 2]. This tensor holds height and width of each input image used for resizing - output box in input image scale. The data type is int32. + output box in input image scale. The data type is int32. anchors (list|tuple): The anchor width and height, it will be parsed pair by pair. class_num (int): The number of classes. @@ -336,15 +336,15 @@ def yolo_box(x, boundary. Default true. scale_x_y (float): Scale the center point of decoded bounding box. Default 1.0 - name (string): The default value is None. 
Normally there is no need - for user to set this property. For more information, + name (string): The default value is None. Normally there is no need + for user to set this property. For more information, please refer to :ref:`api_guide_Name` iou_aware (bool): Whether use iou aware. Default false iou_aware_factor (float): iou aware factor. Default 0.5 Returns: Tensor: A 3-D tensor with shape [N, M, 4], the coordinates of boxes, - and a 3-D tensor with shape [N, M, :attr:`class_num`], the classification + and a 3-D tensor with shape [N, M, :attr:`class_num`], the classification scores of boxes. Raises: @@ -723,9 +723,9 @@ class DeformConv2D(Layer): - offset: :math:`(N, 2 * H_f * W_f, H_{out}, W_{out})` - mask: :math:`(N, H_f * W_f, H_{out}, W_{out})` - output: :math:`(N, C_{out}, H_{out}, W_{out})` - + Where - + .. math:: H_{out}&= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 \\ @@ -842,12 +842,12 @@ def distribute_fpn_proposals(fpn_rois, rois_num=None, name=None): r""" - In Feature Pyramid Networks (FPN) models, it is needed to distribute - all proposals into different FPN level, with respect to scale of the proposals, - the referring scale and the referring level. Besides, to restore the order of - proposals, we return an array which indicates the original index of rois + In Feature Pyramid Networks (FPN) models, it is needed to distribute + all proposals into different FPN level, with respect to scale of the proposals, + the referring scale and the referring level. Besides, to restore the order of + proposals, we return an array which indicates the original index of rois in current proposals. To compute FPN level for each roi, the formula is given as follows: - + .. math:: roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level) @@ -856,30 +856,30 @@ def distribute_fpn_proposals(fpn_rois, Args: fpn_rois (Tensor): The input fpn_rois. 2-D Tensor with shape [N, 4] and data type can be float32 or float64. - min_level (int): The lowest level of FPN layer where the proposals come + min_level (int): The lowest level of FPN layer where the proposals come from. max_level (int): The highest level of FPN layer where the proposals come from. refer_level (int): The referring level of FPN layer with specified scale. refer_scale (int): The referring scale of FPN layer with specified level. - pixel_offset (bool, optional): Whether there is pixel offset. If True, the offset of + pixel_offset (bool, optional): Whether there is pixel offset. If True, the offset of image shape will be 1. 'False' by default. - rois_num (Tensor, optional): 1-D Tensor contains the number of RoIs in each image. + rois_num (Tensor, optional): 1-D Tensor contains the number of RoIs in each image. The shape is [B] and data type is int32. B is the number of images. - If rois_num not None, it will return a list of 1-D Tensor. Each element + If rois_num not None, it will return a list of 1-D Tensor. Each element is the output RoIs' number of each image on the corresponding level and the shape is [B]. None by default. - name (str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name (str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: multi_rois (List) : The proposals in each FPN level. 
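A minimal sketch of ``yolo_box`` with the arguments documented above; with ``iou_aware`` false and 3 anchors, C = 3 * (5 + 80) = 255 channels:

.. code-block:: python

    # Illustrative only: img_size holds the original (height, width) of each image so the
    # decoded boxes can be rescaled back to input-image coordinates.
    import paddle
    from paddle.vision.ops import yolo_box

    x = paddle.rand([2, 255, 13, 13])
    img_size = paddle.full([2, 2], 416, dtype='int32')

    boxes, scores = yolo_box(x, img_size,
                             anchors=[10, 13, 16, 30, 33, 23],
                             class_num=80,
                             conf_thresh=0.01,
                             downsample_ratio=32)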
It is a list of 2-D Tensor with shape [M, 4], where M is - and data type is same as `fpn_rois` . The length is max_level-min_level+1. + and data type is same as `fpn_rois` . The length is max_level-min_level+1. restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1] - , where N is the number of total rois. The data type is int32. - rois_num_per_level (List): A list of 1-D Tensor and each Tensor is - the RoIs' number in each image on the corresponding level. The shape + , where N is the number of total rois. The data type is int32. + rois_num_per_level (List): A list of 1-D Tensor and each Tensor is + the RoIs' number in each image on the corresponding level. The shape is [B] and data type of int32, where B is the number of images. Examples: @@ -977,12 +977,12 @@ def read_file(filename, name=None): import cv2 import paddle - fake_img = (paddle.rand((400, 300, 3)).numpy() * 255).astype('uint8') + fake_img = (paddle.rand((400, 300, 3)).numpy() * 255).astype('uint8') cv2.imwrite('fake.jpg', fake_img) img_bytes = paddle.vision.ops.read_file('fake.jpg') - + print(img_bytes.shape) # [142915] """ @@ -1005,14 +1005,14 @@ def read_file(filename, name=None): def decode_jpeg(x, mode='unchanged', name=None): """ - Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. - Optionally converts the image to the desired format. + Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. + Optionally converts the image to the desired format. The values of the output tensor are uint8 between 0 and 255. Args: - x (Tensor): A one dimensional uint8 tensor containing the raw bytes + x (Tensor): A one dimensional uint8 tensor containing the raw bytes of the JPEG image. - mode (str): The read mode used for optionally converting the image. + mode (str): The read mode used for optionally converting the image. Default: 'unchanged'. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please @@ -1057,7 +1057,7 @@ def decode_jpeg(x, mode='unchanged', name=None): def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): """ Position sensitive region of interest pooling (also known as PSROIPooling) is to perform - position-sensitive average pooling on regions of interest specified by input. It performs + position-sensitive average pooling on regions of interest specified by input. It performs on inputs of nonuniform sizes to obtain fixed-size feature maps. PSROIPooling is proposed by R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details. @@ -1065,13 +1065,13 @@ def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): Args: x (Tensor): Input features with shape (N, C, H, W). The data type can be float32 or float64. boxes (Tensor): Box coordinates of ROIs (Regions of Interest) to pool over. It should be - a 2-D Tensor with shape (num_rois, 4). Given as [[x1, y1, x2, y2], ...], + a 2-D Tensor with shape (num_rois, 4). Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. boxes_num (Tensor): The number of boxes contained in each picture in the batch. - output_size (int|Tuple(int, int)) The pooled output size(H, W), data type + output_size (int|Tuple(int, int)) The pooled output size(H, W), data type is int32. If int, H and W are both equal to output_size. 
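A hedged sketch of ``distribute_fpn_proposals`` using the arguments documented above; the box construction and batch split are illustrative only, and the third return value is only produced because ``rois_num`` is supplied:

.. code-block:: python

    # Illustrative only: RoIs are routed to FPN levels 2-5 around refer_level=4 / refer_scale=224.
    import paddle
    from paddle.vision.ops import distribute_fpn_proposals

    x1y1 = paddle.rand([10, 2]) * 100
    wh = paddle.rand([10, 2]) * 100 + 1
    fpn_rois = paddle.concat([x1y1, x1y1 + wh], axis=1)    # well-formed [x1, y1, x2, y2]

    multi_rois, restore_ind, rois_num_per_level = distribute_fpn_proposals(
        fpn_rois,
        min_level=2,
        max_level=5,
        refer_level=4,
        refer_scale=224,
        rois_num=paddle.to_tensor([3, 7], dtype='int32'))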
- spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their + spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0 name(str, optional): The default value is None. Normally there is no need for user to set this property. @@ -1134,9 +1134,9 @@ class PSRoIPool(Layer): refer to :ref:`api_paddle_vision_ops_psroi_pool`. Args: - output_size (int|Tuple(int, int)) The pooled output size(H, W), data type + output_size (int|Tuple(int, int)) The pooled output size(H, W), data type is int32. If int, H and W are both equal to output_size. - spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their + spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0. Shape: @@ -1153,7 +1153,7 @@ class PSRoIPool(Layer): .. code-block:: python import paddle - + psroi_module = paddle.vision.ops.PSRoIPool(7, 1.0) x = paddle.uniform([2, 490, 28, 28], dtype='float32') boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32') @@ -1176,16 +1176,16 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): """ This operator implements the roi_pooling layer. Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7). - The operator has three steps: 1. Dividing each region proposal into equal-sized sections with output_size(h, w) 2. Finding the largest value in each section 3. Copying these max values to the output buffer + The operator has three steps: 1. Dividing each region proposal into equal-sized sections with output_size(h, w) 2. Finding the largest value in each section 3. Copying these max values to the output buffer For more information, please refer to https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn. Args: - x (Tensor): input feature, 4D-Tensor with the shape of [N,C,H,W], - where N is the batch size, C is the input channel, H is Height, W is weight. + x (Tensor): input feature, 4D-Tensor with the shape of [N,C,H,W], + where N is the batch size, C is the input channel, H is Height, W is weight. The data type is float32 or float64. - boxes (Tensor): boxes (Regions of Interest) to pool over. - 2D-Tensor with the shape of [num_boxes,4]. - Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, + boxes (Tensor): boxes (Regions of Interest) to pool over. + 2D-Tensor with the shape of [num_boxes,4]. + Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. boxes_num (Tensor): the number of RoIs in each image, data type is int32. Default: None output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size. @@ -1193,7 +1193,7 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): name(str, optional): for detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: - pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]]. + pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]]. Examples: .. 
code-block:: python @@ -1257,21 +1257,21 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): class RoIPool(Layer): """ This interface is used to construct a callable object of the `RoIPool` class. Please - refer to :ref:`api_paddle_vision_ops_roi_pool`. + refer to :ref:`api_paddle_vision_ops_roi_pool`. Args: output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size. spatial_scale (float, optional): multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0. Returns: - pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]]. + pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]]. Examples: .. code-block:: python import paddle from paddle.vision.ops import RoIPool - + data = paddle.rand([1, 256, 32, 32]) boxes = paddle.rand([3, 4]) boxes[:, 2] += boxes[:, 0] + 3 @@ -1318,13 +1318,13 @@ def roi_align(x, In each ROI bin, the value of the four regularly sampled locations are computed directly through bilinear interpolation. The output is the mean of - four locations. Thus avoid the misaligned problem. + four locations. Thus avoid the misaligned problem. Args: - x (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W], + x (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W], where N is the batch size, C is the input channel, H is Height, W is weight. The data type is float32 or float64. - boxes (Tensor): Boxes (RoIs, Regions of Interest) to pool over. It + boxes (Tensor): Boxes (RoIs, Regions of Interest) to pool over. It should be a 2-D Tensor of shape (num_boxes, 4). The data type is float32 or float64. Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. @@ -1522,9 +1522,9 @@ def nms(boxes, top_k=None): r""" This operator implements non-maximum suppression. Non-maximum suppression (NMS) - is used to select one bounding box out of many overlapping bounding boxes in object detection. - Boxes with IoU > iou_threshold will be considered as overlapping boxes, - just one with highest score can be kept. Here IoU is Intersection Over Union, + is used to select one bounding box out of many overlapping bounding boxes in object detection. + Boxes with IoU > iou_threshold will be considered as overlapping boxes, + just one with highest score can be kept. Here IoU is Intersection Over Union, which can be computed by: .. math:: @@ -1533,25 +1533,25 @@ def nms(boxes, If scores are provided, input boxes will be sorted by their scores firstly. - If category_idxs and categories are provided, NMS will be performed with a batched style, + If category_idxs and categories are provided, NMS will be performed with a batched style, which means NMS will be applied to each category respectively and results of each category will be concated and sorted by scores. - + If K is provided, only the first k elements will be returned. Otherwise, all box indices sorted by scores will be returned. Args: - boxes(Tensor): The input boxes data to be computed, it's a 2D-Tensor with - the shape of [num_boxes, 4]. The data type is float32 or float64. - Given as [[x1, y1, x2, y2], …], (x1, y1) is the top left coordinates, - and (x2, y2) is the bottom right coordinates. 
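A minimal sketch of ``roi_align`` on a single feature map, following the documented ``[x1, y1, x2, y2]`` box layout; the shapes are illustrative only:

.. code-block:: python

    # Illustrative only: boxes_num gives the number of RoIs belonging to each image in the batch.
    import paddle
    from paddle.vision.ops import roi_align

    x = paddle.rand([1, 256, 32, 32])
    boxes = paddle.to_tensor([[4., 4., 12., 12.], [6., 6., 20., 24.]])
    boxes_num = paddle.to_tensor([2], dtype='int32')

    pooled = roi_align(x, boxes, boxes_num, output_size=7, spatial_scale=1.0)
    print(pooled.shape)    # [2, 256, 7, 7]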
+ boxes(Tensor): The input boxes data to be computed, it's a 2D-Tensor with + the shape of [num_boxes, 4]. The data type is float32 or float64. + Given as [[x1, y1, x2, y2], …], (x1, y1) is the top left coordinates, + and (x2, y2) is the bottom right coordinates. Their relation should be ``0 <= x1 < x2 && 0 <= y1 < y2``. iou_threshold(float32, optional): IoU threshold for determine overlapping boxes. Default value: 0.3. - scores(Tensor, optional): Scores corresponding to boxes, it's a 1D-Tensor with + scores(Tensor, optional): Scores corresponding to boxes, it's a 1D-Tensor with shape of [num_boxes]. The data type is float32 or float64. Default: None. - category_idxs(Tensor, optional): Category indices corresponding to boxes. + category_idxs(Tensor, optional): Category indices corresponding to boxes. it's a 1D-Tensor with shape of [num_boxes]. The data type is int64. Default: None. categories(List, optional): A list of unique id of all categories. The data type is int64. Default: None. - top_k(int64, optional): The top K boxes who has higher score and kept by NMS preds to + top_k(int64, optional): The top K boxes who has higher score and kept by NMS preds to consider. top_k should be smaller equal than num_boxes. Default: None. Returns: @@ -1559,7 +1559,7 @@ def nms(boxes, Examples: .. code-block:: python - + import paddle import numpy as np @@ -1578,14 +1578,14 @@ def nms(boxes, # [0.98015213 0.3156527 0.8199343 0.874901 ] categories = [0, 1, 2, 3] - category_idxs = np.random.choice(categories, 4) + category_idxs = np.random.choice(categories, 4) # [2 0 0 3] - out = paddle.vision.ops.nms(paddle.to_tensor(boxes), - 0.1, - paddle.to_tensor(scores), - paddle.to_tensor(category_idxs), - categories, + out = paddle.vision.ops.nms(paddle.to_tensor(boxes), + 0.1, + paddle.to_tensor(scores), + paddle.to_tensor(category_idxs), + categories, 4) # [0, 3, 2] """ @@ -1680,17 +1680,17 @@ def generate_proposals(scores, name=None): """ This operation proposes RoIs according to each box with their - probability to be a foreground object. And - the proposals of RPN output are calculated by anchors, bbox_deltas and scores. Final proposals + probability to be a foreground object. And + the proposals of RPN output are calculated by anchors, bbox_deltas and scores. Final proposals could be used to train detection net. For generating proposals, this operation performs following steps: 1. Transpose and resize scores and bbox_deltas in size of (H * W * A, 1) and (H * W * A, 4) - 2. Calculate box locations as proposals candidates. + 2. Calculate box locations as proposals candidates. 3. Clip boxes to image - 4. Remove predicted boxes with small area. + 4. Remove predicted boxes with small area. 5. Apply non-maximum suppression (NMS) to get final proposals as output. Args: diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index ecc160b0c0e..74507a48ba8 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -51,7 +51,7 @@ def to_tensor(pic, data_format='CHW'): Args: pic (PIL.Image|np.ndarray): Image to be converted to tensor. - data_format (str, optional): Data format of output tensor, should be 'HWC' or + data_format (str, optional): Data format of output tensor, should be 'HWC' or 'CHW'. Default: 'CHW'. Returns: @@ -93,19 +93,19 @@ def resize(img, size, interpolation='bilinear'): Args: input (PIL.Image|np.ndarray): Image to be resized. 
size (int|list|tuple): Target size of input data, with (height, width) shape. - interpolation (int|str, optional): Interpolation method. when use pil backend, - support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, - - "bicubic": Image.BICUBIC, - - "box": Image.BOX, - - "lanczos": Image.LANCZOS, + interpolation (int|str, optional): Interpolation method. when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC, + - "box": Image.BOX, + - "lanczos": Image.LANCZOS, - "hamming": Image.HAMMING - when use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, - - "area": cv2.INTER_AREA, - - "bicubic": cv2.INTER_CUBIC, + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "area": cv2.INTER_AREA, + - "bicubic": cv2.INTER_CUBIC, - "lanczos": cv2.INTER_LANCZOS4 Returns: @@ -157,7 +157,7 @@ def pad(img, padding, fill=0, padding_mode='constant'): respectively. fill (float, optional): Pixel fill value for constant fill. If a tuple of length 3, it is used to fill R, G, B channels respectively. - This value is only used when the padding_mode is constant. Default: 0. + This value is only used when the padding_mode is constant. Default: 0. padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default: 'constant'. - constant: pads with a constant value, this value is specified with fill @@ -212,7 +212,7 @@ def crop(img, top, left, height, width): """Crops the given Image. Args: - img (PIL.Image|np.array): Image to be cropped. (0,0) denotes the top left + img (PIL.Image|np.array): Image to be cropped. (0,0) denotes the top left corner of the image. top (int): Vertical component of the top left corner of the crop box. left (int): Horizontal component of the top left corner of the crop box. @@ -258,7 +258,7 @@ def center_crop(img, output_size): img (PIL.Image|np.array): Image to be cropped. (0,0) denotes the top left corner of the image. output_size (sequence or int): (height, width) of the crop box. If int, it is used for both directions - + Returns: PIL.Image or np.array: Cropped image. @@ -589,16 +589,16 @@ def affine(img, translate (list[float]): Maximum absolute fraction for horizontal and vertical translations. scale (float): Scale factor for the image, scale should be positive. shear (list[float]): Shear angle values which are parallel to the x-axis and y-axis in clockwise order. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST - according the backend. - When use pil backend, support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. 
+ When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, - "bicubic": Image.BICUBIC - When use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, - "bicubic": cv2.INTER_CUBIC fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed image. If given a number, the value is used for all bands respectively. @@ -714,15 +714,15 @@ def rotate(img, Args: img (PIL.Image|np.array): Image to be rotated. angle (float or int): In degrees degrees counter clockwise order. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST - according the backend. when use pil backend, support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. when use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, - "bicubic": Image.BICUBIC - when use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, - "bicubic": cv2.INTER_CUBIC expand (bool, optional): Optional expansion flag. If true, expands the output image to make it large enough to hold the entire rotated image. @@ -812,16 +812,16 @@ def perspective(img, startpoints, endpoints, interpolation='nearest', fill=0): ``[top-left, top-right, bottom-right, bottom-left]`` of the original image. endpoints (list of list of ints): List containing four lists of two integers corresponding to four corners ``[top-left, top-right, bottom-right, bottom-left]`` of the transformed image. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST - according the backend. - When use pil backend, support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, - "bicubic": Image.BICUBIC - When use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, - "bicubic": cv2.INTER_CUBIC fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed image. If given a number, the value is used for all bands respectively. @@ -872,7 +872,7 @@ def to_grayscale(img, num_output_channels=1): if num_output_channels = 1 : returned image is single channel if num_output_channels = 3 : returned image is 3 channel with r = g = b - + Examples: .. 
code-block:: python @@ -909,14 +909,14 @@ def normalize(img, mean, std, data_format='CHW', to_rgb=False): img (PIL.Image|np.array|paddle.Tensor): input data to be normalized. mean (list|tuple): Sequence of means for each channel. std (list|tuple): Sequence of standard deviations for each channel. - data_format (str, optional): Data format of input img, should be 'HWC' or + data_format (str, optional): Data format of input img, should be 'HWC' or 'CHW'. Default: 'CHW'. - to_rgb (bool, optional): Whether to convert to rgb. If input is tensor, + to_rgb (bool, optional): Whether to convert to rgb. If input is tensor, this option will be igored. Default: False. Returns: np.ndarray or Tensor: Normalized mage. Data format is same as input img. - + Examples: .. code-block:: python @@ -947,16 +947,16 @@ def normalize(img, mean, std, data_format='CHW', to_rgb=False): def erase(img, i, j, h, w, v, inplace=False): """Erase the pixels of selected area in input image with given value. - + Args: - img (paddle.Tensor | np.array | PIL.Image): input Tensor image. - For Tensor input, the shape should be (C, H, W). For np.array input, + img (paddle.Tensor | np.array | PIL.Image): input Tensor image. + For Tensor input, the shape should be (C, H, W). For np.array input, the shape should be (H, W, C). i (int): y coordinate of the top-left point of erased region. j (int): x coordinate of the top-left point of erased region. h (int): Height of the erased region. w (int): Width of the erased region. - v (paddle.Tensor | np.array): value used to replace the pixels in erased region. It + v (paddle.Tensor | np.array): value used to replace the pixels in erased region. It should be np.array when img is np.array or PIL.Image. inplace (bool, optional): Whether this transform is inplace. Default: False. @@ -967,7 +967,7 @@ def erase(img, i, j, h, w, v, inplace=False): .. code-block:: python import paddle - + fake_img = paddle.randn((3, 2, 4)).astype(paddle.float32) print(fake_img) @@ -983,7 +983,7 @@ def erase(img, i, j, h, w, v, inplace=False): values = paddle.zeros((1,1,1), dtype=paddle.float32) result = paddle.vision.transforms.erase(fake_img, 0, 1, 1, 2, values) - + print(result) #Tensor(shape=[3, 2, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py index df31add6f77..715bc80ec60 100644 --- a/python/paddle/vision/transforms/functional_cv2.py +++ b/python/paddle/vision/transforms/functional_cv2.py @@ -43,7 +43,7 @@ def to_tensor(pic, data_format='CHW'): Args: pic (np.ndarray): Image to be converted to tensor. - data_format (str, optional): Data format of output tensor, should be 'HWC' or + data_format (str, optional): Data format of output tensor, should be 'HWC' or 'CHW'. Default: 'CHW'. Returns: @@ -76,12 +76,12 @@ def resize(img, size, interpolation='bilinear'): Args: input (np.ndarray): Image to be resized. size (int|list|tuple): Target size of input data, with (height, width) shape. - interpolation (int|str, optional): Interpolation method. when use cv2 backend, - support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, - - "area": cv2.INTER_AREA, - - "bicubic": cv2.INTER_CUBIC, + interpolation (int|str, optional): Interpolation method. 
when use cv2 backend, + support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "area": cv2.INTER_AREA, + - "bicubic": cv2.INTER_CUBIC, - "lanczos": cv2.INTER_LANCZOS4 Returns: @@ -143,7 +143,7 @@ def pad(img, padding, fill=0, padding_mode='constant'): respectively. fill (float, optional): Pixel fill value for constant fill. If a tuple of length 3, it is used to fill R, G, B channels respectively. - This value is only used when the padding_mode is constant. Default: 0. + This value is only used when the padding_mode is constant. Default: 0. padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default: 'constant'. - constant: pads with a constant value, this value is specified with fill @@ -222,7 +222,7 @@ def crop(img, top, left, height, width): """Crops the given image. Args: - img (np.array): Image to be cropped. (0,0) denotes the top left + img (np.array): Image to be cropped. (0,0) denotes the top left corner of the image. top (int): Vertical component of the top left corner of the crop box. left (int): Horizontal component of the top left corner of the crop box. @@ -244,8 +244,8 @@ def center_crop(img, output_size): img (np.array): Image to be cropped. (0,0) denotes the top left corner of the image. output_size (sequence or int): (height, width) of the crop box. If int, it is used for both directions - backend (str, optional): The image proccess backend type. Options are `pil`, `cv2`. Default: 'pil'. - + backend (str, optional): The image proccess backend type. Options are `pil`, `cv2`. Default: 'pil'. + Returns: np.array: Cropped image. @@ -425,11 +425,11 @@ def affine(img, shear (sequence or float): shear angle value in degrees between -180 to 180, clockwise direction. If a sequence is specified, the first value corresponds to a shear parallel to the x axis, while the second value corresponds to a shear parallel to the y axis. - interpolation (int|str, optional): Interpolation method. If omitted, or if the + interpolation (int|str, optional): Interpolation method. If omitted, or if the image has only one channel, it is set to cv2.INTER_NEAREST. - when use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, - "bicubic": cv2.INTER_CUBIC fill (3-tuple or int): RGB pixel fill value for area outside the affined image. If int, it is used for all channels respectively. @@ -497,11 +497,11 @@ def rotate(img, Args: img (np.array): Image to be rotated. angle (float or int): In degrees degrees counter clockwise order. - interpolation (int|str, optional): Interpolation method. If omitted, or if the + interpolation (int|str, optional): Interpolation method. If omitted, or if the image has only one channel, it is set to cv2.INTER_NEAREST. - when use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, - "bicubic": cv2.INTER_CUBIC expand (bool, optional): Optional expansion flag. If true, expands the output image to make it large enough to hold the entire rotated image. @@ -589,11 +589,11 @@ def perspective(img, startpoints, endpoints, interpolation='nearest', fill=0): img (np.array): Image to be perspectived. 
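The cv2 variants above are normally reached through the top-level functional API, which dispatches on the input type. A hedged sketch using ``paddle.vision.transforms.functional`` (the cv2 backend handles ndarray inputs, so cv2 must be installed):

.. code-block:: python

    # Illustrative only: an HWC uint8 ndarray is processed by the cv2 implementations above.
    import numpy as np
    import paddle.vision.transforms.functional as F

    img = (np.random.rand(256, 256, 3) * 255).astype('uint8')
    rotated = F.rotate(img, 45, interpolation='bilinear', expand=True)
    padded = F.pad(rotated, padding=10, padding_mode='reflect')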
startpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the original image, endpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the transformed image. - interpolation (int|str, optional): Interpolation method. If omitted, or if the + interpolation (int|str, optional): Interpolation method. If omitted, or if the image has only one channel, it is set to cv2.INTER_NEAREST. - when use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, - "bicubic": cv2.INTER_CUBIC fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. If int, it is used for all channels respectively. @@ -664,7 +664,7 @@ def normalize(img, mean, std, data_format='CHW', to_rgb=False): img (np.array): input data to be normalized. mean (list|tuple): Sequence of means for each channel. std (list|tuple): Sequence of standard deviations for each channel. - data_format (str, optional): Data format of img, should be 'HWC' or + data_format (str, optional): Data format of img, should be 'HWC' or 'CHW'. Default: 'CHW'. to_rgb (bool, optional): Whether to convert to rgb. Default: False. @@ -701,7 +701,7 @@ def erase(img, i, j, h, w, v, inplace=False): Returns: np.array: Erased image. - + """ if not inplace: img = img.copy() diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py index 50ed01f53e2..0dbcce265b9 100644 --- a/python/paddle/vision/transforms/functional_pil.py +++ b/python/paddle/vision/transforms/functional_pil.py @@ -62,7 +62,7 @@ def to_tensor(pic, data_format='CHW'): Args: pic (PIL.Image): Image to be converted to tensor. - data_format (str, optional): Data format of output tensor, should be 'HWC' or + data_format (str, optional): Data format of output tensor, should be 'HWC' or 'CHW'. Default: 'CHW'. Returns: @@ -113,13 +113,13 @@ def resize(img, size, interpolation='bilinear'): Args: input (PIL.Image): Image to be resized. size (int|list|tuple): Target size of input data, with (height, width) shape. - interpolation (int|str, optional): Interpolation method. when use pil backend, - support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, - - "bicubic": Image.BICUBIC, - - "box": Image.BOX, - - "lanczos": Image.LANCZOS, + interpolation (int|str, optional): Interpolation method. when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC, + - "box": Image.BOX, + - "lanczos": Image.LANCZOS, - "hamming": Image.HAMMING Returns: @@ -160,7 +160,7 @@ def pad(img, padding, fill=0, padding_mode='constant'): respectively. fill (float, optional): Pixel fill value for constant fill. If a tuple of length 3, it is used to fill R, G, B channels respectively. - This value is only used when the padding_mode is constant. Default: 0. + This value is only used when the padding_mode is constant. Default: 0. padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default: 'constant'. - constant: pads with a constant value, this value is specified with fill @@ -246,7 +246,7 @@ def crop(img, top, left, height, width): """Crops the given PIL Image. Args: - img (PIL.Image): Image to be cropped. (0,0) denotes the top left + img (PIL.Image): Image to be cropped. 
(0,0) denotes the top left corner of the image. top (int): Vertical component of the top left corner of the crop box. left (int): Horizontal component of the top left corner of the crop box. @@ -267,8 +267,8 @@ def center_crop(img, output_size): img (PIL.Image): Image to be cropped. (0,0) denotes the top left corner of the image. output_size (sequence or int): (height, width) of the crop box. If int, it is used for both directions - backend (str, optional): The image proccess backend type. Options are `pil`, `cv2`. Default: 'pil'. - + backend (str, optional): The image proccess backend type. Options are `pil`, `cv2`. Default: 'pil'. + Returns: PIL.Image: Cropped image. @@ -417,11 +417,11 @@ def affine(img, matrix, interpolation="nearest", fill=0): Args: img (PIL.Image): Image to be affined. matrix (float or int): Affine matrix. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set to PIL.Image.NEAREST . when use pil backend, - support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST . when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, - "bicubic": Image.BICUBIC fill (3-tuple or int): RGB pixel fill value for area outside the affined image. If int, it is used for all channels respectively. @@ -448,11 +448,11 @@ def rotate(img, Args: img (PIL.Image): Image to be rotated. angle (float or int): In degrees degrees counter clockwise order. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set to PIL.Image.NEAREST . when use pil backend, - support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST . when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, - "bicubic": Image.BICUBIC expand (bool, optional): Optional expansion flag. If true, expands the output image to make it large enough to hold the entire rotated image. @@ -485,11 +485,11 @@ def perspective(img, coeffs, interpolation="nearest", fill=0): Args: img (PIL.Image): Image to be perspectived. coeffs (list[float]): coefficients (a, b, c, d, e, f, g, h) of the perspective transforms. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set to PIL.Image.NEAREST . when use pil backend, - support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST . when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, - "bicubic": Image.BICUBIC fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. If int, it is used for all channels respectively. @@ -511,8 +511,8 @@ def to_grayscale(img, num_output_channels=1): Args: img (PIL.Image): Image to be converted to grayscale. - backend (str, optional): The image proccess backend type. Options are `pil`, - `cv2`. Default: 'pil'. + backend (str, optional): The image proccess backend type. 
Options are `pil`, + `cv2`. Default: 'pil'. Returns: PIL.Image: Grayscale version of the image. @@ -550,7 +550,7 @@ def erase(img, i, j, h, w, v, inplace=False): Returns: PIL.Image: Erased image. - + """ np_img = np.array(img, dtype=np.uint8) np_img[i:i + h, j:j + w, ...] = v diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py index 4cf8253ec8b..bc9501833d0 100644 --- a/python/paddle/vision/transforms/functional_tensor.py +++ b/python/paddle/vision/transforms/functional_tensor.py @@ -153,7 +153,7 @@ def normalize(img, mean, std, data_format='CHW'): img (paddle.Tensor): input data to be normalized. mean (list|tuple): Sequence of means for each channel. std (list|tuple): Sequence of standard deviations for each channel. - data_format (str, optional): Data format of img, should be 'HWC' or + data_format (str, optional): Data format of img, should be 'HWC' or 'CHW'. Default: 'CHW'. Returns: @@ -179,8 +179,8 @@ def to_grayscale(img, num_output_channels=1, data_format='CHW'): img (paddel.Tensor): Image to be converted to grayscale. num_output_channels (int, optionl[1, 3]): if num_output_channels = 1 : returned image is single channel - if num_output_channels = 3 : returned image is 3 channel - data_format (str, optional): Data format of img, should be 'HWC' or + if num_output_channels = 3 : returned image is 3 channel + data_format (str, optional): Data format of img, should be 'HWC' or 'CHW'. Default: 'CHW'. Returns: @@ -262,15 +262,15 @@ def affine(img, matrix, interpolation="nearest", fill=None, data_format='CHW'): Args: img (paddle.Tensor): Image to be rotated. matrix (float or int): Affine matrix. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set NEAREST . when use pil backend, - support method are as following: - - "nearest" + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set NEAREST . when use pil backend, + support method are as following: + - "nearest" - "bilinear" - "bicubic" fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. If int, it is used for all channels respectively. - data_format (str, optional): Data format of img, should be 'HWC' or + data_format (str, optional): Data format of img, should be 'HWC' or 'CHW'. Default: 'CHW'. Returns: @@ -316,10 +316,10 @@ def rotate(img, Args: img (paddle.Tensor): Image to be rotated. angle (float or int): In degrees degrees counter clockwise order. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set NEAREST . when use pil backend, - support method are as following: - - "nearest" + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set NEAREST . when use pil backend, + support method are as following: + - "nearest" - "bilinear" - "bicubic" expand (bool, optional): Optional expansion flag. @@ -437,10 +437,10 @@ def perspective(img, Args: img (paddle.Tensor): Image to be rotated. coeffs (list[float]): coefficients (a, b, c, d, e, f, g, h) of the perspective transforms. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set NEAREST. When use pil backend, - support method are as following: - - "nearest" + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set NEAREST. 
When use pil backend, + support method are as following: + - "nearest" - "bilinear" - "bicubic" fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. @@ -474,7 +474,7 @@ def vflip(img, data_format='CHW'): Args: img (paddle.Tensor): Image to be flipped. - data_format (str, optional): Data format of img, should be 'HWC' or + data_format (str, optional): Data format of img, should be 'HWC' or 'CHW'. Default: 'CHW'. Returns: @@ -493,7 +493,7 @@ def hflip(img, data_format='CHW'): Args: img (paddle.Tensor): Image to be flipped. - data_format (str, optional): Data format of img, should be 'HWC' or + data_format (str, optional): Data format of img, should be 'HWC' or 'CHW'. Default: 'CHW'. Returns: @@ -511,13 +511,13 @@ def crop(img, top, left, height, width, data_format='CHW'): """Crops the given paddle.Tensor Image. Args: - img (paddle.Tensor): Image to be cropped. (0,0) denotes the top left + img (paddle.Tensor): Image to be cropped. (0,0) denotes the top left corner of the image. top (int): Vertical component of the top left corner of the crop box. left (int): Horizontal component of the top left corner of the crop box. height (int): Height of the crop box. width (int): Width of the crop box. - data_format (str, optional): Data format of img, should be 'HWC' or + data_format (str, optional): Data format of img, should be 'HWC' or 'CHW'. Default: 'CHW'. Returns: paddle.Tensor: Cropped image. @@ -545,7 +545,7 @@ def erase(img, i, j, h, w, v, inplace=False): Returns: paddle.Tensor: Erased image. - + """ _assert_image_tensor(img, 'CHW') if not inplace: @@ -561,9 +561,9 @@ def center_crop(img, output_size, data_format='CHW'): Args: img (paddle.Tensor): Image to be cropped. (0,0) denotes the top left corner of the image. output_size (sequence or int): (height, width) of the crop box. If int, - it is used for both directions - data_format (str, optional): Data format of img, should be 'HWC' or - 'CHW'. Default: 'CHW'. + it is used for both directions + data_format (str, optional): Data format of img, should be 'HWC' or + 'CHW'. Default: 'CHW'. Returns: paddle.Tensor: Cropped image. @@ -598,7 +598,7 @@ def pad(img, padding, fill=0, padding_mode='constant', data_format='CHW'): respectively. fill (float, optional): Pixel fill value for constant fill. If a tuple of length 3, it is used to fill R, G, B channels respectively. - This value is only used when the padding_mode is constant. Default: 0. + This value is only used when the padding_mode is constant. Default: 0. padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default: 'constant'. - constant: pads with a constant value, this value is specified with fill @@ -672,9 +672,9 @@ def resize(img, size, interpolation='bilinear', data_format='CHW'): Args: input (paddle.Tensor): Image to be resized. size (int|list|tuple): Target size of input data, with (height, width) shape. - interpolation (int|str, optional): Interpolation method. when use paddle backend, - support method are as following: - - "nearest" + interpolation (int|str, optional): Interpolation method. 
when use paddle backend, + support method are as following: + - "nearest" - "bilinear" - "bicubic" - "trilinear" diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 301252a048b..a9c0d7f182a 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -97,7 +97,7 @@ class Compose(object): object will call each given :attr:`transforms` sequencely. Examples: - + .. code-block:: python from paddle.vision.datasets import Flowers @@ -139,12 +139,12 @@ class BaseTransform(object): """ Base class of all transforms used in computer vision. - calling logic: + calling logic: if keys is None: _get_params -> _apply_image() else: - _get_params -> _apply_*() for * in keys + _get_params -> _apply_*() for * in keys If you want to implement a self-defined transform method for image, rewrite _apply_* method in subclass. @@ -153,25 +153,25 @@ class BaseTransform(object): keys (list[str]|tuple[str], optional): Input type. Input is a tuple contains different structures, key is used to specify the type of input. For example, if your input is image type, then the key can be None or ("image"). if your input - is (image, image) type, then the keys should be ("image", "image"). + is (image, image) type, then the keys should be ("image", "image"). if your input is (image, boxes), then the keys should be ("image", "boxes"). Current available strings & data type are describe below: - - "image": input image, with shape of (H, W, C) - - "coords": coordinates, with shape of (N, 2) - - "boxes": bounding boxes, with shape of (N, 4), "xyxy" format, - - the 1st "xy" represents top left point of a box, + - "image": input image, with shape of (H, W, C) + - "coords": coordinates, with shape of (N, 2) + - "boxes": bounding boxes, with shape of (N, 4), "xyxy" format, + + the 1st "xy" represents top left point of a box, the 2nd "xy" represents right bottom point. - "mask": map used for segmentation, with shape of (H, W, 1) - + You can also customize your data types only if you implement the corresponding _apply_*() methods, otherwise ``NotImplementedError`` will be raised. - + Examples: - + .. code-block:: python import numpy as np @@ -220,7 +220,7 @@ class BaseTransform(object): maxxy = coords.max(axis=1) trans_boxes = np.concatenate((minxy, maxxy), axis=1) return trans_boxes - + # if you only want to transform image, do not need to rewrite this function def _apply_mask(self, mask): if self.params['flip']: @@ -302,22 +302,22 @@ class ToTensor(BaseTransform): Converts a PIL.Image or numpy.ndarray (H x W x C) to a paddle.Tensor of shape (C x H x W). - If input is a grayscale image (H x W), it will be converted to a image of shape (H x W x 1). + If input is a grayscale image (H x W), it will be converted to a image of shape (H x W x 1). And the shape of output tensor will be (1 x H x W). If you want to keep the shape of output tensor as (H x W x C), you can set data_format = ``HWC`` . - Converts a PIL.Image or numpy.ndarray in the range [0, 255] to a paddle.Tensor in the - range [0.0, 1.0] if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, - RGBA, CMYK, 1) or if the numpy.ndarray has dtype = np.uint8. + Converts a PIL.Image or numpy.ndarray in the range [0, 255] to a paddle.Tensor in the + range [0.0, 1.0] if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, + RGBA, CMYK, 1) or if the numpy.ndarray has dtype = np.uint8. In the other cases, tensors are returned without scaling. 
Args: - data_format (str, optional): Data format of output tensor, should be 'HWC' or + data_format (str, optional): Data format of output tensor, should be 'HWC' or 'CHW'. Default: 'CHW'. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - + Shape: - img(PIL.Image|np.ndarray): The input image with shape (H x W x C). - output(np.ndarray): A tensor with shape (C x H x W) or (H x W x C) according option data_format. @@ -326,7 +326,7 @@ class ToTensor(BaseTransform): A callable object of ToTensor. Examples: - + .. code-block:: python import numpy as np @@ -340,10 +340,10 @@ class ToTensor(BaseTransform): transform = T.ToTensor() tensor = transform(fake_img) - + print(tensor.shape) # [3, 4, 5] - + print(tensor.dtype) # paddle.float32 """ @@ -372,19 +372,19 @@ class Resize(BaseTransform): smaller edge of the image will be matched to this number. i.e, if height > width, then image will be rescaled to (size * height / width, size) - interpolation (int|str, optional): Interpolation method. Default: 'bilinear'. - when use pil backend, support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, - - "bicubic": Image.BICUBIC, - - "box": Image.BOX, - - "lanczos": Image.LANCZOS, + interpolation (int|str, optional): Interpolation method. Default: 'bilinear'. + when use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC, + - "box": Image.BOX, + - "lanczos": Image.LANCZOS, - "hamming": Image.HAMMING - when use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, - - "area": cv2.INTER_AREA, - - "bicubic": cv2.INTER_CUBIC, + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "area": cv2.INTER_AREA, + - "bicubic": cv2.INTER_CUBIC, - "lanczos": cv2.INTER_LANCZOS4 keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. @@ -396,7 +396,7 @@ class Resize(BaseTransform): A callable object of Resize. Examples: - + .. code-block:: python import numpy as np @@ -435,22 +435,22 @@ class RandomResizedCrop(BaseTransform): Args: size (int|list|tuple): Target size of output image, with (height, width) shape. - scale (list|tuple): Scale range of the cropped image before resizing, relatively to the origin + scale (list|tuple): Scale range of the cropped image before resizing, relatively to the origin image. Default: (0.08, 1.0) ratio (list|tuple): Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33) - interpolation (int|str, optional): Interpolation method. Default: 'bilinear'. when use pil backend, - support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, - - "bicubic": Image.BICUBIC, - - "box": Image.BOX, - - "lanczos": Image.LANCZOS, + interpolation (int|str, optional): Interpolation method. Default: 'bilinear'. 
when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC, + - "box": Image.BOX, + - "lanczos": Image.LANCZOS, - "hamming": Image.HAMMING - when use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, - - "area": cv2.INTER_AREA, - - "bicubic": cv2.INTER_CUBIC, + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "area": cv2.INTER_AREA, + - "bicubic": cv2.INTER_CUBIC, - "lanczos": cv2.INTER_LANCZOS4 keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. @@ -462,7 +462,7 @@ class RandomResizedCrop(BaseTransform): A callable object of RandomResizedCrop. Examples: - + .. code-block:: python import numpy as np @@ -550,7 +550,7 @@ class CenterCrop(BaseTransform): A callable object of CenterCrop. Examples: - + .. code-block:: python import numpy as np @@ -591,7 +591,7 @@ class RandomHorizontalFlip(BaseTransform): A callable object of RandomHorizontalFlip. Examples: - + .. code-block:: python import numpy as np @@ -632,7 +632,7 @@ class RandomVerticalFlip(BaseTransform): A callable object of RandomVerticalFlip. Examples: - + .. code-block:: python import numpy as np @@ -668,7 +668,7 @@ class Normalize(BaseTransform): Args: mean (int|float|list|tuple, optional): Sequence of means for each channel. std (int|float|list|tuple, optional): Sequence of standard deviations for each channel. - data_format (str, optional): Data format of img, should be 'HWC' or + data_format (str, optional): Data format of img, should be 'HWC' or 'CHW'. Default: 'CHW'. to_rgb (bool, optional): Whether to convert to rgb. Default: False. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. @@ -681,7 +681,7 @@ class Normalize(BaseTransform): A callable object of Normalize. Examples: - + .. code-block:: python :name: code-example import paddle @@ -698,7 +698,7 @@ class Normalize(BaseTransform): # (300, 320, 3) print(fake_img.max(), fake_img.min()) # 0.99999905 -0.999974 - + """ def __init__(self, @@ -728,22 +728,22 @@ class Transpose(BaseTransform): """Transpose input data to a target format. For example, most transforms use HWC mode image, while the Neural Network might use CHW mode input tensor. - output image will be an instance of numpy.ndarray. + output image will be an instance of numpy.ndarray. Args: order (list|tuple, optional): Target order of input data. Default: (2, 0, 1). keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - + Shape: - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - - output(np.ndarray|Paddle.Tensor): A transposed array or tensor. If input + - output(np.ndarray|Paddle.Tensor): A transposed array or tensor. If input is a PIL.Image, output will be converted to np.ndarray automatically. Returns: A callable object of Transpose. Examples: - + .. code-block:: python import numpy as np @@ -756,7 +756,7 @@ class Transpose(BaseTransform): fake_img = transform(fake_img) print(fake_img.shape) - + """ def __init__(self, order=(2, 0, 1), keys=None): @@ -791,7 +791,7 @@ class BrightnessTransform(BaseTransform): A callable object of BrightnessTransform. Examples: - + .. 
code-block:: python import numpy as np @@ -803,7 +803,7 @@ class BrightnessTransform(BaseTransform): fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - + """ def __init__(self, value, keys=None): @@ -834,7 +834,7 @@ class ContrastTransform(BaseTransform): A callable object of ContrastTransform. Examples: - + .. code-block:: python import numpy as np @@ -879,7 +879,7 @@ class SaturationTransform(BaseTransform): A callable object of SaturationTransform. Examples: - + .. code-block:: python import numpy as np @@ -889,7 +889,7 @@ class SaturationTransform(BaseTransform): transform = SaturationTransform(0.4) fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8)) - + fake_img = transform(fake_img) """ @@ -922,7 +922,7 @@ class HueTransform(BaseTransform): A callable object of HueTransform. Examples: - + .. code-block:: python import numpy as np @@ -975,7 +975,7 @@ class ColorJitter(BaseTransform): A callable object of ColorJitter. Examples: - + .. code-block:: python import numpy as np @@ -1051,7 +1051,7 @@ class RandomCrop(BaseTransform): int instead of sequence like (h, w), a square crop (size, size) is made. padding (int|sequence, optional): Optional padding on each border - of the image. If a sequence of length 4 is provided, it is used to pad left, + of the image. If a sequence of length 4 is provided, it is used to pad left, top, right, bottom borders respectively. Default: None, without padding. pad_if_needed (boolean, optional): It will pad the image if smaller than the desired size to avoid raising an exception. Default: False. @@ -1074,7 +1074,7 @@ class RandomCrop(BaseTransform): padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode will result in [2, 1, 1, 2, 3, 4, 4, 3] keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - + Shape - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - output(PIL.Image|np.ndarray|Paddle.Tensor): A random cropped image. @@ -1083,7 +1083,7 @@ class RandomCrop(BaseTransform): A callable object of RandomCrop. Examples: - + .. code-block:: python :name: code-example1 @@ -1174,16 +1174,16 @@ class Pad(BaseTransform): length 3, it is used to fill R, G, B channels respectively. This value is only used when the padding_mode is constant padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant. - ``constant`` means pads with a constant value, this value is specified with fill. - ``edge`` means pads with the last value at the edge of the image. - ``reflect`` means pads with reflection of image (without repeating the last value on the edge) - padding ``[1, 2, 3, 4]`` with 2 elements on both sides in reflect mode + ``constant`` means pads with a constant value, this value is specified with fill. + ``edge`` means pads with the last value at the edge of the image. + ``reflect`` means pads with reflection of image (without repeating the last value on the edge) + padding ``[1, 2, 3, 4]`` with 2 elements on both sides in reflect mode will result in ``[3, 2, 1, 2, 3, 4, 3, 2]``. ``symmetric`` menas pads with reflection of image (repeating the last value on the edge) - padding ``[1, 2, 3, 4]`` with 2 elements on both sides in symmetric mode + padding ``[1, 2, 3, 4]`` with 2 elements on both sides in symmetric mode will result in ``[2, 1, 1, 2, 3, 4, 4, 3]``. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. 
- + Shape: - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - output(PIL.Image|np.ndarray|Paddle.Tensor): A paded image. @@ -1192,7 +1192,7 @@ class Pad(BaseTransform): A callable object of Pad. Examples: - + .. code-block:: python import numpy as np @@ -1268,25 +1268,25 @@ class RandomAffine(BaseTransform): will be (-degrees, +degrees) in clockwise order. If set 0, will not rotate. translate (tuple, optional): Maximum absolute fraction for horizontal and vertical translations. For example translate=(a, b), then horizontal shift is randomly sampled in the range -img_width * a < dx < img_width * a - and vertical shift is randomly sampled in the range -img_height * b < dy < img_height * b. + and vertical shift is randomly sampled in the range -img_height * b < dy < img_height * b. Default is None, will not translate. - scale (tuple, optional): Scaling factor interval, e.g (a, b), then scale is randomly sampled from the range a <= scale <= b. + scale (tuple, optional): Scaling factor interval, e.g (a, b), then scale is randomly sampled from the range a <= scale <= b. Default is None, will keep original scale and not scale. shear (sequence or number, optional): Range of degrees to shear, ranges from -180 to 180 in clockwise order. - If set as a number, a shear parallel to the x axis in the range (-shear, +shear) will be applied. - Else if set as a sequence of 2 values a shear parallel to the x axis in the range (shear[0], shear[1]) will be applied. + If set as a number, a shear parallel to the x axis in the range (-shear, +shear) will be applied. + Else if set as a sequence of 2 values a shear parallel to the x axis in the range (shear[0], shear[1]) will be applied. Else if set as a sequence of 4 values, a x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. Default is None, will not apply shear. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST - according the backend. - When use pil backend, support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, - "bicubic": Image.BICUBIC - When use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, - "bicubic": cv2.INTER_CUBIC fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed image. If given a number, the value is used for all bands respectively. @@ -1303,7 +1303,7 @@ class RandomAffine(BaseTransform): A callable object of RandomAffine. Examples: - + .. code-block:: python import paddle @@ -1427,15 +1427,15 @@ class RandomRotation(BaseTransform): degrees (sequence or float or int): Range of degrees to select from. If degrees is a number instead of sequence like (min, max), the range of degrees will be (-degrees, +degrees) clockwise order. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST - according the backend. 
when use pil backend, support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. when use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, - "bicubic": Image.BICUBIC - when use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, - "bicubic": cv2.INTER_CUBIC expand (bool|optional): Optional expansion flag. Default: False. If true, expands the output to make it large enough to hold the entire rotated image. @@ -1445,7 +1445,7 @@ class RandomRotation(BaseTransform): Origin is the upper left corner. Default is the center of the image. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - + Shape: - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - output(PIL.Image|np.ndarray|Paddle.Tensor): A rotated image. @@ -1454,7 +1454,7 @@ class RandomRotation(BaseTransform): A callable object of RandomRotation. Examples: - + .. code-block:: python import numpy as np @@ -1524,13 +1524,13 @@ class RandomPerspective(BaseTransform): interpolation (str, optional): Interpolation method. If omitted, or if the image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST. - When use pil backend, support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, - "bicubic": Image.BICUBIC - When use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, - "bicubic": cv2.INTER_CUBIC fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed image. If given a number, the value is used for all bands respectively. @@ -1544,7 +1544,7 @@ class RandomPerspective(BaseTransform): A callable object of RandomPerspective. Examples: - + .. code-block:: python import paddle @@ -1645,7 +1645,7 @@ class Grayscale(BaseTransform): Shape: - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - - output(PIL.Image|np.ndarray|Paddle.Tensor): Grayscale version of the input image. + - output(PIL.Image|np.ndarray|Paddle.Tensor): Grayscale version of the input image. - If output_channels == 1 : returned image is single channel - If output_channels == 3 : returned image is 3 channel with r == g == b @@ -1653,7 +1653,7 @@ class Grayscale(BaseTransform): A callable object of Grayscale. Examples: - + .. code-block:: python import numpy as np @@ -1688,19 +1688,19 @@ class RandomErasing(BaseTransform): Args: prob (float, optional): Probability of the input data being erased. Default: 0.5. - scale (sequence, optional): The proportional range of the erased area to the input image. + scale (sequence, optional): The proportional range of the erased area to the input image. Default: (0.02, 0.33). ratio (sequence, optional): Aspect ratio range of the erased area. Default: (0.3, 3.3). 
value (int|float|sequence|str, optional): The value each pixel in erased area will be replaced with. - If value is a single number, all pixels will be erased with this value. - If value is a sequence with length 3, the R, G, B channels will be ereased - respectively. If value is set to "random", each pixel will be erased with + If value is a single number, all pixels will be erased with this value. + If value is a sequence with length 3, the R, G, B channels will be ereased + respectively. If value is set to "random", each pixel will be erased with random values. Default: 0. inplace (bool, optional): Whether this transform is inplace. Default: False. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - + Shape: - - img(paddle.Tensor | np.array | PIL.Image): The input image. For Tensor input, the shape should be (C, H, W). + - img(paddle.Tensor | np.array | PIL.Image): The input image. For Tensor input, the shape should be (C, H, W). For np.array input, the shape should be (H, W, C). - output(paddle.Tensor | np.array | PIL.Image): A random erased image. @@ -1708,11 +1708,11 @@ class RandomErasing(BaseTransform): A callable object of RandomErasing. Examples: - + .. code-block:: python import paddle - + fake_img = paddle.randn((3, 10, 10)).astype(paddle.float32) transform = paddle.vision.transforms.RandomErasing() result = transform(fake_img) @@ -1755,10 +1755,10 @@ class RandomErasing(BaseTransform): Args: img (paddle.Tensor | np.array | PIL.Image): Image to be erased. - scale (sequence, optional): The proportional range of the erased area to the input image. + scale (sequence, optional): The proportional range of the erased area to the input image. ratio (sequence, optional): Aspect ratio range of the erased area. value (sequence | None): The value each pixel in erased area will be replaced with. - If value is a sequence with length 3, the R, G, B channels will be ereased + If value is a sequence with length 3, the R, G, B channels will be ereased respectively. If value is None, each pixel will be erased with random values. Returns: diff --git a/r/example/mobilenet.py b/r/example/mobilenet.py index 99e755ab69f..deaa6582cd2 100755 --- a/r/example/mobilenet.py +++ b/r/example/mobilenet.py @@ -1,13 +1,13 @@ #!/usr/bin/env python3.7 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/tools/check_ut.py b/tools/check_ut.py index f5fe4c687dd..a9172799225 100644 --- a/tools/check_ut.py +++ b/tools/check_ut.py @@ -28,10 +28,10 @@ class PRChecker(object): self.repo = None def check(self, filename, msg): - """ + """ Args: - filename (str): File to get block names. - msg (str): Error message. + filename (str): File to get block names. + msg (str): Error message. 
""" pr_id = os.getenv('GIT_PR_ID') if not pr_id: diff --git a/tools/codestyle/test_docstring_checker.py b/tools/codestyle/test_docstring_checker.py index b05983dd343..8c6c7f8de53 100644 --- a/tools/codestyle/test_docstring_checker.py +++ b/tools/codestyle/test_docstring_checker.py @@ -24,8 +24,8 @@ class TestDocstring(pylint.testutils.CheckerTestCase): def test_one_line(self): func_node = astroid.extract_node(''' - def test(): - """get + def test(): + """get news. """ if True: @@ -40,7 +40,7 @@ class TestDocstring(pylint.testutils.CheckerTestCase): def test_one_line_1(self): func_node = astroid.extract_node(''' - def test(): + def test(): """get news""" if True: return 5 @@ -54,7 +54,7 @@ class TestDocstring(pylint.testutils.CheckerTestCase): def test_args(self): func_node = astroid.extract_node(''' - def test(scale, mean): + def test(scale, mean): """get news. Args: scale (int): scale is the number. @@ -75,7 +75,7 @@ class TestDocstring(pylint.testutils.CheckerTestCase): def test_missing(self): func_node = astroid.extract_node(''' - def test(): + def test(): mean=scale mean=scale mean=scale @@ -96,11 +96,11 @@ class TestDocstring(pylint.testutils.CheckerTestCase): def test_indent(self): func_node = astroid.extract_node(''' - def test(): + def test(): """ get get get get get get get get get get get get get get get get. """ - pass + pass ''') self.checker.visit_functiondef(func_node) @@ -110,7 +110,7 @@ class TestDocstring(pylint.testutils.CheckerTestCase): def test_with_resturns(self): func_node = astroid.extract_node(''' - def test(): + def test(): """get news. Args: scale (int): scale is the number. @@ -136,7 +136,7 @@ class TestDocstring(pylint.testutils.CheckerTestCase): def test_with_raises(self): func_node = astroid.extract_node(''' - def test(): + def test(): """get news. Args: scale (int): scale is the number. diff --git a/tools/count_api_without_core_ops.py b/tools/count_api_without_core_ops.py index c408b30a136..8daddde398b 100644 --- a/tools/count_api_without_core_ops.py +++ b/tools/count_api_without_core_ops.py @@ -209,8 +209,8 @@ if __name__ == "__main__": print(name, func_dict[name]) else: - print("""Usage: - 1. Count and list all operator-raleated APIs that contains append_op but not _legacy_C_ops.xx. + print("""Usage: + 1. Count and list all operator-raleated APIs that contains append_op but not _legacy_C_ops.xx. python ./count_api_without_core_ops.py -c paddle 2. Print api and the md5 of source code of the api. python ./count_api_without_core_ops.py -p paddle diff --git a/tools/coverage/coverage_diff.py b/tools/coverage/coverage_diff.py index fc5a34364c5..713afe1ebc3 100644 --- a/tools/coverage/coverage_diff.py +++ b/tools/coverage/coverage_diff.py @@ -24,7 +24,7 @@ import sys def get_diff_file_lines(diff_file): """ Args: - diff_file (str): File to get modified lines. + diff_file (str): File to get modified lines. Returns: dict: The diff lines of files. @@ -67,7 +67,7 @@ def get_info_file_lines(info_file, diff_file): """ Args: info_file (str): File generated by lcov. - diff_file (str): File to get modified lines. + diff_file (str): File to get modified lines. Returns: None diff --git a/tools/coverage/coverage_diff_list.py b/tools/coverage/coverage_diff_list.py index 13ba471c13a..c57c1e9d964 100644 --- a/tools/coverage/coverage_diff_list.py +++ b/tools/coverage/coverage_diff_list.py @@ -24,8 +24,8 @@ import sys def filter_by(list_file, max_rate): """ Args: - list_file (str): File of list. - max_rate (float): Max rate. + list_file (str): File of list. 
+ max_rate (float): Max rate. Returns: tuple: File and coverage rate. diff --git a/tools/coverage/pull_request.py b/tools/coverage/pull_request.py index 53325d36820..cecb22bfc3f 100644 --- a/tools/coverage/pull_request.py +++ b/tools/coverage/pull_request.py @@ -45,7 +45,7 @@ def get_pull(pull_id): def get_files(args): """ Args: - args (argparse.ArgumentParser().parse_args()): Arguments. + args (argparse.ArgumentParser().parse_args()): Arguments. Returns: None. @@ -60,7 +60,7 @@ def get_files(args): def diff(args): """ Args: - args (argparse.ArgumentParser().parse_args()): Arguments. + args (argparse.ArgumentParser().parse_args()): Arguments. Returns: None. diff --git a/tools/diff_api.py b/tools/diff_api.py index 8dabf316c2d..8f786474242 100644 --- a/tools/diff_api.py +++ b/tools/diff_api.py @@ -40,7 +40,7 @@ for each_diff in result: if each_diff[0] != ' ': diffs.append(each_diff) ''' -If you modify/add/delete the API files, including code and comment, +If you modify/add/delete the API files, including code and comment, please follow these steps in order to pass the CI: 1. cd ${paddle_path}, compile paddle; diff --git a/tools/diff_unittest.py b/tools/diff_unittest.py index 178fd1647d9..0bd5c2cf96a 100644 --- a/tools/diff_unittest.py +++ b/tools/diff_unittest.py @@ -42,7 +42,7 @@ for i in origin: error = True diffs.append(i) ''' -If you delete the unit test, such as commenting it out, +If you delete the unit test, such as commenting it out, please ask for approval of one RD below for passing CI: - kolinwei(recommended) or zhouwei25 diff --git a/tools/final_ut_parallel_rule.py b/tools/final_ut_parallel_rule.py index 2c696e3139d..5f091219b1f 100644 --- a/tools/final_ut_parallel_rule.py +++ b/tools/final_ut_parallel_rule.py @@ -135,7 +135,7 @@ def classify_cases_by_mem(rootPath): always_timeout_list[0]) always_timeout_list.pop(0) else: - f.write(case_mem_1_line + '\n') + f.write(case_mem_1_line + '\n') count += 1 ''' case_mem_1_line = '^job$|^' + index[0] + '$' diff --git a/tools/gen_ut_cmakelists.py b/tools/gen_ut_cmakelists.py index 58cb6b42461..32ec3010602 100644 --- a/tools/gen_ut_cmakelists.py +++ b/tools/gen_ut_cmakelists.py @@ -143,7 +143,7 @@ def _process_run_serial(run_serial): def _file_with_extension(prefix, suffixes): """ Desc: - check whether test file exists. + check whether test file exists. """ for ext in suffixes: if os.path.isfile(prefix + ext): @@ -329,7 +329,7 @@ class DistUTPortManager(): If the directories are newly created or there is no CMakeLists.txt before, or ignore this error, you must specify the directories using the args option --ignore-cmake-dirs/-i. If you want to keep the dist ports of old tests unchanged, please ensure the old - verson CMakeLists.txt file existing before using the gen_ut_cmakelists tool to + verson CMakeLists.txt file existing before using the gen_ut_cmakelists tool to generate new CmakeLists.txt files. ==================================================================================== """ @@ -481,7 +481,7 @@ class CMakeGenerator(): return cmds = """# This file is generated by ${PADDLE_ROOT}/tools/gen_ut_cmakelists.py. # Please don't modify this file manually. 
- # If you need to change unittests in this file, please modify testslist.csv in the current directory + # If you need to change unittests in this file, please modify testslist.csv in the current directory # and then run the command `python3 ${PADDLE_ROOT}/tools/gen_ut_cmakelists.py -f ${CURRENT_DIRECTORY}/testslist.csv` set(LOCAL_ALL_ARCH ON) set(LOCAL_ALL_PLAT ON)\n""" diff --git a/tools/jetson_infer_op.py b/tools/jetson_infer_op.py index d046483efda..c84d7d50d63 100644 --- a/tools/jetson_infer_op.py +++ b/tools/jetson_infer_op.py @@ -152,7 +152,7 @@ def set_diff_value(file, atol="1e-5", inplace_atol="1e-7"): """ :param file: refer to op_test.py :param atol: refer to op_test.py - :param inplace_atol: + :param inplace_atol: :return: """ os.system("sed -i 's/self.check_output(/self\.check_output\(atol=" + atol + diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 5e5003cf53e..575ad9c8821 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -230,7 +230,7 @@ HIGH_PARALLEL_JOB_NEW = [ 'test_launch_coverage', 'test_mkldnn_conv_activation_fuse_pass', 'test_inference_model_io', - 'test_fusion_repeated_fc_relu_op', #'heter_listen_and_server_test', + 'test_fusion_repeated_fc_relu_op', #'heter_listen_and_server_test', 'cudnn_desc_test', 'test_beam_search_op', 'test_var_conv_2d', diff --git a/tools/prune_for_jetson.py b/tools/prune_for_jetson.py index cbc03393360..4be99a85c41 100644 --- a/tools/prune_for_jetson.py +++ b/tools/prune_for_jetson.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -This script simply removes all grad ops and kernels. You should use this script +This script simply removes all grad ops and kernels. You should use this script when cmake ON_INFER=ON, which can greatly reduce the volume of the prediction library. """ diff --git a/tools/remove_grad_op_and_kernel.py b/tools/remove_grad_op_and_kernel.py index bbf5616fc43..8f01895d01b 100644 --- a/tools/remove_grad_op_and_kernel.py +++ b/tools/remove_grad_op_and_kernel.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -This script simply removes all grad ops and kernels. You should use this script +This script simply removes all grad ops and kernels. You should use this script when cmake ON_INFER=ON, which can greatly reduce the volume of the prediction library. """ diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index 6a9b4729e40..9858925ca72 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -13,12 +13,12 @@ # limitations under the License. """ please make sure to run in the tools path -usage: python sample_test.py {cpu or gpu} +usage: python sample_test.py {cpu or gpu} {cpu or gpu}: running in cpu version or gpu version for example, you can run cpu version python2 testing like this: - python sampcd_processor.py cpu + python sampcd_processor.py cpu """ import os @@ -115,7 +115,7 @@ def extract_code_blocks_from_docstr(docstr): Args: docstr(str): docstring Return: - code_blocks: A list of code-blocks, indent removed. + code_blocks: A list of code-blocks, indent removed. element {'name': the code-block's name, 'id': sequence id. 'codes': codes, 'required': 'gpu'} """ @@ -237,7 +237,7 @@ def get_test_capacity(): def is_required_match(requirestr, cbtitle='not-specified'): """ search the required instruction in the code-block, and check it match the current running environment. 
- + environment values of equipped: cpu, gpu, xpu, distributed, skip the 'skip' is the special flag to skip the test, so is_required_match will return False directly. @@ -387,7 +387,7 @@ def execute_samplecode(tfname): Args: tfname: the filename of the sample code - + Returns: result: success or not tfname: same as the input argument @@ -482,7 +482,7 @@ def get_api_md5(path): Args: path: the api spec file. ATTENTION the path relative - + Returns: api_md5(dict): key is the api's real fullname, value is the md5sum. """ -- GitLab
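For anyone who wants to apply the same kind of cleanup to their own branch, the short Python sketch below strips trailing whitespace (flake8 W291/W293) from every .py file under a directory. It is only an illustration and is not the tooling used to produce the patch above; the regex, helper name, and default search root are assumptions. In day-to-day use, a linter hook such as pre-commit's trailing-whitespace hook typically handles this automatically.

    #!/usr/bin/env python
    # Illustrative sketch only: bulk-trim trailing whitespace from Python sources.
    # Not part of the patch above; the default root and helper name are assumptions.
    import pathlib
    import re

    # Match spaces/tabs sitting directly before a line break, or at end of file.
    TRAILING_WS = re.compile(r"[ \t]+(?=\r?\n)|[ \t]+\Z")

    def strip_trailing_whitespace(root="."):
        """Rewrite every .py file under ``root`` with trailing whitespace removed."""
        for path in pathlib.Path(root).rglob("*.py"):
            text = path.read_text(encoding="utf-8")
            cleaned = TRAILING_WS.sub("", text)
            if cleaned != text:
                path.write_text(cleaned, encoding="utf-8")
                print(f"trimmed: {path}")

    if __name__ == "__main__":
        strip_trailing_whitespace()

Run from the repository root, it rewrites only the files that actually contain trailing whitespace, which keeps the resulting diff limited to whitespace-only hunks like the ones in this patch.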