From 4087de2c76a0636bb209246924b4f2c1eaee523e Mon Sep 17 00:00:00 2001
From: Cheerego <35982308+shanyi15@users.noreply.github.com>
Date: Tue, 16 Apr 2019 23:05:14 +0800
Subject: [PATCH] fix_typo_and_adjust_structure (#797)

---
 .../cpu_train_best_practice.rst               |    4 +-
 .../cpu_train_best_practice_en.rst            |    0
 .../best_practice/dist_training_gpu.rst       |    9 +-
 .../advanced_usage/best_practice/index_cn.rst |    9 +
 .../advanced_usage/best_practice/index_en.rst |    8 +
 .../design_idea/fluid_design_idea.md          |   28 +-
 doc/fluid/advanced_usage/index.rst            |    4 +-
 doc/fluid/advanced_usage/index_en.rst         |    1 +
 doc/fluid/api_cn/layers_cn.rst                | 1022 +++++++++--------
 doc/fluid/api_guides/high_low_level_api.md    |   14 -
 doc/fluid/api_guides/high_low_level_api_en.md |   14 -
 doc/fluid/api_guides/index.rst                |   15 +-
 doc/fluid/api_guides/index_en.rst             |   18 +-
 .../low_level/distributed/async_training.rst  |    4 +-
 .../distributed/async_training_en.rst         |   10 +-
 .../low_level/distributed/index.rst           |    3 -
 .../low_level/distributed/index_en.rst        |    1 -
 .../low_level/distributed/sync_training.rst   |   12 +-
 .../distributed/sync_training_en.rst          |   12 +-
 doc/fluid/api_guides/low_level/program.rst    |   16 +-
 doc/fluid/api_guides/low_level/program_en.rst |   12 +-
 .../install/compile/compile_MacOS.md          |    1 +
 .../install/compile/compile_MacOS_en.md       |    4 +-
 .../programming_guide/programming_guide.md    |    2 +-
 24 files changed, 606 insertions(+), 617 deletions(-)
 rename doc/fluid/{api_guides/low_level/distributed => advanced_usage/best_practice}/cpu_train_best_practice.rst (98%)
 rename doc/fluid/{api_guides/low_level/distributed => advanced_usage/best_practice}/cpu_train_best_practice_en.rst (100%)
 create mode 100644 doc/fluid/advanced_usage/best_practice/index_cn.rst
 create mode 100644 doc/fluid/advanced_usage/best_practice/index_en.rst
 delete mode 100644 doc/fluid/api_guides/high_low_level_api.md
 delete mode 100644 doc/fluid/api_guides/high_low_level_api_en.md

diff --git a/doc/fluid/api_guides/low_level/distributed/cpu_train_best_practice.rst b/doc/fluid/advanced_usage/best_practice/cpu_train_best_practice.rst
similarity index 98%
rename from doc/fluid/api_guides/low_level/distributed/cpu_train_best_practice.rst
rename to doc/fluid/advanced_usage/best_practice/cpu_train_best_practice.rst
index 06ebbbc27..7461346a9 100644
--- a/doc/fluid/api_guides/low_level/distributed/cpu_train_best_practice.rst
+++ b/doc/fluid/advanced_usage/best_practice/cpu_train_best_practice.rst
@@ -1,8 +1,8 @@
 .. _api_guide_cpu_training_best_practice:
 
-##################
+#####################
 分布式CPU训练最佳实践
-##################
+#####################
 
 提高CPU分布式训练的训练速度，主要要从两个方面来考虑：
 1）提高训练速度，主要是提高CPU的使用率；2）提高通信速度，主要是减少通信传输的数据量。
diff --git a/doc/fluid/api_guides/low_level/distributed/cpu_train_best_practice_en.rst b/doc/fluid/advanced_usage/best_practice/cpu_train_best_practice_en.rst
similarity index 100%
rename from doc/fluid/api_guides/low_level/distributed/cpu_train_best_practice_en.rst
rename to doc/fluid/advanced_usage/best_practice/cpu_train_best_practice_en.rst
diff --git a/doc/fluid/advanced_usage/best_practice/dist_training_gpu.rst b/doc/fluid/advanced_usage/best_practice/dist_training_gpu.rst
index ca21cc621..cf15e4510 100644
--- a/doc/fluid/advanced_usage/best_practice/dist_training_gpu.rst
+++ b/doc/fluid/advanced_usage/best_practice/dist_training_gpu.rst
@@ -1,7 +1,8 @@
 .. _best_practice_dist_training_gpu:
 
-性能优化最佳实践之：GPU分布式训练
-============================
+#####################
+分布式GPU训练最佳实践
+#####################
 
 开始优化您的GPU分布式训练任务
 -------------------------
@@ -170,7 +171,7 @@ PaddlePaddle Fluid使用“线程池” [#]_ 模型调度并执行Op，Op在启
 数据读取的优化在GPU训练中至关重要，尤其在不断增加batch_size提升吞吐时，计算对reader性能会有更高对要求，
 优化reader性能需要考虑的点包括：
 
-1. 使用 :code:`pyreader` 
+1. 使用 :code:`pyreader`
    参考 `这里 <../../user_guides/howto/prepare_data/use_py_reader.html>`_
    使用pyreader，并开启 :code:`use_double_buffer`
 2. reader返回uint8类型数据
@@ -229,7 +230,7 @@ PaddlePaddle Fluid使用“线程池” [#]_ 模型调度并执行Op，Op在启
               for batch_id in (iters_per_pass):
                   exe.run()
           pyreader.reset()
-   
+
 
 使用混合精度训练
 ++++++++++++++
diff --git a/doc/fluid/advanced_usage/best_practice/index_cn.rst b/doc/fluid/advanced_usage/best_practice/index_cn.rst
new file mode 100644
index 000000000..f3555a3e8
--- /dev/null
+++ b/doc/fluid/advanced_usage/best_practice/index_cn.rst
@@ -0,0 +1,9 @@
+#########
+最佳实践
+#########
+
+..  toctree::
+    :hidden:
+
+    cpu_train_best_practice.rst
+    dist_training_gpu.rst
diff --git a/doc/fluid/advanced_usage/best_practice/index_en.rst b/doc/fluid/advanced_usage/best_practice/index_en.rst
new file mode 100644
index 000000000..45ffc31af
--- /dev/null
+++ b/doc/fluid/advanced_usage/best_practice/index_en.rst
@@ -0,0 +1,8 @@
+###############
+Best Practice
+###############
+
+..  toctree::
+    :hidden:
+
+    cpu_train_best_practice_en.rst
diff --git a/doc/fluid/advanced_usage/design_idea/fluid_design_idea.md b/doc/fluid/advanced_usage/design_idea/fluid_design_idea.md
index 43feba2d5..189ea5c0f 100644
--- a/doc/fluid/advanced_usage/design_idea/fluid_design_idea.md
+++ b/doc/fluid/advanced_usage/design_idea/fluid_design_idea.md
@@ -21,28 +21,28 @@ Fluid使用一种编译器式的执行流程，分为编译时和运行时两个
 </p>
 
  1. 编译时，用户编写一段python程序，通过调用 Fluid 提供的算子，向一段 Program 中添加变量（Tensor）以及对变量的操作（Operators 或者 Layers）。用户只需要描述核心的前向计算，不需要关心反向计算、分布式下以及异构设备下如何计算。
- 
+
  2. 原始的 Program 在平台内部转换为中间描述语言： `ProgramDesc`。
- 
+
  3. 编译期最重要的一个功能模块是 `Transpiler`。`Transpiler` 接受一段 `ProgramDesc` ，输出一段变化后的 `ProgramDesc` ，作为后端 `Executor` 最终需要执行的 Fluid Program
 
  4. 后端 Executor 接受 Transpiler 输出的这段 Program ，依次执行其中的 Operator（可以类比为程序语言中的指令），在执行过程中会为 Operator 创建所需的输入输出并进行管理。
-	
 
 
- 
-## 2. Program设计思想 
+
+
+## 2. Program设计思想
 
 用户完成网络定义后，一段 Fluid 程序中通常存在 2 段 Program：
 
   1. fluid.default_startup_program：定义了创建模型参数，输入输出，以及模型中可学习参数的初始化等各种操作
-    
+
     default_startup_program 可以由框架自动生成，使用时无需显示地创建
-    
+
     如果调用修改了参数的默认初始化方式，框架会自动的将相关的修改加入default_startup_program
-  
+
   2. fluid.default_main_program ：定义了神经网络模型，前向反向计算，以及优化算法对网络中可学习参数的更新
-    
+
     使用Fluid的核心就是构建起 default_main_program
 
 
@@ -53,7 +53,7 @@ Fluid 的 Program 的基本结构是一些嵌套 blocks，形式上类似一段
 blocks中包含：
 
 -  本地变量的定义
--  一系列的operator 
+-  一系列的operator
 
 block的概念与通用程序一致，例如在下列这段C++代码中包含三个block：
 
@@ -95,7 +95,7 @@ prob = ie()
 ```
 ### BlockDesc and ProgramDesc
 
-用户描述的block与program信息在Fluid中以[protobuf](https://en.wikipedia.org/wiki/Protocol_Buffers) 格式保存，所有的`protobub`信息被定义在`framework.proto`中，在Fluid中被称为BlockDesc和ProgramDesc。ProgramDesc和BlockDesc的概念类似于一个[抽象语法树](https://en.wikipedia.org/wiki/Abstract_syntax_tree)。
+用户描述的block与program信息在Fluid中以[protobuf](https://en.wikipedia.org/wiki/Protocol_Buffers) 格式保存，所有的`protobuf`信息被定义在`framework.proto`中，在Fluid中被称为BlockDesc和ProgramDesc。ProgramDesc和BlockDesc的概念类似于一个[抽象语法树](https://en.wikipedia.org/wiki/Abstract_syntax_tree)。
 
 `BlockDesc`中包含本地变量的定义`vars`，和一系列的operator`ops`：
 
@@ -172,12 +172,12 @@ class Executor{
 				Scope* scope,
 				int block_id) {
 			auto& block = pdesc.Block(block_id);
-			
+
 			//创建所有变量
 			for (auto& var : block.AllVars())
 				scope->Var(Var->Name());
 			}
-			
+
 			//创建OP并按顺序执行
 			for (auto& op_desc : block.AllOps()){
 				auto op = CreateOp(*op_desc);
@@ -300,7 +300,7 @@ BlockDesc中包含定义的 vars 和一系列的 ops，以输入x为例，python
 x = fluid.layers.data(name="x",shape=[1],dtype='float32')
 ```
 在BlockDesc中，变量x被描述为：
-``` 
+```
 vars {
     name: "x"
     type {
diff --git a/doc/fluid/advanced_usage/index.rst b/doc/fluid/advanced_usage/index.rst
index 6b79fee1c..51ed37fda 100644
--- a/doc/fluid/advanced_usage/index.rst
+++ b/doc/fluid/advanced_usage/index.rst
@@ -29,5 +29,5 @@
     development/profiling/index_cn.rst
     development/contribute_to_paddle/index_cn.rst
     development/write_docs_cn.md
-    best_practice/dist_training_gpu.rst
-    paddle_slim/paddle_slim.md 
+    best_practice/index_cn.rst
+    paddle_slim/paddle_slim.md
diff --git a/doc/fluid/advanced_usage/index_en.rst b/doc/fluid/advanced_usage/index_en.rst
index 0a839adc0..eef0527ca 100644
--- a/doc/fluid/advanced_usage/index_en.rst
+++ b/doc/fluid/advanced_usage/index_en.rst
@@ -29,3 +29,4 @@ We gladly encourage your contributions of codes and documentation to our communi
     development/profiling/index_en.rst
     development/contribute_to_paddle/index_en.rst
     development/write_docs_en.md
+    best_practice/index_en.rst
diff --git a/doc/fluid/api_cn/layers_cn.rst b/doc/fluid/api_cn/layers_cn.rst
index 56de3309c..ccc1d4e87 100644
--- a/doc/fluid/api_cn/layers_cn.rst
+++ b/doc/fluid/api_cn/layers_cn.rst
@@ -4,7 +4,7 @@ fluid.layers
 
 
 ============
-control_flow 
+control_flow
 ============
 
 
@@ -17,7 +17,7 @@ array_length
 
 **得到输入LoDTensorArray的长度**
 
-此功能用于查找输入数组LOD_TENSOR_ARRAY的长度。  
+此功能用于查找输入数组LOD_TENSOR_ARRAY的长度。
 
 相关API:
     - :ref:`cn_api_fluid_layers_array_read`
@@ -92,7 +92,7 @@ array_read
 
 
 .. _cn_api_fluid_layers_array_write:
-    
+
 array_write
 -------------------------------
 
@@ -106,8 +106,8 @@ array_write
     - **x** (Variable|list) – 待从中读取数据的输入张量(tensor)
     - **i** (Variable|list) – 输出结果 ``LOD_TENSOR_ARRAY`` 的下标, 该下标指向输入张量 ``x`` 写入输出数组的位置
     - **array** (Variable|list) – 会被输入张量 ``x`` 写入的输出结果 ``LOD_TENSOR_ARRAY`` 。如果该项值为None， 一个新的 ``LOD_TENSOR_ARRAY`` 将会被创建并作为结果返回
- 
-返回:	输入张量 ``x`` 所写入的输出结果 ``LOD_TENSOR_ARRAY``  
+
+返回:	输入张量 ``x`` 所写入的输出结果 ``LOD_TENSOR_ARRAY``
 
 返回类型:	变量（Variable）
 
@@ -139,7 +139,7 @@ create_array
 
 创建LoDTensorArray数组。它主要用于实现RNN与array_write, array_read和While。
 
-参数: 
+参数:
     - **dtype** (int |float) — lod_tensor_array中存储元素的数据类型。
 
 返回: lod_tensor_array， 元素数据类型为dtype。
@@ -150,10 +150,10 @@ create_array
 **代码示例**
 
 ..  code-block:: python
-  
+
   data = fluid.layers.create_array(dtype='float32')
-  
-  
+
+
 
 
 
@@ -211,9 +211,8 @@ memory用于缓存分段数据。memory的初始值可以是零，也可以是
 
 参数:
     	- **x** (Variable) - 输入序列
-        - **level** (int) - 用于拆分步骤的LOD层级，默认值0
-	
-    	
+        - **level** (int) - 用于拆分步骤的LOD层级，默认值0
+
 返回:当前的输入序列中的timestep。
 
 .. py:method:: static_input(x)
@@ -232,13 +231,13 @@ memory用于缓存分段数据。memory的初始值可以是零，也可以是
 .. py:method:: memory(init=None, shape=None, value=0.0, need_reorder=False, dtype='float32')
 
 为动态rnn创建一个memory 变量。
-    
+
 如果 ``init`` 不是None， ``memory`` 将由这个变量初始化。参数 ``need_reorder`` 用于将memory重新排序作为输入变量。当memory初始化依赖于输入样本时，应该将其设置为true。
 
 **例如**
 
 ..  code-block:: python
-  
+
   	import paddle.fluid as fluid
   	sentence = fluid.layers.data(
                  name='sentence', dtype='float32', shape=[32])
@@ -253,15 +252,15 @@ memory用于缓存分段数据。memory的初始值可以是零，也可以是
 			 input=[word, memory], size=10, act='tanh')
 	     drnn.update_memory(ex_mem=memory, new_mem=hidden)
 	     drnn.output(hidden)
-	   
+
 	rnn_output = drnn()
 
 
 
 否则，如果已经设置 ``shape`` 、 ``value`` 、 ``dtype`` ，memory将被 ``value`` 初始化
-  
+
 ..  code-block:: python
-  
+
 	import paddle.fluid as fluid
 
 	sentence = fluid.layers.data(
@@ -293,7 +292,7 @@ memory用于缓存分段数据。memory的初始值可以是零，也可以是
 将内存从 ``ex_mem`` 更新到 ``new_mem`` 。注意， ``ex_mem`` 和 ``new_mem`` 的 ``shape`` 和数据类型必须相同。
 
 参数：
-	- **ex_mem** （memory Variable）-  memory 变量（Variable） 
+	- **ex_mem** （memory Variable）-  memory 变量（Variable）
 	- **new_mem** （memory Variable）- RNN块中生成的平坦变量（plain  variable）
 
 返回：None
@@ -307,8 +306,8 @@ memory用于缓存分段数据。memory的初始值可以是零，也可以是
     - **\*outputs** - 输出变量。
 
 返回:None
- 
- 
+
+
 
 
 
@@ -332,11 +331,11 @@ equal
     - **y** (Variable)-equal的第二个操作数
     - **cond** (Variable|None)-输出变量（可选），用来存储equal的结果
 
-返回：张量类型的变量，存储equal的输出结果 
+返回：张量类型的变量，存储equal的输出结果
 
-返回类型：变量（Variable） 
+返回类型：变量（Variable）
 
-**代码示例**: 
+**代码示例**:
 
 .. code-block:: python
 
@@ -357,7 +356,7 @@ IfElse
 
 .. py:class:: paddle.fluid.layers.IfElse(cond, name=None)
 
-if-else控制流。  
+if-else控制流。
 
 参数：
     - **cond** (Variable)-用于比较的条件
@@ -394,13 +393,13 @@ if-else控制流。
 
 
 .. _cn_api_fluid_layers_increment:
-  
+
 increment
 -------------------------------
-  
+
 .. py:function:: paddle.fluid.layers.increment(x, value=1.0, in_place=True)
 
-   
+
 该函数为输入 ``x`` 增加 ``value`` 大小, ``value`` 即函数中待传入的参数。该函数默认直接在原变量 ``x`` 上进行运算。
 
 .. note::
@@ -418,13 +417,13 @@ increment
 **代码示例**
 
 ..  code-block:: python
-  
+
     data = fluid.layers.data(name='data', shape=[1], dtype='float32',
                          append_batch_size=False)
     data = fluid.layers.increment(x=data, value=3.0, in_place=True)
- 
- 
- 
+
+
+
 
 
 
@@ -486,7 +485,7 @@ less_than
     import paddle.fluid as fluid
     less = fluid.layers.less_than(x=label, y=limit)
 
-参数：  
+参数：
     - **x** (Variable) – ``less_than`` 运算的左操作数
     - **y** (Variable) – ``less_than`` 运算的右操作数
     - **force_cpu** (BOOLEAN) – 值True则强制将输出变量写入CPU内存中。否则，将其写入目前所在的运算设备上。默认为True
@@ -564,19 +563,19 @@ reorder_lod_tensor_by_rank
 
 
 ::
-	
+
   例如:
- 
+
   假设在 RankTable 中存储的序列索引为 [3,0,2,1]， X 将会被这样被重新排列：
   X 中的第四个序列（即索引为3的序列，后面以此类推）会变成排列后的batch中的第一个，紧接着就是原来batch中的第一个元素，第三个元素，和第二个元素。
-  简言之，若有原batch：X = [Seq0, Seq1, Seq2, Seq3] 且 RankTable 中的索引为 [3,0,2,1]，那么输出即为 Out = [Seq3, Seq0, Seq2, Seq1] ，它携带着新的LoD信息。	
+  简言之，若有原batch：X = [Seq0, Seq1, Seq2, Seq3] 且 RankTable 中的索引为 [3,0,2,1]，那么输出即为 Out = [Seq3, Seq0, Seq2, Seq1] ，它携带着新的LoD信息。
   如果 X 的LoD信息是空的，这表明 X 不是序列型数据。这和由多个定长为1的序列组成的batch是相同的情况。此时，该函数将对 X 中的切片（slice） 在第一轴(axis)上按 rank_table 里的规则加以排列。
   例如，现有 X = [Slice0, Slice1, Slice2, Slice3] ，并且它LoD信息为空，在 RankTable 索引为[3, 0, 2, 1]。则 Out = [Slice3, Slice0, Slice2, Slice1] ，并且不在其中追加LoD信息。
 
 注意，该operator对 ``X`` 进行的排序所依据的 ``LoDRankTable`` 不一定是在 ``X`` 的基础上得出来的。它可以由
 其他不同的序列batch得出，并由该operator依据这个 ``LoDRankTable`` 来对  ``X`` 排序。
 
-参数：   
+参数：
     - **x** (LoDTensor)-待根据提供的 ``RankTable`` 进行排序的LoD tensor
     - **rank_table** (LoDRankTable)- ``X`` 重新排序的依据规则表
 
@@ -620,7 +619,7 @@ StaticRNN
 
 
 
- 
+
 
 
 
@@ -637,7 +636,7 @@ Switch
 .. py:class:: paddle.fluid.layers.Switch (name=None)
 
 Switch类实现的功能十分类似if-elif-else。它可以在学习率调度器(learning rate scheduler)中调整学习率。
-:: 
+::
   语义上，
       1. switch控制流挨个检查cases
       2. 各个case的条件是一个布尔值(boolean)，它是一个标量(scalar)变量
@@ -647,7 +646,7 @@ Switch类实现的功能十分类似if-elif-else。它可以在学习率调度
 **代码示例**
 
 ..  code-block:: python
-    
+
     lr = fluid.layers.tensor.create_global_var(
         shape=[1],
         value=0.0,
@@ -664,12 +663,12 @@ Switch类实现的功能十分类似if-elif-else。它可以在学习率调度
             fluid.layers.tensor.assign(input=one_var, output=lr)
         with switch.default():
             fluid.layers.tensor.assign(input=two_var, output=lr)
- 
+
 .. py:method:: case(condition)
 
 为该condition（情况，条件）建立新的block（块）。
-  
-  
+
+
 .. py:method:: default()
 
 为该switch建立default case。
@@ -696,11 +695,11 @@ While
 该类用于实现while循环控制功能。
 
 
-参数：  
+参数：
 		- **cond** (Variable) – 用于比较的条件
 		- **is_test** (bool) – 用于表明是不是在测试阶段执行
 		- **name** (str) - 该层的命名
- 
+
 **代码示例**
 
 ..  code-block:: python
@@ -708,7 +707,7 @@ While
   d0 = fluid.layers.data("d0", shape=[10], dtype='float32')
   data_array = fluid.layers.array_write(x=d0, i=i)
   array_len = fluid.layers.fill_constant(shape=[1],dtype='int64', value=3)
-  
+
   cond = fluid.layers.less_than(x=i, y=array_len)
   while_op = fluid.layers.While(cond=cond)
   with while_op.block():
@@ -728,7 +727,7 @@ While
 
 
 ============
- io 
+ io
 ============
 
 
@@ -800,12 +799,12 @@ create_py_reader_by_data
 **代码示例：**
 
  :code:`py_reader` 的基本用法如下所示：
-        
+
 .. code-block:: python
 
     import paddle.fluid as fluid
     import paddle.dataset.mnist as mnist
-    
+
     image = fluid.layers.data(name='image', shape=[3,224,224], dtypes='float32')
     label = fluid.layers.data(name='label', shape=[1], dtypes='int64')
     reader = fluid.layers.create_py_reader_by_data(capacity=64, feed_list=[image, label])
@@ -815,7 +814,7 @@ create_py_reader_by_data
     loss = network(img, label) # some network definition
 
     fluid.Executor(fluid.CUDAPlace(0)).run(fluid.default_startup_program())
-    
+
     exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
     for epoch_id in range(10):
         reader.start()
@@ -1044,7 +1043,7 @@ py_reader
 
 该layer返回一个Reader Variable。reader提供了 ``decorate_paddle_reader()`` 和 ``decorate_tensor_provider()`` 来设置Python generator，作为Python端的数据源。在c++端调用 ``Executor::Run()`` 时，来自generator的数据将被自动读取。与 ``DataFeeder.feed()`` 不同，数据读取进程和  ``Executor::Run()`` 进程可以使用 ``py_reader`` 并行运行。reader的 ``start()`` 方法应该在每次数据传递开始时调用，在传递结束和抛出  ``fluid.core.EOFException`` 后执行 ``reset()`` 方法。注意， ``Program.clone()`` 方法不能克隆 ``py_reader`` 。
 
-参数:	
+参数:
   - **capacity** (int) –  ``py_reader`` 维护的缓冲区容量
   - **shapes** (list|tuple) –数据形状的元组或列表.
   - **dtypes** (list|tuple) –  ``shapes`` 对应元素的数据类型
@@ -1055,7 +1054,7 @@ py_reader
 返回:    reader，从reader中可以获取feed的数据
 
 返回类型:	Variable
-	
+
 
 
 **代码示例**
@@ -1279,7 +1278,7 @@ shuffle
 
 
 ============
- nn 
+ nn
 ============
 
 .. _cn_api_fluid_layers_adaptive_pool2d:
@@ -1439,14 +1438,14 @@ add_position_encoding
 
 接受形状为[N×M×P]的三维输入张量，并返回一个形为[N×M×P]的输出张量，且输出张量具有位置编码值。
 
-可参考论文: `Attention Is All You Need <http://arxiv.org/pdf/1706.03762.pdf>`_ 
+可参考论文: `Attention Is All You Need <http://arxiv.org/pdf/1706.03762.pdf>`_
 
 .. math::
 
   PE(pos, 2i) &= \sin{(pos / 10000^{2i / P})}\\
   PE(pos, 2i + 1) &= \cos{(pos / 10000^{2i / P})}\\
   Out(:, pos, i) &= \alpha * input(:, pos, i) + \beta * PE(pos, i)
-	
+
 其中:
     - PE(pos, 2i): 偶数位置上数字的增量
     - PE(pos, 2i + 1): 奇数位置上数字的增量
@@ -1519,7 +1518,7 @@ affine_grid
 
 
 .. code-block:: text
-        
+
         * 例 1:
           给定:
               theta = [[[x_11, x_12, x_13]
@@ -1527,15 +1526,15 @@ affine_grid
                        [[x_21, x_22, x_23]
                         [x_24, x_25, x_26]]]
               out_shape = [2, 3, 5, 5]
-          
+
           Step 1:
-              
+
               根据out_shape生成标准化坐标
 
               归一化坐标的值在-1和1之间
-              
+
               归一化坐标的形状为[2,H, W]，如下所示:
-              
+
               C = [[[-1.  -1.  -1.  -1.  -1. ]
                     [-0.5 -0.5 -0.5 -0.5 -0.5]
                     [ 0.   0.   0.   0.   0. ]
@@ -1546,11 +1545,11 @@ affine_grid
                     [-1.  -0.5  0.   0.5  1. ]
                     [-1.  -0.5  0.   0.5  1. ]
                     [-1.  -0.5  0.   0.5  1. ]]]
-              
+
               C[0]是高轴坐标，C[1]是宽轴坐标。
 
           Step2:
-              
+
               将C转换并重组成形为[H * W, 2]的张量,并追加到最后一个维度
 
               我们得到:
@@ -1581,9 +1580,9 @@ affine_grid
                     [ 0.5  1.   1. ]
                     [ 1.   1.   1. ]]
           Step3:
-              按下列公式计算输出 
+              按下列公式计算输出
 .. math::
-  
+
   Output[i] = C\_ * Theta[i]^T
 
 参数：
@@ -1695,7 +1694,7 @@ batch_norm
     - **moving_variance_name** （string，默认None）- moving_variance的名称，存储全局变量
     - **do_model_average_for_mean_and_var** （bool，默认False）- 是否为mean和variance做模型均值
     - **fuse_with_relu** （bool）- 如果为True，batch norm后该操作符执行relu
-    - **use_global_stats** （bool, Default False） – 是否使用全局均值和方差。 在预测或测试模式下，将use_global_stats设置为true或将is_test设置为true，并且行为是等效的。 在训练模式中，当设置use_global_stats为True时，在训练期间也使用全局均值和方差。 
+    - **use_global_stats** （bool, Default False） – 是否使用全局均值和方差。 在预测或测试模式下，将use_global_stats设置为true或将is_test设置为true，并且行为是等效的。 在训练模式中，当设置use_global_stats为True时，在训练期间也使用全局均值和方差。
 
 返回： 张量，在输入中运用批正则后的结果
 
@@ -1734,15 +1733,15 @@ beam_search
 注意，如果 ``is_accumulated`` 为 True，传入的 ``scores`` 应该是累积分数。反之，``scores`` 会被认为为直接得分(straightforward scores)， 并且会被转化为log值并且在此运算中会被累积到 ``pre_scores`` 中。在计算累积分数之前应该使用额外的 operators 进行长度惩罚。
 
 有关束搜索用法演示，请参阅以下示例：
-  
+
      fluid/tests/book/test_machine_translation.py
-  
+
 
 
 参数:
-  - **pre_ids** （Variable） -  LodTensor变量，它是上一步 ``beam_search`` 的输出。在第一步中。它应该是LodTensor，shape为 :math:`(batch\_size，1)` ， :math:`lod [[0,1，...，batch\_size]，[0,1，...，batch\_size]]` 
+  - **pre_ids** （Variable） -  LodTensor变量，它是上一步 ``beam_search`` 的输出。在第一步中，它应该是LodTensor，shape为 :math:`(batch\_size，1)` ， :math:`lod [[0,1，...，batch\_size]，[0,1，...，batch\_size]]`
   - **pre_scores** （Variable） -  LodTensor变量，它是上一步中beam_search的输出
-  - **ids** （Variable） - 包含候选ID的LodTensor变量。shpae为 :math:`（batch\_size×beam\_ize，K）` ，其中 ``K`` 应该是 ``beam_size``
+  - **ids** （Variable） - 包含候选ID的LodTensor变量。shape为 :math:`（batch\_size×beam\_size，K）` ，其中 ``K`` 应该是 ``beam_size``
   - **scores** （Variable） - 与 ``ids`` 及其shape对应的累积分数的LodTensor变量, 与 ``ids`` 的shape相同。
   - **beam_size** （int） - 束搜索中的束宽度。
   - **end_id** （int） - 结束标记的id。
@@ -1759,7 +1758,7 @@ beam_search
 **代码示例**
 
 ..  code-block:: python
-    
+
     # 假设 `probs` 包含计算神经元所得的预测结果
     # `pre_ids` 和 `pre_scores` 为beam_search之前时间步的输出
     topk_scores, topk_indices = fluid.layers.topk(probs, k=beam_size)
@@ -1805,7 +1804,7 @@ beam_search_decode
         - **beam_size** (int) - 束搜索中波束的宽度。
         - **end_id** (int) - 结束token的id。
         - **name** (str|None) - 该层的名称(可选)。如果设置为None，该层将被自动命名。
-    
+
 返回：	LodTensor 对（pair）， 由生成的id序列和相应的score序列组成。两个LodTensor的shape和lod是相同的。lod的level=2，这两个level分别表示每个源句有多少个假设，每个假设有多少个id。
 
 返回类型:	变量（variable）
@@ -1814,7 +1813,7 @@ beam_search_decode
 **代码示例**
 
 .. code-block:: python
-            
+
 	    # 假设 `ids` 和 `scores` 为 LodTensorArray变量，它们保留了
             # 选择出的所有时间步的id和score
             finished_ids, finished_scores = fluid.layers.beam_search_decode(
@@ -1882,12 +1881,12 @@ Bayesian Personalized Ranking Loss Operator. (贝叶斯个性化排序损失计
 该算子属于pairwise的排序类型，其标签是期望物品。在某次会话中某一给定点的损失值由下式计算而得:
 
 .. math::
-  
+
   Y[i] = -\frac{1}{N_{i}-1} * \sum_{0\le j<N_{i},~ j\neq Label[i]}\log(\sigma(X[i, Label[i]]-X[i, j]))
 
 更多细节请参考 `Session Based Recommendations with Recurrent Neural Networks <https://arxiv.org/abs/1511.06939>`_
 
-参数: 
+参数:
   - **input** (Variable|list):  一个形为[N x D]的2-D tensor , 其中 N 为批大小batch size ，D 为种类的数量。该输入为logits而非概率。
   - **label** (Variable|list):  2-D tensor<int64> 类型的真实值, 形为[N x 1]
   - **name** (str|None): （可选）该层的命名。 如果为None, 则自动为该层命名。 默认为None.
@@ -1916,7 +1915,7 @@ BRelu 激活函数
 
 .. math::   out=max(min(x,tmin),tmax)
 
-参数: 
+参数:
     - **x** (Variable) - BReluoperator的输入
     - **t_min** (FLOAT|0.0) - BRelu的最小值
     - **t_max** (FLOAT|24.0) - BRelu的最大值
@@ -2037,12 +2036,12 @@ clip
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.clip(x, min, max, name=None)
-        
+
 clip算子
 
 clip算子限制给定输入的值在一个区间内。间隔使用参数"min"和"max"来指定：公式为
 
-.. math:: 
+.. math::
         Out=min(max(X,min),max)
 
 参数：
@@ -2053,7 +2052,7 @@ clip算子限制给定输入的值在一个区间内。间隔使用参数"min"
 
 返回：        （Tensor）clip操作后的输出和输入（X）具有形状（shape）
 
-返回类型：        输出（Variable）。        
+返回类型：        输出（Variable）。
 
 **代码示例：**
 
@@ -2074,13 +2073,13 @@ clip_by_norm
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.clip_by_norm(x, max_norm, name=None)
-     
+
 ClipByNorm算子
 
 此算子将输入 ``X`` 的L2范数限制在 ``max_norm`` 内。如果 ``X`` 的L2范数小于或等于 ``max_norm``  ，则输出（Out）将与 ``X`` 相同。如果X的L2范数大于 ``max_norm`` ，则 ``X`` 将被线性缩放，使得输出（Out）的L2范数等于 ``max_norm`` ，如下面的公式所示：
 
-.. math:: 
-         Out = \frac{max\_norm * X}{norm(X)} 
+.. math::
+         Out = \frac{max\_norm * X}{norm(X)}
 
 其中， :math:`norm（X）` 代表 ``x`` 的L2范数。
 
@@ -2092,7 +2091,7 @@ ClipByNorm算子
 
 返回：        (Tensor)clip_by_norm操作后的输出和输入(X)具有形状(shape).
 
-返回类型：       Variable        
+返回类型：       Variable
 
 **代码示例：**
 
@@ -2205,7 +2204,7 @@ conv2d_transpose
 
 输入 :math:`X` 和输出 :math:`Out` 函数关系如下：
 
-.. math::			   
+.. math::
                         Out=\sigma (W*X+b)\\
 
 其中：
@@ -2218,34 +2217,34 @@ conv2d_transpose
     -  :math:`b` : 偏置（bias），二维张量，shape为 ``[M,1]``
 
     -  :math:`σ` : 激活函数
- 
+
     -  :math:`Out` : 输出值，Out和 ``X`` 的 ``shape`` 可能不一样
 
 **样例**：
 
 输入：
 
-.. math:: 
+.. math::
 
     输入张量的shape :  （N，C_{in}， H_{in}， W_{in})
 
-    滤波器（filter）shape ： （C_{in}, C_{out}, H_f, W_f)  
+    滤波器（filter）shape ： （C_{in}, C_{out}, H_f, W_f)
 
 输出：
-        
-.. math:: 
+
+.. math::
     输出张量的 shape ： （N，C_{out}, H_{out}, W_{out})
 
 其中
 
-.. math:: 
+.. math::
 
         & H'_{out} = (H_{in}-1)*strides[0]-2*paddings[0]+dilations[0]*(H_f-1)+1\\
         & W'_{out} = (W_{in}-1)*strides[1]-2*paddings[1]+dilations[1]*(W_f-1)+1 \\
         & H_{out}\in[H'_{out},H'_{out} + strides[0])\\
         & W_{out}\in[W'_{out},W'_{out} + strides[1])\\
 
-            
+
 
 参数:
 	- **input** （Variable）- 输入张量，格式为[N, C, H, W]
@@ -2273,11 +2272,11 @@ conv2d_transpose
 **代码示例**
 
 ..  code-block:: python
-  
+
     data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
     conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3)
-    
-  
+
+
 
 
 
@@ -2314,11 +2313,11 @@ conv3d
 **示例**
 
 - 输入：
-    输入shape： :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` 
+    输入shape： :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
 
-    滤波器shape： :math:`(C_{out}, C_{in}, D_f, H_f, W_f)` 
+    滤波器shape： :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
 - 输出：
-    输出shape： :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` 
+    输出shape： :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
 
 其中
 
@@ -2393,7 +2392,7 @@ conv3d_transpose
     -  :math:`b` : 偏置（bias），二维张量，shape为 ``[M,1]``
 
     -  :math:`σ` : 激活函数
- 
+
     -  :math:`Out` : 输出值， ``Out`` 和 ``X`` 的 shape可能不一样
 
 
@@ -2401,33 +2400,33 @@ conv3d_transpose
 
 输入:
 
-.. math::   
-	
+.. math::
+
 		Input shape: (N,C_{in},D_{in},H_{in},W_{in})
 
 		Filter shape: (C_{in},C_{out},D_f,H_f,W_f)
 
-	
+
 
 输出:
 
-.. math::   
-	
+.. math::
+
 		Output shape: (N,C_{out},D_{out},H_{out},W_{out})
 
-	
+
 其中：
 
-.. math::   
-		
+.. math::
+
 
 
 		D_{out}=(D_{in}-1)*strides[0]-2*paddings[0]+dilations[0]*(D_f-1)+1
-	
+
 		H_{out}=(H_{in}-1)*strides[1]-2*paddings[1]+dilations[1]*(H_f-1)+1
-	
+
 		W_{out}=(W_{in}-1)*strides[2]-2*paddings[2]+dilations[2]*(W_f-1)+1
-		
+
 
 
 参数:
@@ -2457,7 +2456,7 @@ conv3d_transpose
 **代码示例**
 
 ..  code-block:: python
-  
+
     data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
     conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2, filter_size=3)
 
@@ -2472,7 +2471,7 @@ conv3d_transpose
 
 .. _cn_api_fluid_layers_cos_sim:
 
-cos_sim 
+cos_sim
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.cos_sim(X, Y)
@@ -2514,14 +2513,14 @@ crf_decoding
 本函数实现了Viterbi算法，可以动态地寻找隐藏状态最可能的序列，该序列也被称为Viterbi路径（Viterbi path），从而得出的标注(tags)序列。
 
 这个运算的结果会随着 ``Label`` 参数的有无而改变：
-      
+
       1. ``Label`` 非None的情况，在实际训练中时常发生。此时本函数会协同 ``chunk_eval`` 工作。本函数会返回一行形为[N X 1]的向量，其中值为0的部分代表该label不适合作为对应结点的标注，值为1的部分则反之。此类型的输出可以直接作为 ``chunk_eval`` 算子的输入
-      
+
       2. 当没有 ``Label`` 时，该函数会执行标准decoding过程
 
 （没有 ``Label`` 时）该运算返回一个形为 [N X 1]的向量，其中元素取值范围为 0 ~ 最大标注个数-1，分别为预测出的标注（tag）所在的索引。
-	
-参数：	
+
+参数：
     - **input** (Variable)(LoDTensor，默认类型为 LoDTensor<float>) — 一个形为 [N x D] 的LoDTensor，其中 N 是mini-batch的大小，D是标注（tag) 的总数。 该输入是 ``linear_chain_crf`` 的 unscaled emission weight matrix （未标准化的发射权重矩阵）
     - **param_attr** (ParamAttr) — 参与训练的参数的属性
     - **label** (Variable)(LoDTensor，默认类型为 LoDTensor<int64_t>) —  形为[N x 1]的正确标注（ground truth）。 该项可选择传入。 有关该参数的更多信息，请详见上述描述
@@ -2588,7 +2587,7 @@ crop
             Out = [[1, 2, 5],
                    [3, 4, 6]].
 
- 
+
 参数:
   - **x** (Variable): 输入张量。
   - **shape** (Variable|list/tuple of integer) - 输出张量的形状由参数shape指定，它可以是一个变量/整数的列表/整数元组。如果是张量变量，它的秩必须与x相同。该方式适可用于每次迭代时候需要改变输出形状的情况。如果是整数列表/tupe，则其长度必须与x的秩相同
@@ -2635,28 +2634,28 @@ cross_entropy
 以及soft-label cross-entropy computation（软标签交叉熵损失计算）
 
   1. One-hot cross-entropy算法
-     
-     soft_label = False, Label[i, 0] 指明样本i的类别所具的索引:        
+
+     soft_label = False, Label[i, 0] 指明样本i的类别所具的索引:
                             .. math::
                                      \\Y[i]=-log(X[i,Label[i]])\\
-  
+
   2. Soft-label cross-entropy算法
-     
-     soft_label = True, Label[i, j] 表明样本i对应类别j的soft label(软标签):        
+
+     soft_label = True, Label[i, j] 表明样本i对应类别j的soft label(软标签):
                             .. math::
                                      \\Y[i]= \sum_{j}-Label[i,j]*log(X[i,j])\\
-                                     
+
      **请确保采用此算法时识别为各软标签的概率总和为1**
-  
+
   3. One-hot cross-entropy with vecterized label（使用向量化标签的One-hot）算法
-        
+
      作为 *2* 的特殊情况，当软类标签内部只有一个非零概率元素，且它的值为1，那么 *2* 算法降级为一种仅有one-hot标签的one-hot交叉熵
-  
-  
 
 
 
-参数：  
+
+
+参数：
     - **input** (Variable|list) – 一个形为[N x D]的二维tensor，其中N是batch大小，D是类别（class）数目。 这是由之前的operator计算出的概率，绝大多数情况下是由softmax operator得出的结果
     - **label** (Variable|list) – 一个二维tensor组成的正确标记的数据集(ground truth)。 当 ``soft_label`` 为False时，label为形为[N x 1]的tensor<int64>。 ``soft_label`` 为True时, label是形为 [N x D]的 tensor<float/double>
     - **soft_label** (bool) – 标志位，指明是否需要把给定的标签列表认定为软标签。默认为False。
@@ -2664,12 +2663,12 @@ cross_entropy
 
 返回： 一个形为[N x 1]的二维tensor，承载了交叉熵损失
 
-弹出异常： ``ValueError`` 
+弹出异常： ``ValueError``
 
                         1. 当 ``input`` 的第一维和 ``label`` 的第一维不相等时，弹出异常
                         2. 当 ``soft_label`` 值为True， 且 ``input`` 的第二维和 ``label`` 的第二维不相等时，弹出异常
                         3. 当 ``soft_label`` 值为False，且 ``label`` 的第二维不是1时，弹出异常
-                        
+
 
 
 **代码示例**
@@ -2741,16 +2740,16 @@ ctc_greedy_decoder
         - **input** (Variable) — (LoDTensor<float>)，变长序列的概率，它是一个具有LoD信息的二维张量。它的形状是[Lp, num_classes + 1]，其中Lp是所有输入序列长度的和，num_classes是真正的类别。(不包括空白标签)。
         - **blank** (int) — Connectionist Temporal Classification (CTC) loss空白标签索引,  属于半开区间[0,num_classes + 1）。
         - **name** (str) — 此层的名称。可选。
-   
+
 返回： CTC贪婪解码结果是一个形为(Lp,1)的二维张量，其中Lp是所有输出序列的长度之和。如果结果中的所有序列都为空，则输出LoDTensor 为[-1]，其中LoD[[]] 形为[1,1]。
 
 返回类型： 变量（Variable）
-    
+
 
 **代码示例**
 
 ..  code-block:: python
-        
+
     x = fluid.layers.data(name='x', shape=[8], dtype='float32')
 
     cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0)
@@ -2767,7 +2766,7 @@ data_norm
 .. py:function:: paddle.fluid.layers.data_norm(input, act=None, epsilon=1e-05, param_attr=None, data_layout='NCHW', in_place=False, name=None, moving_mean_name=None, moving_variance_name=None, do_model_average_for_mean_and_var=False)
 
 **数据正则化层**
-    
+
 可用作conv2d和fully_connected操作的正则化函数。 此层所需的数据格式为以下之一：
 
 1. NHWC [batch, in_height, in_width, in_channels]
@@ -2775,19 +2774,19 @@ data_norm
 
 :math:`input` 为一个mini-batch上的特征:
 
-.. math::       
+.. math::
         \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\
         \ mini-batch\ mean \\
         \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \
         \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\
         \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
         \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
-        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift  
+        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift
 
 参数:
   - **input** （variable） - 输入变量，它是一个LoDTensor。
   - **act** （string，默认None） - 激活函数类型，线性| relu | prelu | ...
-  - **epsilon** （float，默认1e-05） - 
+  - **epsilon** （float，默认1e-05） - 加在方差上的小值，用于避免归一化时除以零（对应上式中的 :math:`\epsilon` ）
   - **param_attr** （ParamAttr） - 参数比例的参数属性。
   - **data_layout** （string，默认NCHW） -  NCHW | NHWC
   - **in_place** （bool，默认值False） - 使data_norm的输入和输出复用同一块内存。
@@ -2795,7 +2794,7 @@ data_norm
   - **moving_mean_name** （string，Default None） - 存储全局Mean的moving_mean的名称。
   - **moving_variance_name** （string，默认None） - 存储全局Variance的moving_variance的名称。
   - **do_model_average_for_mean_and_var** （bool，默认值为false） - 是否为mean和variance进行模型平均。
-    
+
 返回: 张量变量，是对输入数据进行正则化后的结果。
 
 返回类型: Variable
@@ -2803,7 +2802,7 @@ data_norm
 **代码示例**
 
 ..  code-block:: python
-        
+
     data = fluid.layers.data(input=x, size=200, param_attr='fc1.w')
     hidden2 = fluid.layers.data_norm(input=hidden1)
 
@@ -2823,19 +2822,19 @@ dice_loss
 .. py:function:: paddle.fluid.layers.dice_loss(input, label, epsilon=1e-05)
 
 dice_loss是比较两批数据相似度，通常用于二值图像分割，即标签为二值。
-    
+
 dice_loss定义为:
 
-.. math::       
+.. math::
         dice\_loss &= 1- \frac{2 * intersection\_area}{total\_rea}\\
                    &= \frac{(total\_area−intersection\_area)−intersection\_area}{total\_area}\\
-                   &= \frac{union\_area−intersection\_area}{total\_area}           
+                   &= \frac{union\_area−intersection\_area}{total\_area}
 
 参数:
     - **input** (Variable) - rank>=2的预测。第一个维度是batch大小，最后一个维度是类编号。
     - **label** （Variable）- 与输入tensor rank相同的正确的标注数据（groud truth）。第一个维度是batch大小，最后一个维度是1。
     - **epsilon** (float) - 将会加到分子和分母上。如果输入和标签都为空，则确保dice为1。默认值:0.00001
-    
+
 返回: dice_loss shape为[1]。
 
 返回类型:  dice_loss(Variable)
@@ -2843,7 +2842,7 @@ dice_loss定义为:
 **代码示例**
 
 ..  code-block:: python
-        
+
 	predictions = fluid.layers.softmax(x)
     	loss = fluid.layers.dice_loss(input=predictions, label=label, 2)
 
@@ -2876,23 +2875,23 @@ dropout op可以从Program中删除，提高执行效率。
     - **is_test** (bool)-标记是否处于测试阶段
     - **seed** (int)-Python整型，用于创建随机种子。如果该参数设为None，则使用随机种子。注：如果给定一个整型种子，始终丢弃相同的输出单元。训练过程中勿用固定不变的种子。
     - **name** (str|None)-该层名称（可选）。如果设置为None,则自动为该层命名
-    - **dropout_implementation** (string) -   
+    - **dropout_implementation** (string) -
 
       [‘downgrade_in_infer’(default)|’upscale_in_train’] 其中:
 
-      1. downgrade_in_infer(default), 在预测时减小输出结果 
-
-         - train: out = input * mask 
+      1. downgrade_in_infer(default), 在预测时减小输出结果
 
+         - train: out = input * mask
+         
          - inference: out = input * (1.0 - dropout_prob) 
 
          (mask是一个张量，维度和输入维度相同，值为0或1，值为0的比例即为 ``dropout_prob`` )
-        
+
       2. upscale_in_train, 增加训练时的结果
 
          - train: out = input * mask / ( 1.0 - dropout_prob )
 
-         - inference: out = input 
+         - inference: out = input
 
          (mask是一个张量，维度和输入维度相同，值为0或1，值为0的比例即为 ``dropout_prob`` ）
 
@@ -2933,7 +2932,7 @@ dynamic_gru
 
 公式如下：
 
-.. math:: 
+.. math::
   u_{t}=act_g(W_{ux}x_{t}+W_{uh}h_{t-1}+b_{u})
 .. math::
   r_{t}=act_g(W_{rx}x_{t}+W_{rh}h_{t-1}+b_{r})
@@ -2969,7 +2968,7 @@ dynamic_gru
 参数:
   - **input** (Variable) – dynamic_gru层的输入, 支持variable time length input sequence（可变时长输入序列）。 本变量底层的tensor是一个(T×3D)矩阵， 其中T是该mini-batch中总时间步数， D是隐藏状态的规模（hidden size）。
   - **size** (int) – GRU cell的维度
-  - **param_attr** (ParamAttr|None)  –  可学习的隐藏层权重矩阵的参数属性。 
+  - **param_attr** (ParamAttr|None)  –  可学习的隐藏层权重矩阵的参数属性。
     注意：
                                     - 该矩阵为一个（T X 3D）矩阵。其中D为隐藏状态的规模（hidden size）
                                     - 该矩阵的所有元素由两部分组成。一是update gate和reset gate的权重，形为（D X 2D)，二是候选隐藏状态（candidate hidden state）的权重，形为 (D X D)
@@ -2979,8 +2978,8 @@ dynamic_gru
   - **gate_activation** (str) – update gate 和 reset gate的激励函数（activation）。 可选择[“sigmoid”, “tanh”, “relu”, “identity”]其一, 默认为 “sigmoid”
   - **candidate_activation** (str) – candidate hidden state（候选隐藏状态）计算所需的激励函数（activation）。 可从[“sigmoid”, “tanh”, “relu”, “identity”]中选择, 默认为 “tanh”
   - **h_0** (Variable) – 该函数参数为初始隐藏状态。若未赋值，则默认为0。它是一个 (N x D) tensor, 其中 N 为输入mini-batch的总时间步数， D 为 隐藏状态规模(hidden size)
-  
-  
+
+
 返回：	GRU的隐藏状态(hidden state)。形为（T X D），序列长度和输入相同。
 
 返回类型:	变量（variable）
@@ -3076,7 +3075,7 @@ W 代表了权重矩阵(weight matrix)，例如 :math:`W_{xi}` 是从输入门
   - **is_reverse** (bool) – （默认: False） 是否计算反LSTM(reversed LSTM)
   - **gate_activation** (str) – （默认: "sigmoid"）应用于input gate（输入门），forget gate（遗忘门）和 output gate（输出门）的激励函数（activation），默认为sigmoid
   - **cell_activation** (str) – （默认: tanh）用于神经元输出的激励函数(activation), 默认为tanh
-  - **candidate_activation** (str) – （默认: tanh）candidate hidden state（候选隐藏状态）的激励函数(activation), 默认为tanh 
+  - **candidate_activation** (str) – （默认: tanh）candidate hidden state（候选隐藏状态）的激励函数(activation), 默认为tanh
   - **dtype** (str) – 即 Data type（数据类型）。 可以选择 [“float32”, “float64”]，默认为“float32”
   - **name** (str|None) – 该层的命名，可选项。如果值为None, 将会自动对该层命名
 
@@ -3141,7 +3140,7 @@ LSTMP层(具有循环映射的LSTM)在LSTM层后有一个分离的映射层，
     - :math:`\tilde{c_t}` : 候选隐藏状态
     - :math:`\odot` : 向量的逐元素乘积
     - :math:`act_g` 和 :math:`act_h` : cell输入和cell输出激活函数，通常使用 :math:`tanh`
-    - :math:`\overline{act_h}` : 映射输出的激活函数，通常用 :math:`identity` 或等同的 :math:`act_h` 
+    - :math:`\overline{act_h}` : 映射输出的激活函数，通常用 :math:`identity` 或等同的 :math:`act_h`
 
 将 ``use_peepholes`` 设置为False，断开窥视孔连接（peephole connection）。在此省略公式，详情请参照论文 `LONG SHORT-TERM MEMORY <http://www.bioinf.jku.at/publications/older/2604.pdf>`_ 。
 
@@ -3169,9 +3168,9 @@ LSTMP层(具有循环映射的LSTM)在LSTM层后有一个分离的映射层，
         2.use_peepholes = True
             - Biases = { :math:`b_{c},b_{i},b_{f},b_{o},W_{ic},W_{fc},W_{oc}`}
             - 维度为（1*7D）
-        
+
         如果设置为None或者ParamAttr的一个属性，dynamic_lstm将创建ParamAttr为bias_attr。bias_attr的初始函数未设置，bias则初始化为0.默认：None。
-        
+
     - **use_peepholes** (bool) - 是否开启对角/窥视孔连接（diagonal/peephole connections），默认为True。
     - **is_reverse** (bool) - 是否计算反向LSTM，默认为False。
     - **gate_activation** (str) - 输入门（input gate）、遗忘门（forget gate）和输出门（output gate）的激活函数。Choices = [“sigmoid”，“tanh”，“relu”，“identity”]，默认“sigmoid”。
@@ -3185,11 +3184,6 @@ LSTMP层(具有循环映射的LSTM)在LSTM层后有一个分离的映射层，
     - **cell_clip** (float) - 如果提供该参数，则在单元输出激活之前，单元状态将被此值剪裁。 
     - **proj_clip** (float) - 如果 num_proj > 0 并且 proj_clip 被提供,那么将投影值沿元素方向剪切到[-proj_clip，proj_clip]内
 
-
-
-
-
-
 返回：含有两个输出变量的元组，隐藏状态（hidden state）的投影和LSTMP的cell状态。投影的shape为（T*P），cell state的shape为（T*D），两者的LoD和输入相同。
 
 返回类型：元组(tuple)
@@ -3289,12 +3283,12 @@ elementwise_add
 对于这个运算算子有2种情况：
         1. :math:`Y` 的形状（shape）与 :math:`X` 相同。
         2. :math:`Y` 的形状（shape）是 :math:`X` 的连续子序列。
-        
+
 对于情况2:
         1. 用 :math:`Y` 匹配 :math:`X` 的形状（shape），则 ``axis`` 为 :math:`Y` 传到 :math:`X` 上的起始维度索引。
         2. 如果 ``axis`` 为-1（默认值），则 :math:`axis= rank(X)-rank(Y)` 。
         3. 考虑到子序列， :math:`Y` 的大小为1的尾部尺寸将被忽略，例如shape（Y）=（2,1）=>（2）。
-        
+
 例如：
 
 ..  code-block:: python
@@ -3361,7 +3355,7 @@ elementwise_div
         shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
         shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
         shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
-       
+
 输入 :math:`X` 和 :math:`Y` 可以携带不同的LoD信息。但输出仅与输入 :math:`X` 共享LoD信息。
 
 参数：
@@ -3372,8 +3366,8 @@ elementwise_div
         - **name** （basestring | None）- 输出的名称。
 
 返回：        元素运算的输出。
-        
-        
+
+
 
 
 
@@ -3394,7 +3388,7 @@ elementwise_max
 
 .. math::
         Out = max(X, Y)
-        
+
 - :math:`X` ：任何尺寸的张量（Tensor）。
 - :math:`Y` ：尺寸必须小于或等于X尺寸的张量（Tensor）。
 
@@ -3417,7 +3411,7 @@ elementwise_max
         shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
         shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
         shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
-        
+
 输入X和Y可以携带不同的LoD信息。但输出仅与输入X共享LoD信息。
 
 参数：
@@ -3427,8 +3421,8 @@ elementwise_max
         - **act** （basestring | None）- 激活应用于输出。
         - **name** （basestring | None）- 输出的名称。
 
-返回：        元素运算的输出。        
-        
+返回：        元素运算的输出。
+
 
 
 
@@ -3451,7 +3445,7 @@ elementwise_min
 
 .. math::
         Out = min(X, Y)
-        
+
 - :math:`X` ：任何维数的张量（Tensor）。
 - :math:`Y` ：维数必须小于或等于X维数的张量（Tensor）。
 
@@ -3474,7 +3468,7 @@ elementwise_min
         shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
         shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
         shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
-        
+
 输入X和Y可以携带不同的LoD信息。但输出仅与输入X共享LoD信息。
 
 参数：
@@ -3484,9 +3478,9 @@ elementwise_min
         - **act** （basestring | None）- 激活应用于输出。
         - **name** （basestring | None）- 输出的名称。
 
-返回：        元素运算的输出。   
- 
- 
+返回：        元素运算的输出。
+
+
 
 
 
@@ -3508,7 +3502,7 @@ elementwise_mul
 
 .. math::
         Out = X \odot Y
-        
+
 - **X** ：任何尺寸的张量（Tensor）。
 - **Y** ：尺寸必须小于或等于X尺寸的张量（Tensor）。
 
@@ -3520,7 +3514,7 @@ elementwise_mul
         1. 用 :math:`Y` 匹配 :math:`X` 的形状（shape），其中 ``axis`` 将是 :math:`Y` 传到 :math:`X` 上的起始维度索引。
         2. 如果 ``axis`` 为-1（默认值），则 :math:`axis = rank（X）-rank（Y）` 。
         3. 考虑到子序列， :math:`Y` 的大小为1的尾随尺寸将被忽略，例如shape（Y）=（2,1）=>（2）。
-        
+
 例如：
 
 ..  code-block:: python
@@ -3531,7 +3525,7 @@ elementwise_mul
         shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
         shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
         shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
-        
+
 输入X和Y可以携带不同的LoD信息。但输出仅与输入X共享LoD信息。
 
 参数：
@@ -3541,8 +3535,8 @@ elementwise_mul
         - **act** （basestring | None）- 激活应用于输出。
         - **name** （basestring | None）- 输出的名称。
 
-返回：        元素运算的输出。        
-        
+返回：        元素运算的输出。
+
 
 
 
@@ -3564,7 +3558,7 @@ elementwise_pow
 
 .. math::
         Out = X ^ Y
-       
+
 - :math:`X` ：任何维数的张量（Tensor）。
 - :math:`Y` ：维数必须小于或等于X维数的张量（Tensor）。
 
@@ -3587,7 +3581,7 @@ elementwise_pow
         shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
         shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
         shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
-        
+
 输入X和Y可以携带不同的LoD信息。但输出仅与输入X共享LoD信息。
 
 参数：
@@ -3597,8 +3591,8 @@ elementwise_pow
         - **act** （basestring | None）- 激活应用于输出。
         - **name** （basestring | None）- 输出的名称。
 
-返回：        元素运算的输出。   
-        
+返回：        元素运算的输出。
+
 
 
 
@@ -3621,7 +3615,7 @@ elementwise_sub
 
 .. math::
        Out = X - Y
-        
+
 - **X** ：任何尺寸的张量（Tensor）。
 - **Y** ：尺寸必须小于或等于 **X** 尺寸的张量（Tensor）。
 
@@ -3633,7 +3627,7 @@ elementwise_sub
         1. 用 :math:`Y` 匹配 :math:`X` 的形状（shape），其中 ``axis`` 将是 :math:`Y` 传到 :math:`X` 上的起始维度索引。
         2. 如果 ``axis`` 为-1（默认值），则 :math:`axis = rank（X）-rank（Y）` 。
         3. 考虑到子序列， :math:`Y` 的大小为1的尾随尺寸将被忽略，例如shape（Y）=（2,1）=>（2）。
-        
+
 例如：
 
 ..  code-block:: python
@@ -3644,7 +3638,7 @@ elementwise_sub
         shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
         shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
         shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
-        
+
 输入X和Y可以携带不同的LoD信息。但输出仅与输入X共享LoD信息。
 
 参数：
@@ -3655,7 +3649,7 @@ elementwise_sub
         - **name** （basestring | None）- 输出的名称。
 
 返回：        元素运算的输出。
-        
+
 
 
 
@@ -3674,8 +3668,8 @@ elu
 ELU激活层（ELU Activation Operator）
 
 根据 https://arxiv.org/abs/1511.07289 对输入张量中每个元素应用以下计算。
-    
-.. math::      
+
+.. math::
         \\out=max(0,x)+min(0,α∗(e^{x}−1))\\
 
 参数:
@@ -3767,11 +3761,11 @@ expand运算会按给定的次数对输入各维度进行复制（tile）运算
                     [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
                     [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
                 ]
- 
+
 参数:
         - **x** (Variable)- 一个秩在[1, 6]范围中的张量（Tensor）.
         - **expand_times** (list|tuple) - 每一个维度要扩展的次数.
-        
+
 返回：     expand变量是LoDTensor。expand运算后，输出（Out）的每个维度的大小等于输入（X）的相应维度的大小乘以 ``expand_times`` 给出的相应值。
 
 返回类型：   变量（Variable）
@@ -3782,8 +3776,8 @@ expand运算会按给定的次数对输入各维度进行复制（tile）运算
 
         x = fluid.layers.data(name='x', shape=[10], dtype='float32')
         out = fluid.layers.expand(x=x, expand_times=[1, 2, 2])
-               
-               
+
+
 
 
 
@@ -3901,16 +3895,16 @@ flatten
 .. code-block:: text
 
     Case 1:
-      
+
       给定
         X.shape = (3, 100, 100, 4)
       且
         axis = 2
       得到:
         Out.shape = (3 * 100, 4 * 100)
-    
+
     Case 2:
-      
+
       给定
         X.shape = (3, 100, 100, 4)
       且
@@ -3968,7 +3962,7 @@ fsp_matrix
 **代码示例**
 
 ..  code-block:: python
-        
+
     feature_map_0 = fluid.layers.conv2d(x)
     feature_map_1 = fluid.layers.conv2d(feature_map_0)
     loss = fluid.layers.fsp_matrix(feature_map_0, feature_map_1)
@@ -4009,13 +4003,13 @@ gather
 参数:
         - **input** (Variable) - input 的rank >= 1。
         - **index** (Variable) - index的rank = 1。
-    
+
 返回：	output (Variable)
 
 **代码示例**
 
 ..  code-block:: python
-        
+
 	output = fluid.layers.gather(x, index)
 
 
@@ -4052,7 +4046,7 @@ gaussian_random算子。
 
 .. code-block:: python
 
-    out = fluid.layers.gaussian_random(shape=[20, 30])       
+    out = fluid.layers.gaussian_random(shape=[20, 30])
 
 
 
@@ -4161,7 +4155,7 @@ step 2：
       |          d_s          |
       |           |           |
       ws ------- y_s ------- es
-    
+
     x_w = floor(x)              // west side x coord
     x_e = x_w + 1               // east side x coord
     y_n = floor(y)              // north side y coord
@@ -4174,7 +4168,7 @@ step 2：
     en = X[:, :, y_n, x_e]      // north-east point value
     ws = X[:, :, y_s, x_w]      // south-west point value
     es = X[:, :, y_s, x_e]      // south-east point value
-    
+
 
     output = wn * d_e * d_s + en * d_w * d_s
            + ws * d_e * d_n + es * d_w * d_n
@@ -4282,8 +4276,8 @@ GRU单元的输入包括 :math:`z_t` ， :math:`h_{t-1}` 。在上述等式中
 :math:`u_t` 和 :math:`r_t` 分别代表了GRU神经元的update gates（更新门）和reset gates(重置门)。
 和LSTM不同，GRU少了一个门（它没有LSTM的forget gate）。但是它有一个叫做中间候选隐藏状态（intermediate candidate hidden output）的输出，
 记为 :math:`m_t` 。 该层有三个输出： :math:`h_t, dot(r_t,h_{t-1})` 以及 :math:`u_t，r_t，m_t` 的连结(concatenation)。
- 
- 
+
+
 
 
 参数:
@@ -4298,7 +4292,7 @@ GRU单元的输入包括 :math:`z_t` ， :math:`h_{t-1}` 。在上述等式中
   - **bias_attr** (ParamAttr|bool|None) - GRU的bias变量的参数属性。形为 :math:`(1x3D)` 的bias连结（concatenate）在update gates（更新门），reset gates(重置门)以及candidate calculations（候选隐藏状态计算）中的bias。如果值为False，那么上述三者将没有bias参与运算。若值为None或者 ``ParamAttr`` 类中的属性之一，gru_unit则会创建一个 ``ParamAttr`` 类的对象作为 bias_attr。如果bias_attr没有被初始化，那它会被默认初始化为0。默认值为None。
   - **activation** (string) –  神经元 “actNode” 的激励函数（activation）类型。默认类型为‘tanh’
   - **gate_activation** (string) – 门 “actGate” 的激励函数（activation）类型。 默认类型为 ‘sigmoid’
-  
+
 
 返回：	 hidden value（隐藏状态的值），reset-hidden value(重置隐藏状态值)，gate values(门值)
 
@@ -4338,10 +4332,10 @@ HardSigmoid激活算子。
 
 sigmoid的分段线性逼近(https://arxiv.org/abs/1603.00391)，比sigmoid快得多。
 
-.. math::   
+.. math::
 
       \\out=\max(0,\min(1,slope∗x+shift))\\
- 
+
 斜率是正数。偏移量可正可负的。斜率和位移的默认值是根据上面的参考设置的。建议使用默认值。
 
 参数：
@@ -4445,7 +4439,7 @@ hsigmoid
 
 .. py:function:: paddle.fluid.layers.hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None, name=None, path_table=None, path_code=None, is_custom=False, is_sparse=False)
 
-层次sigmod（ hierarchical sigmoid ）加速语言模型的训练过程。这个operator将类别组织成一个完全二叉树，也可以使用 ``is_custom`` 参数来传入自定义的树结构来实现层次化。 
+层次sigmoid（ hierarchical sigmoid ）加速语言模型的训练过程。这个operator将类别组织成一个完全二叉树，也可以使用 ``is_custom`` 参数来传入自定义的树结构来实现层次化。
 
 树中每个叶节点表示一个类(一个单词)，每个内部节点进行一个二分类。对于每个单词，都有一个从根到它的叶子节点的唯一路径，hsigmoid计算路径上每个内部节点的损失（cost），并将它们相加得到总损失（cost）。
 
@@ -4465,25 +4459,25 @@ hsigmoid可以把时间复杂度 :math:`O(N)` 优化到 :math:`O(logN)` ,其中
 
 参数:
     - **input** (Variable) - 输入张量，shape为 ``[N×D]`` ,其中 ``N`` 是minibatch的大小，D是特征大小。
-    - **label** (Variable) - 训练数据的标签。该tensor的shape为 ``[N×1]``   
+    - **label** (Variable) - 训练数据的标签。该tensor的shape为 ``[N×1]``
     - **num_classes** (int) - 类别的数量不能少于2。若使用默认树结构，该参数必须用户设置。当 ``is_custom=False`` 时，该项绝不能为None。反之，如果 ``is_custom=True`` ，它取值应为非叶节点的个数，来指明二分类实用的类别数目。
     - **param_attr** (ParamAttr|None) - 可学习参数/ hsigmoid权重的参数属性。如果将其设置为ParamAttr的一个属性或None，则将ParamAttr设置为param_attr。如果没有设置param_attr的初始化器，那么使用Xavier初始化。默认值:None。
     - **bias_attr** (ParamAttr|bool|None) - hsigmoid偏置的参数属性。如果设置为False，则不会向输出添加偏置。如果将其设置ParamAttr的一个属性或None，则将ParamAttr设置为bias_attr。如果没有设置bias_attr的初始化器，偏置将初始化为零。默认值:None。
     - **name** (str|None) - 该layer的名称(可选)。如果设置为None，该层将被自动命名。默认值:None。
     - **path_table** (Variable|None) – 存储每一批样本从词到根节点的路径。路径应为从叶至根方向。 ``path_table`` 和 ``path_code`` 应具有相同的形, 对于每个样本 i ，path_table[i]为一个类似np.array的结构，该数组内的每个元素都是其双亲结点权重矩阵的索引
     - **path_code** (Variable|None) – 存储每批样本的路径编码，仍然是按从叶至根方向。各样本路径编码批都由其各祖先结点的路径编码组成
-    - **is_custom** (bool|False) – 使用用户自定义二叉树取代默认二叉树结构，如果该项为真， 请务必设置 ``path_table`` , ``path_code`` , ``num_classes`` , 否则就需要设置 num_classes 
+    - **is_custom** (bool|False) – 使用用户自定义二叉树取代默认二叉树结构，如果该项为真， 请务必设置 ``path_table`` , ``path_code`` , ``num_classes`` , 否则就需要设置 num_classes
     - **is_sparse** (bool|False) – 使用稀疏更新方式，而非密集更新。如果为真， W的梯度和输入梯度将会变得稀疏
 
 返回:  (LoDTensor) 层次sigmoid（ hierarchical sigmoid） 。shape[N, 1]
-    
+
 返回类型:  Out
 
 
 **代码示例**
 
 ..  code-block:: python
-        
+
 	x = fluid.layers.data(name='x', shape=[2], dtype='float32')
     	y = fluid.layers.data(name='y', shape=[1], dtype='int64')
     	out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6)
@@ -4502,12 +4496,12 @@ Huber损失是更具鲁棒性的损失函数。 huber损失可以评估输入对
 
 当输入和标签之间的距离大于delta时:
 
-.. math:: 
+.. math::
         huber\_loss = delta * (label - input) - 0.5 * delta * delta
 
 当输入和标签之间的距离小于delta时:
 
-.. math:: 
+.. math::
         huber\_loss = 0.5 * (label - input) * (label - input)
 
 
@@ -4543,7 +4537,7 @@ im2sequence
 从输入张量中提取图像张量，与im2col相似，shape={input.batch_size * output_height * output_width, filter_size_H * filter_size_W * input.channels}。这个op使用filter / kernel扫描图像并将这些图像转换成序列。一个图片展开后的timestep的个数为output_height * output_width，其中output_height和output_width由下式计算:
 
 
-.. math:: 
+.. math::
                         output\_size=1+\frac{(2∗padding+img\_size−block\_size+stride-1)}{stride}
 
 每个timestep的维度为 :math:`block\_y * block\_x * input.channels` 。
@@ -4608,7 +4602,7 @@ im2sequence
 **代码示例**
 
 ..  code-block:: python
-  
+
     output = fluid.layers.im2sequence(
     input=layer, stride=[1, 1], filter_size=[2, 2])
 
@@ -4629,10 +4623,10 @@ image_resize
 .. py:function:: paddle.fluid.layers.image_resize(input, out_shape=None, scale=None, name=None, resample='BILINEAR', actual_shape=None, align_corners=True, align_mode=1)
 
 调整一个batch中图片的大小。
-    
+
 输入张量的shape为(num_batch, channels, in_h, in_w)，并且调整大小只适用于最后两个维度(高度和宽度)。
-    
-支持重新取样方法: 
+
+支持重新取样方法:
 
     BILINEAR：双线性插值
 
@@ -4650,18 +4644,18 @@ Align_corners和align_mode是可选参数，插值的计算方法可以由它们
 ::
 
       For scale:
-      
+
         if align_corners = True && out_size > 1 :
 
           scale_factor = (in_size-1.0)/(out_size-1.0)
-        
+
         else:
-          
+
           scale_factor = float(in_size/out_size)
-        
-      
+
+
       Nearest neighbor interpolation:
-      
+
       if:
           align_corners = False
 
@@ -4684,16 +4678,16 @@ Align_corners和align_mode是可选参数，插值的计算方法可以由它们
 
       if:
           align_corners = False , align_mode = 0
-          
+
           input : (N,C,H_in,W_in)
           output: (N,C,H_out,W_out) where:
-          
+
           H_out = (H_{in}+0.5) * scale_{factor} - 0.5
           W_out = (W_{in}+0.5) * scale_{factor} - 0.5
 
 
       else:
-       
+
           input : (N,C,H_in,W_in)
           output: (N,C,H_out,W_out) where:
 
@@ -4734,9 +4728,9 @@ https://en.wikipedia.org/wiki/Bilinear_interpolation。
 **代码示例**
 
 ..  code-block:: python
-        
-	out = fluid.layers.image_resize(input, out_shape=[12, 12], resample="NEAREST") 
-  
+
+	out = fluid.layers.image_resize(input, out_shape=[12, 12], resample="NEAREST")
+
 
 
 
@@ -4761,7 +4755,7 @@ image_resize_short
         - **input** (Variable) -  图像调整图层的输入张量，这是一个4维的形状张量(num_batch, channels, in_h, in_w)。
         - **out_short_len** (int) -  输出图像的短边长度。
         - **resample** (str) - resample方法，默认为双线性插值。
-    
+
 返回：	4维张量，shape为(num_batch, channels, out_h, out_w)
 
 返回类型:	变量（variable）
@@ -4839,11 +4833,11 @@ L2正则（L2 normalize Layer）
     - **axis** (int)-运用归一化的轴。如果轴小于0，归一化的维是rank(X)+axis。-1是最后维
     - **epsilon** (float)-epsilon用于避免分母为0，默认值为1e-10
     - **name** (str|None)-该层名称（可选）。如果设为空，则自动为该层命名
-    
+
     返回：输出张量，同x的维度一致
-    
+
     返回类型：变量
-    
+
 **代码示例**：
 
 .. code-block:: python
@@ -4919,8 +4913,8 @@ layer_norm
 
 假设特征向量存在于维度 ``begin_norm_axis ... rank (input）`` 上，计算大小为 ``H`` 的特征向量a在该维度上的矩统计量，然后使用相应的统计量对每个特征向量进行归一化。 之后，如果设置了 ``scale`` 和 ``shift`` ，则在标准化的张量上应用可学习的增益和偏差以进行缩放和移位。
 
-请参考 `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_ 
-            
+请参考 `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
+
 公式如下
 
 .. math::
@@ -4929,7 +4923,7 @@ layer_norm
             \\\sigma=\sqrt{\frac{1}{H}\sum_i^H{(a_i-\mu)^2}}\\
 .. math::
              \\h=f(\frac{g}{\sigma}(a-\mu) + b)\\
-             
+
 - :math:`a` : 该层神经元输入总和的向量表示
 - :math:`H` : 层中隐藏的神经元个数
 - :math:`g` : 可训练的缩放因子参数
@@ -4947,12 +4941,12 @@ layer_norm
   - **act** （str） - 激活函数。默认 None
   - **name** （str） - 该层的名称， 可选的。默认为None，将自动生成唯一名称。
 
-返回： 标准化后的结果   
+返回： 标准化后的结果
 
 **代码示例**
 
 ..  code-block:: python
-    
+
    data = fluid.layers.data(name='data', shape=[3, 32, 32],
                                            dtype='float32')
    x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
@@ -5032,7 +5026,7 @@ linear_chain_crf
 
 	5.Label用 :math:`s` 表示
 
-	
+
 
 
 **注意：**
@@ -5052,7 +5046,7 @@ linear_chain_crf
 返回：
     output(Variable，Tensor，默认float类型Tensor)：shape为[N*D]的二维张量。Emission的指数。这是前向计算中的中间计算结果，在后向计算中还会复用
 
-    output(Variable，Tensor，默认float类型Tensor)：shape为[(D+2)*D]的二维张量。Transition的指数。这是前向计算中的中间计算结果，在后向计算中还会复用 
+    output(Variable，Tensor，默认float类型Tensor)：shape为[(D+2)*D]的二维张量。Transition的指数。这是前向计算中的中间计算结果，在后向计算中还会复用
 
     output(Variable,Tensor，默认float类型Tensor)：mini-batch每个训练样本的条件概率的对数。这是一个shape为[S*1]的二维张量，S是mini-batch的序列数。注：S等同于mini-batch的序列数。输出不再是LoDTensor
 
@@ -5168,7 +5162,7 @@ log
 
 .. math::
                   \\Out=ln(x)\\
- 
+
 
 参数:
   - **x** (Variable) – 输入张量
@@ -5259,11 +5253,11 @@ logical_and算子
         - **out** （Tensor）- 输出逻辑运算的张量。
         - **name** （basestring | None）- 输出的名称。
 
-返回：        (LoDTensor)n-dim bool张量。每个元素的计算公式： :math:`Out = X \&\& Y` 
-        
-返回类型：        输出（Variable）。        
-        
-        
+返回：        (LoDTensor)n-dim bool张量。每个元素的计算公式： :math:`Out = X \&\& Y`
+
+返回类型：        输出（Variable）。
+
+
 **代码示例：**
 
 .. code-block:: python
@@ -5293,7 +5287,7 @@ logical_not算子
 
 它在X上以元素方式操作，并返回Out。X和Out是N维布尔张量（Tensor）。Out的每个元素的计算公式为：
 
-.. math:: 
+.. math::
         Out = !X
 
 参数：
@@ -5303,13 +5297,13 @@ logical_not算子
 
 返回：        (LoDTensor)n维布尔张量。
 
-返回类型：        输出（Variable）。        
+返回类型：        输出（Variable）。
 
 
 **代码示例：**
 
 .. code-block:: python
-    
+
     left = fluid.layers.data(
         name='left', shape=[1], dtype='int32')
     result = fluid.layers.logical_not(x=left)
@@ -5331,7 +5325,7 @@ logical_or算子
 
 它在X和Y上以元素方式操作，并返回Out。X、Y和Out是N维布尔张量（Tensor）。Out的每个元素的计算公式为：
 
-.. math:: 
+.. math::
         Out = X || Y
 
 参数：
@@ -5340,9 +5334,9 @@ logical_or算子
         - **out** （Tensor）- 输出逻辑运算的张量。
         - **name** （basestring | None）- 输出的名称。
 
-返回：        (LoDTensor)n维布尔张量。每个元素的计算公式： :math:`Out = X || Y` 
-        
-返回类型：        输出（Variable）。        
+返回：        (LoDTensor)n维布尔张量。每个元素的计算公式： :math:`Out = X || Y`
+
+返回类型：        输出（Variable）。
 
 
 
@@ -5373,7 +5367,7 @@ logical_xor算子
 
 它在X和Y上以元素方式操作，并返回Out。X、Y和Out是N维布尔张量（Tensor）。Out的每个元素的计算公式为：
 
-.. math:: 
+.. math::
         Out = (X || Y) \&\& !(X \&\& Y)
 
 参数：
@@ -5383,8 +5377,8 @@ logical_xor算子
         - **name** （basestring | None）- 输出的名称。
 
 返回：        (LoDTensor)n维布尔张量。
-       
-返回类型：        输出（Variable）。        
+
+返回类型：        输出（Variable）。
 
 
 
@@ -5418,7 +5412,7 @@ lrn
 
 .. math::
 
-    Output(i,x,y) = Input(i,x,y)/\left ( k+\alpha \sum_{j=max(0,c-n/2)}^{min(C,c+n/2)}(Input(j,x,y))^2 \right )^\beta 
+    Output(i,x,y) = Input(i,x,y)/\left ( k+\alpha \sum_{j=max(0,c-n/2)}^{min(C,c+n/2)}(Input(j,x,y))^2 \right )^\beta
 
 在以上公式中：
   - :math:`n` ：累加的通道数
@@ -5636,9 +5630,9 @@ margin rank loss（差距排序损失）层。在排序问题中，它可以比
 
 返回类型:	变量（Variable）
 
-抛出异常: 
+抛出异常:
   - ``ValueError`` - ``label`` , ``left`` , ``right`` 有一者不为Variable类型时，抛出此异常
- 
+
 **代码示例**
 
 ..  code-block:: python
@@ -5736,7 +5730,7 @@ maxout
 
 假设输入形状为(N, Ci, H, W)，输出形状为(N, Co, H, W)，则 :math:`Co=Ci/groups` 运算公式如下:
 
-.. math:: 
+.. math::
 
 	y_{si+j} &= \max_k x_{gsi + sk + j} \\
 	g &= groups \\
@@ -5774,18 +5768,18 @@ mean
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.mean(x, name=None)
-       
+
 mean算子计算X中所有元素的平均值
-     
+
 参数：
         - **x** (Variable)- (Tensor) 均值运算的输入。
         - **name** (basestring | None)- 输出的名称。
 
 返回：       均值运算输出张量（Tensor）
-       
+
 返回类型：        Variable
-        
-        
+
+
 
 
 
@@ -5802,18 +5796,18 @@ mean_iou
 .. py:function:: paddle.fluid.layers.mean_iou(input, label, num_classes)
 
 均值IOU（Mean  Intersection-Over-Union）是语义图像分割中的常用的评价指标之一，它首先计算每个语义类的IOU，然后计算类之间的平均值。定义如下:
-      
-.. math::   
+
+.. math::
 
     IOU = \frac{true\_positive}{true\_positive+false\_positive+false\_negative}
-          
+
 在一个confusion矩阵中累积得到预测值，然后从中计算均值-IOU。
 
 参数:
     - **input** (Variable) - 类型为int32或int64的语义标签的预测结果张量。
     - **label** (Variable) - int32或int64类型的真实label张量。它的shape应该与输入相同。
     - **num_classes** (int) - 标签可能的类别数目。
-    
+
 返回: 返回三个变量:
 
 - mean_iou: 张量，形为[1]， 代表均值IOU。
@@ -5848,18 +5842,18 @@ merge_selected_rows
 该运算用于合并（值相加）输入张量中重复的行。输出行没有重复的行，并且按值从小到大顺序重新对行排序。
 
 ::
-    
+
     例如：
-          
-          输入: 
+
+          输入:
                X.rows = [0, 5, 5, 4, 19]
                X.height = 20
                X.value = [[1, 1] [2, 2] [3, 3] [4, 4] [6, 6]]
-          
-          
+
+
           输出：
-               Out.row is [0, 4, 5, 19] 
-               Out.height is 20 
+               Out.row is [0, 4, 5, 19]
+               Out.height is 20
                Out.value is: [[1, 1] [4, 4] [5, 5] [6, 6]]
 
 
@@ -5868,7 +5862,7 @@ merge_selected_rows
   - x (Variable) – 输入类型为SelectedRows, 选中行有可能重复
   - name (basestring|None) – 输出变量的命名
 
-返回: 输出类型为SelectedRows，并且选中行不会重复 
+返回: 输出类型为SelectedRows，并且选中行不会重复
 
 返回类型:	变量（Variable）
 
@@ -5888,12 +5882,12 @@ mul
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None)
-        
+
 mul算子
 此运算是用于对输入X和Y执行矩阵乘法。
 等式是：
 
-.. math:: 
+.. math::
         Out = X * Y
 
 输入X和Y都可以携带LoD（详细程度）信息。但输出仅与输入X共享LoD信息。
@@ -5906,10 +5900,10 @@ mul算子
         - **name** (basestring | None)- 输出的名称。
 
 返回：       乘法运算输出张量（Tensor）.
-       
-返回类型：    输出(Variable)。       
-        
-        
+
+返回类型：    输出(Variable)。
+
+
 
 
 
@@ -5925,18 +5919,18 @@ multiplex
 
 .. py:function:: paddle.fluid.layers.multiplex(inputs, index)
 
-引用给定的索引变量，该层从输入变量中选择行构造Multiplex变量。 
+引用给定的索引变量，该层从输入变量中选择行构造Multiplex变量。
 
-假设有 :math:`m` 个输入变量，:math:`I_{i}` 代表第i个输入变量，而且 :math:`i` is in :math:`[0,m)` 。 
+假设有 :math:`m` 个输入变量， :math:`I_{i}` 代表第i个输入变量，且 :math:`i` 的取值范围为 :math:`[0,m)` 。
 
-所有输入变量都是具有相同形状的张量 :math:`[d_0,d_1, ... ,d_R]` 。 
+所有输入变量都是具有相同形状的张量 :math:`[d_0,d_1, ... ,d_R]` 。
 
-请注意，输入张量的秩应至少为2。每个输入变量将被视为形状为 :math:`[M，N]` 的二维矩阵，其中 :math:`M` 表示 :math:`d0` ，N表示 :math:`d_1 * d_2 * ... * d_R` 。 
+请注意，输入张量的秩应至少为2。每个输入变量将被视为形状为 :math:`[M, N]` 的二维矩阵，其中 :math:`M` 表示 :math:`d_0` ，N表示 :math:`d_1 * d_2 * ... * d_R` 。
 
-设 :math:`I_{i}[j]` 为第i个输入变量的第j行。 给定的索引变量是具有形状[M，1]的2-D张量。 设 :math:`ID[i]` 为索引变量的第i个索引值。 然后输出变量将是一个形状为 :math:`[d_0,d_1, ... ,d_R]` 的张量。 
+设 :math:`I_{i}[j]` 为第i个输入变量的第j行。 给定的索引变量是具有形状[M，1]的2-D张量。 设 :math:`ID[i]` 为索引变量的第i个索引值。 然后输出变量将是一个形状为 :math:`[d_0,d_1, ... ,d_R]` 的张量。
+
+如果将输出张量视为具有形状[M，N]的2-D矩阵,并且令O[i]为矩阵的第i行，则O[i]等于 :math:`I_{ID[i]}[i]`
 
-如果将输出张量视为具有形状[M，N]的2-D矩阵,并且令O[i]为矩阵的第i行，则O[i]等于 :math:`I_{ID}[i][i]` 
-  
 - Ids: 索引张量
 - X[0 : N - 1]: 输出的候选张量(N >= 2).
 - 对于从 0 到 batchSize-1 的每个索引i，输出是第（Ids [i]）  张量的第i行
@@ -5997,7 +5991,7 @@ multiplex
 ..  code-block:: python
 
    import paddle.fluid as fluid
-   
+
    x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
    x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
    index = fluid.layers.data(name='index', shape=[1], dtype='int32')
@@ -6019,8 +6013,8 @@ nce
 .. py:function:: paddle.fluid.layers.nce(input, label, num_total_classes, sample_weight=None, param_attr=None, bias_attr=None, num_neg_samples=None, name=None, sampler='uniform', custom_dist=None, seed=0, is_sparse=False)
 
 计算并返回噪音对比估计（ noise-contrastive estimation training loss）。
-`请参考 See Noise-contrastive estimation: A new estimation principle for unnormalized statistical models 
-<http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_ 
+`请参考 See Noise-contrastive estimation: A new estimation principle for unnormalized statistical models
+<http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_
 该operator默认使用均匀分布进行抽样。
 
 参数:
@@ -6094,8 +6088,8 @@ npair_loss
 NPair损失需要成对的数据。NPair损失分为两部分：第一部分是嵌入向量上的L2正则化器；第二部分是以anchor的相似矩阵和正的相似矩阵为逻辑的交叉熵损失。
 
 参数:
-    - **anchor** (Variable) -  嵌入锚定图像的向量。尺寸=[batch_size, embedding_dims] 
-    - **positive** (Variable) -  嵌入正图像的向量。尺寸=[batch_size, embedding_dims] 
+    - **anchor** (Variable) -  嵌入锚定图像的向量。尺寸=[batch_size, embedding_dims]
+    - **positive** (Variable) -  嵌入正图像的向量。尺寸=[batch_size, embedding_dims]
     - **labels** (Variable) - 1维张量，尺寸=[batch_size]
     - **l2_reg** (float32) - 嵌入向量的L2正则化项，默认值：0.002
 
@@ -6105,7 +6099,7 @@ NPair损失需要成对的数据。NPair损失分为两部分：第一部分是
 
 **代码示例**：
 
-.. code-block:: python 
+.. code-block:: python
 
     anchor = fluid.layers.data(
               name = 'anchor', shape = [18, 6], dtype = 'float32', append_batch_size=False)
@@ -6123,7 +6117,7 @@ NPair损失需要成对的数据。NPair损失分为两部分：第一部分是
 
 .. _cn_api_fluid_layers_one_hot:
 
-one_hot 
+one_hot
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.one_hot(input, depth)
@@ -6140,7 +6134,7 @@ one_hot
 
 **代码示例**：
 
-.. code-block:: python 
+.. code-block:: python
 
     label = fluid.layers.data(name="label", shape=[1], dtype="float32")
     one_hot_label = fluid.layers.one_hot(input=label, depth=10)
@@ -6162,7 +6156,7 @@ pad
 
 在张量上加上一个由 ``pad_value`` 给出的常数值，填充宽度由 ``paddings`` 指定。
 其中，维度 ``i`` 中 ``x`` 内容前填充的值个数用 ``paddings[i]`` 表示，维度 ``i`` 中 ``x`` 内容后填充的值个数用 ``paddings[i+1]`` 表示。
-   
+
 一个例子:
 
 ::
@@ -6191,12 +6185,12 @@ pad
 返回：	填充后的张量变量
 
 返回类型： 变量（Variable）
-    
+
 
 **代码示例**
 
 ..  code-block:: python
-        
+
     out = fluid.layers.pad(
     x=x, paddings=[0, 1, 1, 2], pad_value=0.)
 
@@ -6226,7 +6220,7 @@ pad2d
 
       X = [[1, 2, 3],
            [4, 5, 6]]
-     
+
      Case 0:
         paddings = [0, 1, 2, 3],
         mode = 'constant'
@@ -6234,14 +6228,14 @@ pad2d
         Out = [[0, 0, 1, 2, 3, 0, 0, 0]
                [0, 0, 4, 5, 6, 0, 0, 0]
                [0, 0, 0, 0, 0, 0, 0, 0]]
-     
+
      Case 1:
         paddings = [0, 1, 2, 1],
         mode = 'reflect'
         Out = [[3, 2, 1, 2, 3, 2]
                [6, 5, 4, 5, 6, 5]
                [3, 2, 1, 2, 3, 2]]
-     
+
      Case 2:
         paddings = [0, 1, 2, 1],
         mode = 'edge'
@@ -6312,7 +6306,6 @@ pad_constant_like
               [[41, 42, 43]]]]
         Y.shape = (1, 3, 1, 3)
 
-
 参数：
           - **x** （Variable）- 输入Tensor变量。
           - **y** （Variable）- 输出Tensor变量。
@@ -6488,7 +6481,7 @@ pooling3d操作根据input，pool_type，pool_size，strides和paddings参数计
 
 例如，
 
-输入X形为 :math:`(N, C, D_{in}, H_{in}, W_{in})` ，输出形为 :math:`(N, C, D_{out}, H_{out}, W_{out})` 
+输入X形为 :math:`(N, C, D_{in}, H_{in}, W_{in})` ，输出形为 :math:`(N, C, D_{out}, H_{out}, W_{out})`
 
 当ceil_mode = false时，
 
@@ -6627,7 +6620,7 @@ prelu
 返回： 输出Tensor与输入shape相同。
 
 返回类型：  变量（Variable）
-  
+
 **代码示例：**
 
 .. code-block:: python
@@ -6685,7 +6678,7 @@ PyFunc运算。
 
 在调用此函数之前，应正确设置 ``out`` 的数据类型和形状。 但是，``out`` 和 ``x`` 对应梯度的数据类型和形状将自动推断而出。
 
-``backward_func`` 的输入顺序为：前向输入x，前向输出 ``out`` 和反向输入 ``out`` 的梯度。 如果 ``out`` 的某些变量没有梯度，则输入张量在Python端将为None。 
+``backward_func`` 的输入顺序为：前向输入x，前向输出 ``out`` 和反向输入 ``out`` 的梯度。 如果 ``out`` 的某些变量没有梯度，则输入张量在Python端将为None。
 
 如果in的某些变量没有梯度，则用户应返回None。
 
@@ -6853,7 +6846,7 @@ reduce_max
 返回：  运算、减少维度之后的Tensor变量。
 
 返回类型：  变量（Variable）
-          
+
 **代码示例**
 
 ..  code-block:: python
@@ -6901,7 +6894,7 @@ reduce_mean
 返回：  运算、减少维度之后的Tensor变量。
 
 返回类型：  变量（Variable）
-          
+
 **代码示例**
 
 ..  code-block:: python
@@ -6950,7 +6943,7 @@ reduce_min
 返回：  运算、减少维度之后的Tensor变量。
 
 返回类型：  变量（Variable）
-          
+
 **代码示例**
 
 ..  code-block:: python
@@ -6998,7 +6991,7 @@ reduce_prod
 返回：  运算、减少维度之后的Tensor变量。
 
 返回类型：  变量（Variable）
-          
+
 **代码示例**
 
 ..  code-block:: python
@@ -7047,7 +7040,7 @@ reduce_sum
 返回：  运算、减少维度之后的Tensor变量。
 
 返回类型：  变量（Variable）
-          
+
 **代码示例**
 
 ..  code-block:: python
@@ -7067,7 +7060,7 @@ reduce_sum
       # 接下来的示例中，我们在每处函数调用后面都标注出了它的结果张量。
       fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26]
       fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20]
-      
+
 
 
 
@@ -7085,10 +7078,10 @@ relu
 .. py:function:: paddle.fluid.layers.relu(x, name=None)
 
 Relu接受一个输入数据(张量)，输出一个张量。将线性函数y = max(0, x)应用到张量中的每个元素上。
-    
-.. math::                 
+
+.. math::
               \\Out=\max(0,x)\\
- 
+
 
 参数:
   - **x** (Variable):输入张量。
@@ -7123,7 +7116,7 @@ relu6
 relu6激活算子（Relu6 Activation Operator）
 
 .. math::
-  
+
     \\out=min(max(0, x), 6)\\
 
 
@@ -7165,7 +7158,7 @@ reshape
 在指定目标shape时存在一些技巧：
 
 .. code-block:: text
-	
+
 	1. -1表示这个维度的值是从x的元素总数和剩余维度推断出来的。因此，有且只有一个维度可以被设置为-1。
 	2. 0表示实际的维数是从x的对应维数中复制出来的，因此shape中0的索引值不能超过秩(x)。
 
@@ -7230,22 +7223,22 @@ align_corners和align_mode是可选参数，插值的计算方法可以由它们
     Example:
 
       For scale:
-      
+
         if align_corners = True && out_size > 1 :
 
           scale_factor = (in_size-1.0)/(out_size-1.0)
-        
+
         else:
-          
-          scale_factor = float(in_size/out_size)     
+
+          scale_factor = float(in_size/out_size)
 
     Bilinear interpolation:
 
       if align_corners = False , align_mode = 0
-          
+
           input : (N,C,H_in,W_in)
           output: (N,C,H_out,W_out) where:
-          
+
           H_out = (H_{in}+0.5) * scale_{factor} - 0.5
           W_out = (W_{in}+0.5) * scale_{factor} - 0.5
 
@@ -7261,7 +7254,7 @@ align_corners和align_mode是可选参数，插值的计算方法可以由它们
 
 
 参数:
-    - **input** (Variable) - 双线性插值的输入张量，是一个shpae为(N x C x h x w)的4d张量。
+    - **input** (Variable) - 双线性插值的输入张量，是一个shape为(N x C x h x w)的4d张量。
     - **out_shape** (Variable) - 一维张量，包含两个数。第一个数是高度，第二个数是宽度。
     - **scale** (float|None) - 用于输入高度或宽度的乘数因子。out_shape和scale至少要设置一个。out_shape的优先级高于scale。默认值:None。
     - **name** (str|None) - 输出变量名。
@@ -7301,18 +7294,18 @@ resize_nearest
     Example:
 
           For scale:
-          
+
             if align_corners = True && out_size > 1 :
 
               scale_factor = (in_size-1.0)/(out_size-1.0)
-            
+
             else:
-              
+
               scale_factor = float(in_size/out_size)
-            
-          
+
+
           Nearest neighbor interpolation:
-          
+
           if align_corners = False
 
               input : (N,C,H_in,W_in)
@@ -7415,9 +7408,9 @@ roi_pool
 
 .. py:function:: paddle.fluid.layers.roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0)
 
-    
+
 roi池化是对非均匀大小的输入执行最大池化，以获得固定大小的特征映射(例如7*7)。
-    
+
 该operator有三个步骤:
 
     1. 用pooled_width和pooled_height将每个区域划分为大小相等的部分
@@ -7426,7 +7419,7 @@ roi池化是对非均匀大小的输入执行最大池化，以获得固定大
 
 Faster-RCNN.使用了roi池化。roi关于roi池化请参考 https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
 
-参数:    
+参数:
     - **input** (Variable) - 张量，ROIPoolOp的输入。输入张量的格式是NCHW。其中N为batch大小，C为输入通道数，H为特征高度，W为特征宽度
     - **roi** (Variable) -  roi区域。
     - **pooled_height** (integer) - (int，默认1)，池化输出的高度。默认:1
@@ -7434,9 +7427,9 @@ Faster-RCNN.使用了roi池化。roi关于roi池化请参考 https://stackoverfl
     - **spatial_scale** (float) - (float，默认1.0)，用于将ROI coords从输入比例转换为池化时使用的比例。默认1.0
 
 返回: (张量)，ROIPoolOp的输出是一个shape为(num_rois, channel, pooled_h, pooled_w)的4d张量。
-    
+
 返回类型: 变量（Variable）
-    
+
 
 **代码示例**
 
@@ -7460,17 +7453,17 @@ row_conv
 
 .. py:function:: paddle.fluid.layers.row_conv(input, future_context_size, param_attr=None, act=None)
 
-行卷积（Row-convolution operator）称为超前卷积（lookahead convolution）。下面关于DeepSpeech2的paper中介绍了这个operator 
-    
-    `<http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf>`_ 
+行卷积（Row-convolution operator）称为超前卷积（lookahead convolution）。下面关于DeepSpeech2的paper中介绍了这个operator
+
+    `<http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf>`_
 
 双向的RNN在深度语音模型中很有用，它通过对整个序列执行正向和反向传递来学习序列的表示。然而，与单向RNNs不同的是，在线部署和低延迟设置中，双向RNNs具有难度。超前卷积将来自未来子序列的信息以一种高效的方式进行计算，以改进单向递归神经网络。 row convolution operator 与一维序列卷积不同，计算方法如下:
-   
+
 给定输入序列长度为 :math:`t` 的输入序列 :math:`X` 和输入维度 :math:`D` ，以及一个大小为 :math:`context * D` 的滤波器 :math:`W` ，输出序列卷积为:
 
-.. math::   
+.. math::
 		out_i = \sum_{j=i}^{i+context-1} X_{j} · W_{j-i}
-    
+
 公式中：
     - :math:`out_i` : 第i行输出变量形为[1, D].
     - :math:`context` ： 下文（future context）大小
@@ -7484,7 +7477,7 @@ row_conv
     - **future_context_size** (int) -- 下文大小。请注意，卷积核的shape是[future_context_size + 1, D]。
     - **param_attr** (ParamAttr) --  参数的属性，包括名称、初始化器等。
     - **act** (str) -- 非线性激活函数。
-    
+
 返回: 输出(Out)是一个LodTensor，它支持可变时间长度的输入序列。这个LodTensor的内部量是一个形状为 T x N 的矩阵，和X的 shape 一样。
 
 
@@ -7493,7 +7486,7 @@ row_conv
 ..  code-block:: python
 
 	import paddle.fluid as fluid
-     
+
      	x = fluid.layers.data(name='x', shape=[16],
                         dtype='float32', lod_level=1)
 	out = fluid.layers.row_conv(input=x, future_context_size=2)
@@ -7683,15 +7676,15 @@ selu
 .. math::
     selu= \lambda*
     \begin{cases}
-         x                      &\quad \text{ if } x>0 \\ 
-         \alpha * e^x - \alpha  &\quad \text{ if } x<=0 
+         x                      &\quad \text{ if } x>0 \\
+         \alpha * e^x - \alpha  &\quad \text{ if } x<=0
     \end{cases}
 
 输入 ``x`` 可以选择性携带LoD信息。输出和它共享此LoD信息(如果有)。
 
 参数:
   - **x** (Variable) – 输入张量
-  - **scale** (float, None) – 如果标度没有设置，其默认值为 1.0507009873554804934193349852946。 详情请见： `Self-Normalizing Neural Networks <https://arxiv.org/abs/1706.02515.pdf>`_ 
+  - **scale** (float, None) – 如果标度没有设置，其默认值为 1.0507009873554804934193349852946。 详情请见： `Self-Normalizing Neural Networks <https://arxiv.org/abs/1706.02515.pdf>`_
   - **alpha** (float, None) – 如果没有设置改参数, 其默认值为 1.6732632423543772848170429916717。 详情请见： `Self-Normalizing Neural Networks <https://arxiv.org/abs/1706.02515.pdf>`_
   - **name** (str|None, default None) – 该层命名，若为None则自动为其命名
 
@@ -7729,7 +7722,7 @@ sequence_concat操作通过序列信息连接LoD张量（Tensor）。例如：X1
 参数:
         - **input** (list) – 要连接变量的列表
         - **name** (str|None) – 此层的名称(可选)。如果没有设置，该层将被自动命名。
-        
+
 返回:     连接好的输出变量。
 
 返回类型:   变量（Variable）
@@ -7740,7 +7733,7 @@ sequence_concat操作通过序列信息连接LoD张量（Tensor）。例如：X1
 ..  code-block:: python
 
         out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3])
-        
+
 
 
 
@@ -7752,7 +7745,7 @@ sequence_concat操作通过序列信息连接LoD张量（Tensor）。例如：X1
 
 .. _cn_api_fluid_layers_sequence_conv:
 
-sequence_conv 
+sequence_conv
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.sequence_conv(input, num_filters, filter_size=3, filter_stride=1, padding=None, bias_attr=None, param_attr=None, act=None, name=None)
@@ -7800,16 +7793,16 @@ sequence_enumerate
             win_size = 2  pad_value = 0
         输出：
             Out.lod = [[0, 3, 5]]  Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]  Out.dims = [5, 2]
-        
-参数:   
+
+参数:
         - **input** （Variable）- 作为索引序列的输入变量。
         - **win_size** （int）- 枚举所有子序列的窗口大小。
         - **pad_value** （int）- 填充值，默认为0。
-          
+
 返回:      枚举序列变量是LoD张量（LoDTensor）。
 
 返回类型:   Variable
-          
+
 **代码示例**
 
 ..  code-block:: python
@@ -7827,7 +7820,7 @@ sequence_enumerate
 
 .. _cn_api_fluid_layers_sequence_expand:
 
-sequence_expand 
+sequence_expand
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.sequence_expand(x, y, ref_level=-1, name=None)
@@ -7899,7 +7892,7 @@ sequence_expand
 
 .. _cn_api_fluid_layers_sequence_expand_as:
 
-sequence_expand_as 
+sequence_expand_as
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.sequence_expand_as(x, y, name=None)
@@ -7929,13 +7922,13 @@ Sequence Expand As Layer
     给定一个 input(X)：
         X.data = [[a, b], [c, d], [e, f]]
         X.dims = [3, 2]
-    
+
     和 input(Y):
         Y.lod = [[0, 2, 3, 6]]
     ref_level: 0
 
     得到输出张量：
-    
+
         Out.lod =  [[0,             2,     3,                    6]]
         Out.data = [[a, b], [a, b] [c, d], [e, f], [e, f], [e, f]]
         Out.dims = [6, 2]
@@ -8034,9 +8027,9 @@ sequence_last_step
     输出为Tensor:
 
         out.dim = [3, 1]
-        
+
         且 len(x.lod[-1]) == out.dims[0]
-        
+
         out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
 
 参数：**input** (variable)-输入变量，为LoDTensor
@@ -8113,13 +8106,13 @@ sequence_pad
     例1:
 
     给定 1-level LoDTensor
-    
+
     input(X):
         X.lod = [[0,2,5]]
         X.data = [a,b,c,d,e]
     input(PadValue):
         PadValue.data = [0]
-    
+
     'padded_length'=4
 
     得到LoDTensor:
@@ -8129,17 +8122,17 @@ sequence_pad
 ::
 
     例2:
-    
+
     给定 1-level LoDTensor
-    
+
     input(X):
         X.lod = [[0,2,5]]
         X.data = [[a1,a2],[b1,b2],[c1,c2],[d1,d2],[e1,e2]]
     input(PadValue):
         PadValue.data = [0]
-    
+
     'padded_length' = -1,表示用最长输入序列的长度(此例中为3)
-    
+
     得到LoDTensor:
         Out.data = [[[a1,a2],[b1,b2],[0,0]],[[c1,c2],[d1,d2],[e1,e2]]]
         Length.data = [[2],[3]]
@@ -8148,17 +8141,17 @@ sequence_pad
 ::
 
     例3:
-    
+
     给定 1-level LoDTensor
-    
+
     input(X):
         X.lod = [[0,2,5]]
         X.data = [[a1,a2],[b1,b2],[c1,c2],[d1,d2],[e1,e2]]
     input(PadValue):
         PadValue.data = [p1,p2]
-    
+
     'padded_length' = -1,表示用最长输入序列的长度（此例中为3）
-    
+
     得到LoDTensor:
         Out.data = [[[a1,a2],[b1,b2],[p1,p2]],[[c1,c2],[d1,d2],[e1,e2]]]
         Length.data = [[2],[3]]
@@ -8196,7 +8189,7 @@ sequence_pad
 
 .. _cn_api_fluid_layers_sequence_pool:
 
-sequence_pool 
+sequence_pool
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.sequence_pool(input, pool_type, is_test=False)
@@ -8262,7 +8255,7 @@ sequence_pool
 sequence_reshape
 -------------------------------
 
-.. py:function:: paddle.fluid.layers.sequence_reshape(input, new_dim) 
+.. py:function:: paddle.fluid.layers.sequence_reshape(input, new_dim)
 
 Sequence Reshape Layer
 该层重排输入序列。用户设置新维度。每一个序列的的长度通过原始长度、原始维度和新的维度计算得出。以下实例帮助解释该层的功能
@@ -8333,7 +8326,7 @@ sequence_reverse
 
 ::
 
-    Y.data() = [ [5, 6, 7, 8], [1, 2, 3, 4], # 索引为0，长度为2的逆序列 
+    Y.data() = [ [5, 6, 7, 8], [1, 2, 3, 4], # 索引为0，长度为2的逆序列
                  [17, 18, 19, 20], [13, 14, 15, 16], [9, 10, 11, 12] # 索引为1，长度为3的逆序列
 
 该运算在建立反dynamic RNN 网络中十分有用。
@@ -8370,7 +8363,7 @@ sequence_scatter
 这个operator将更新张量X，它使用Ids的LoD信息来选择要更新的行，并使用Ids中的值作为列来更新X的每一行。
 
 **样例**:
- 
+
 ::
 
     输入：
@@ -8449,7 +8442,7 @@ sequence_slice
         out.dims = (3, 2).
 
 .. note::
-   ``input`` ， ``offset`` ， ``length`` 的第一维大小应相同。 
+   ``input`` ， ``offset`` ， ``length`` 的第一维大小应相同。
    ``offset`` 从0开始。
 
 参数:
@@ -8575,7 +8568,7 @@ sequence_unpad
     x = fluid.layers.data(name='x', shape=[10, 5], dtype='float32')
     len = fluid.layers.data(name='length', shape=[1], dtype='int64')
     out = fluid.layers.sequence_unpad(x=x, length=len)
-    
+
 
 
 
@@ -8605,14 +8598,13 @@ shape层。
 
 返回类型：    Variable
         
-        
 **代码示例：**
 
 .. code-block:: python
 
     input = fluid.layers.data(
         name="input", shape=[3, 100, 100], dtype="float32")
-    out = fluid.layers.shape(input)        
+    out = fluid.layers.shape(input)
 
 
 
@@ -8671,14 +8663,14 @@ shuffle_channel
 返回：通道混洗结果是一个张量变量，其形状和类型与输入相同。
 
 返回类型：输出（Variable）
-        
-        
+
+
 **代码示例：**
 
 .. code-block:: python
 
     input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32')
-    out = fluid.layers.shuffle_channel(x=input, group=2)    
+    out = fluid.layers.shuffle_channel(x=input, group=2)
 
 
 
@@ -8701,7 +8693,7 @@ sigmoid_cross_entropy_with_logits
 
 .. math::
     loss = -Labels * log(sigma(X)) - (1 - Labels) * log(1 - sigma(X))
- 
+
 已知:
 
 .. math::
@@ -8722,7 +8714,7 @@ sigmoid_cross_entropy_with_logits
 
 
 参数:
-  - **x** (Variable) - (Tensor, 默认 Tensor<float>)，形为 N x D 的二维张量，N为batch大小，D为类别数目。该输入是一个由先前运算得出的logit组成的张量。logit是未标准化(unscaled)的log概率， 公式为 :math:`log(\frac{p}{1-p})` 
+  - **x** (Variable) - (Tensor, 默认 Tensor<float>)，形为 N x D 的二维张量，N为batch大小，D为类别数目。该输入是一个由先前运算得出的logit组成的张量。logit是未标准化(unscaled)的log概率， 公式为 :math:`log(\frac{p}{1-p})`
   - **label** (Variable) -  (Tensor, 默认 Tensor<float>) 具有和X相同类型，相同形状的二维张量。该输入张量代表了每个logit的可能标签
   - **ignore_index** （int） - （int，默认kIgnoreIndex）指定被忽略的目标值，它不会影响输入梯度
   - **name** (basestring|None) - 输出的名称
@@ -8781,7 +8773,7 @@ similarity_focus
 
     给定四维张量 x 形为 (BatchSize, C, A, B), 其中C 为通道Channel数目，
     特征图（feature map）的形为（A,B）：
-    
+
         x.shape = (2, 3, 2, 2)
         x.data = [[[[0.8, 0.1],
                     [0.4, 0.5]],
@@ -8868,17 +8860,17 @@ slice算子。
 
 ::
 
-        案例1：给定：data=[[1,2,3,4],[5,6,7,8],] 
-                     axes=[0,1] 
-                     starts=[1,0] 
-                     ends=[2,3] 
+        案例1：给定：data=[[1,2,3,4],[5,6,7,8],]
+                     axes=[0,1]
+                     starts=[1,0]
+                     ends=[2,3]
                则：
                      result=[[5,6,7],]
 
         案例2：给定：
-                     data=[[1,2,3,4],[5,6,7,8],] 
-                     starts=[0,1] 
-                     ends=[-1,1000] 
+                     data=[[1,2,3,4],[5,6,7,8],]
+                     starts=[0,1]
+                     ends=[-1,1000]
                则：
                      result=[[2,3,4],]
 
@@ -8928,15 +8920,15 @@ smooth_l1
         - **inside_weight** (Variable|None) - rank至少为2的张量。这个输入是可选的，与x的形状应该相同。如果给定， ``(x - y)`` 的结果将乘以这个张量元素。
         - **outside_weight** (变量|None) - 一个rank至少为2的张量。这个输入是可选的，它的形状应该与 ``x`` 相同。如果给定，那么 smooth L1 loss 就会乘以这个张量元素。
         - **sigma** (float|None) - smooth L1 loss layer的超参数。标量，默认值为1.0。
-   
+
 返回：	smooth L1 loss, shape为 [batch_size, 1]
 
-返回类型:  Variable    
+返回类型:  Variable
 
 **代码示例**
 
 ..  code-block:: python
-        
+
     data = fluid.layers.data(name='data', shape=[128], dtype='float32')
     label = fluid.layers.data(
         name='label', shape=[100], dtype='float32')
@@ -8962,7 +8954,7 @@ soft_relu
 SoftRelu 激活函数
 
 .. math::   out=ln(1+exp(max(min(x,threshold),threshold))
- 
+
 参数:
     - **x** (variable) - SoftRelu operator的输入
     - **threshold** (FLOAT|40.0) - SoftRelu的阈值
@@ -8972,7 +8964,7 @@ SoftRelu 激活函数
 
 .. code-block:: python
 
-    x = fluid.layers.data(name=”x”, shape=[2,3,16,16], dtype=”float32”) 
+    x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32")
     y = fluid.layers.soft_relu(x, threshold=20.0)
 
 
@@ -9065,6 +9057,7 @@ softmax_with_cross_entropy
 
 
 参数:
+
   - **logits** (Variable) - 未标准化(unscaled)的log概率,一个形为 N X K 的二维张量。 N是batch大小，K是类别总数。
   - **label** (Variable) - 2-D 张量，代表了正确标注（ground truth）, 如果 ``soft_label`` 为  False，则该参数是一个形为 N X 1 的Tensor<int64> 。如果 ``soft_label`` 为 True，它是 Tensor<float/double> ，形为 N X K 。
   - **soft_label** (bool) - 是否将输入标签当作软标签。默认为False。
@@ -9072,7 +9065,7 @@ softmax_with_cross_entropy
   - **numeric_stable_mode** (bool) – 标志位，指明是否使用一个具有更佳数学稳定性的算法。仅在 ``soft_label`` 为 False的GPU模式下生效. 若 ``soft_label`` 为 True 或者执行场所为CPU, 算法一直具有数学稳定性。 注意使用稳定算法时速度可能会变慢。默认为 True。
   - **return_softmax** (bool) – 标志位，指明是否额外返回一个softmax值， 同时返回交叉熵计算结果。默认为False。
 
-返回: 
+返回:
   - 如果 ``return_softmax`` 为 False， 则返回交叉熵损失
   - 如果 ``return_softmax`` 为 True，则返回元组 (loss, softmax) ，其中交叉熵损失为形为[N x 1]的二维张量，softmax为[N x K]的二维张量
 
@@ -9088,8 +9081,8 @@ softmax_with_cross_entropy
         fc = fluid.layers.fc(input=data, size=100)
         out = fluid.layers.softmax_with_cross_entropy(
         logits=fc, label=label)
-      
-      
+
+
 
 
 
@@ -9117,7 +9110,7 @@ space_to_depth
 
  - 在各位置上，不重叠的，大小为 :math:`block\_size * block\_size` 的块重组入深度depth
  - 输出张量的深度为 :math:`block\_size * block\_size * input\_channel`
- - 输入各个块中的Y,X坐标变为输出张量通道索引的高序部位 
+ - 输入各个块中的Y,X坐标变为输出张量通道索引的高序部位
  - channel可以被blocksize的平方整除
  - 高度，宽度可以被blocksize整除
 
@@ -9129,7 +9122,7 @@ space_to_depth
 
 返回类型：Variable
 
-抛出异常： 
+抛出异常：
   - ``TypeError`` - ``blocksize`` 必须是long类型
 
 **代码示例**
@@ -9164,7 +9157,7 @@ spectral_norm
 
 步骤1：生成形状为[H]的向量U,以及形状为[W]的向量V,其中H是输入权重的第 ``dim`` 个维度，W是剩余维度的乘积。
 
-步骤2： ``power_iters`` 应该是一个正整数，用U和V迭代计算 ``power_iters`` 轮。 
+步骤2： ``power_iters`` 应该是一个正整数，用U和V迭代计算 ``power_iters`` 轮。
 
 .. math::
 
@@ -9177,7 +9170,7 @@ spectral_norm
     \sigma(\mathbf{W}) &= \mathbf{u}^{T} \mathbf{W} \mathbf{v}\\
     \mathbf{W} &= \frac{\mathbf{W}}{\sigma(\mathbf{W})}
 
-可参考: `Spectral Normalization <https://arxiv.org/abs/1802.05957>`_ 
+可参考: `Spectral Normalization <https://arxiv.org/abs/1802.05957>`_
 
 参数：
     - **weight** (Variable)-spectral_norm算子的输入权重张量，可以是2-D, 3-D, 4-D, 5-D张量，它是fc、conv1d、conv2d、conv3d层的权重。
@@ -9245,7 +9238,7 @@ split
 
 .. _cn_api_fluid_layers_square_error_cost:
 
-square_error_cost 
+square_error_cost
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.square_error_cost(input,label)
@@ -9291,7 +9284,7 @@ square_error_cost
 
 .. _cn_api_fluid_layers_squeeze:
 
-squeeze 
+squeeze
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.squeeze(input, axes, name=None)
@@ -9330,7 +9323,7 @@ squeeze
 .. code-block:: python
 
     x = fluid.layers.data(name='x', shape=[5, 1, 10])
-    y = fluid.layers.sequeeze(input=x, axes=[1])      
+    y = fluid.layers.squeeze(input=x, axes=[1])
 
 
 
@@ -9395,6 +9388,7 @@ stack
         Out.dims = [1, 3, 2]
 
 参数:	
+
   - **x** (Variable|list(Variable)|tuple(Variable)) – 输入变量
   - **axis** (int|None) – 对输入进行stack运算所在的轴
 
@@ -9421,7 +9415,7 @@ stanh
 
 STanh 激活算子（STanh Activation Operator.）
 
-.. math::      
+.. math::
           \\out=b*\frac{e^{a*x}-e^{-a*x}}{e^{a*x}+e^{-a*x}}\\
 
 参数：
@@ -9488,7 +9482,7 @@ swish
 
 Swish 激活函数
 
-.. math::   
+.. math::
          out = \frac{x}{1 + e^{- beta x}}
 
 参数：
@@ -9537,7 +9531,7 @@ teacher_student_sigmoid_loss
 
 **代码示例**：
 
-.. code-block:: python 
+.. code-block:: python
 
     cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label)
 
@@ -9572,7 +9566,7 @@ temporal_shift
 
 步骤4：沿第3(C)维连接三个切片，并将结果重塑为[N*T, C, H, W]。
 
-有关时间移动的详细信息，请参阅文件： `Temporal Shift Module <https://arxiv.org/abs/1811.08383>`_ 
+有关时间移动的详细信息，请参阅文件： `Temporal Shift Module <https://arxiv.org/abs/1811.08383>`_
 
 参数：
   - **x**  (Variable) – 时移算符的输入张量。这是一个4维张量，形状为[N*T，C，H，W]。N为批量大小，T为时间段数，C为信道数，H为特征高度，W为特征宽度
@@ -9589,7 +9583,7 @@ temporal_shift
 
 **代码示例**：
 
-.. code-block:: python 
+.. code-block:: python
 
     input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32')
     out = fluid.layers.temporal_shift(x=input, seg_num=2, shift_ratio=0.2)
@@ -9642,7 +9636,7 @@ topk
 
 **代码示例**：
 
-.. code-block:: python 
+.. code-block:: python
 
     top5_values, top5_indices = fluid.layers.topk(input, k=5)
 
@@ -9728,7 +9722,7 @@ tree_conv
     # 输出的形会是[None, 10, 6, 1],
     # None 代表batch size, 10数据集的最大节点大小max_node_size, 6 代表输出大小output size, 1 代表 1 个filter
     out_vector = fluid.layers.reshape(out_vector, shape=[None, 10, 6])
-    # reshape之后, 输出张量output tensor为下一个树卷积的nodes_vector 
+    # reshape之后, 输出张量output tensor为下一个树卷积的nodes_vector
     out_vector_2 = fluid.layers.tree_conv(out_vector, edge_set, 3, 4, 2, 'tanh',
         ParamAttr(initializer=Constant(1.0), ParamAttr(initializer=Constant(1.0))
     # 输出tensor也可以用来池化(论文中称为global pooling)
@@ -9837,9 +9831,9 @@ unstack
 
 如果 ``num`` 为 None，则它可以从 ``x.shape[axis]`` 中推断而来。
 
-如果 ``x.shape[axis]`` <= 0或者Unknown, 则抛出异常 ``ValueError`` 。 
+如果 ``x.shape[axis]`` <= 0或者Unknown, 则抛出异常 ``ValueError`` 。
 
-参数:	
+参数:
   - **x** (Variable|list(Variable)|tuple(Variable)) – 输入变量
   - **axis** (int|None) – 对输入进行unstack运算所在的轴
   - **num** (int|None) - 输出变量的数目
@@ -9847,7 +9841,7 @@ unstack
 返回: 经unstack运算后的变量
 
 返回类型: list(Variable)
-  
+
 
 
 
@@ -9897,7 +9891,7 @@ warpctc
 
 
 ============
- ops 
+ ops
 ============
 
 
@@ -9914,6 +9908,7 @@ abs
     out = |x|
 
 参数:
+
     - **x** - abs算子的输入 
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn
  
@@ -9997,12 +9992,13 @@ ceil
 
 
 参数:
+
     - **x** - Ceil算子的输入 
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn
 
 返回：        Ceil算子的输出。
-        
-        
+
+
 
 
 
@@ -10027,6 +10023,7 @@ Cosine余弦激活函数。
 
 
 参数:
+
     - **x** - cos算子的输入 
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn
 
@@ -10087,6 +10084,7 @@ Exp激活函数(Exp指以自然常数e为底的指数运算)。
     out = e^x
 
 参数:
+
     - **x** - Exp算子的输入 
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn
 
@@ -10117,6 +10115,7 @@ floor
 
 
 参数:
+
     - **x** - Floor算子的输入 
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn
 
@@ -10144,7 +10143,7 @@ HardShrink激活函数(HardShrink activation operator)
 
 
 .. math::
-	
+
 	out = \begin{cases}
         x, \text{if } x > \lambda \\
         x, \text{if } x < -\lambda \\
@@ -10162,7 +10161,7 @@ HardShrink激活函数(HardShrink activation operator)
 .. code-block:: python
 
     data = fluid.layers.data(name="input", shape=[784])
-    result = fluid.layers.hard_shrink(x=data, threshold=0.3)    
+    result = fluid.layers.hard_shrink(x=data, threshold=0.3)
 
 
 
@@ -10189,6 +10188,7 @@ Logsigmoid激活函数。
 
 参数:
     - **x** - LogSigmoid算子的输入
+    
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn 
 
 返回：        LogSigmoid算子的输出
@@ -10216,15 +10216,13 @@ Reciprocal（取倒数）激活函数
     out = \frac{1}{x}
 
 参数:
+
     - **x** - reciprocal算子的输入 
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn 
 
-返回：        Reciprocal算子的输出。        
-
+返回：        Reciprocal算子的输出。
 
 
-        
-        
 
 
 
@@ -10248,12 +10246,13 @@ Round取整激活函数。
 
 
 参数:
+
     - **x** - round算子的输入 
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn 
 
 返回：        Round算子的输出。
-        
-        
+
+
 
 
 
@@ -10276,13 +10275,14 @@ sigmoid激活函数
 
 
 参数:
+
     - **x** - Sigmoid算子的输入 
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn
 
 返回：     Sigmoid运算输出.
 
 
- 
+
 
 
 
@@ -10304,6 +10304,7 @@ sin
 
 
 参数:
+
     - **x** - sin算子的输入 
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn
 
@@ -10351,7 +10352,7 @@ softplus激活函数。
 softshrink
 -------------------------------
 
-.. py:function:: paddle.fluid.layers.softshrink(x, name=None)       
+.. py:function:: paddle.fluid.layers.softshrink(x, name=None)
 
 Softshrink激活算子
 
@@ -10361,9 +10362,9 @@ Softshrink激活算子
                     x + \lambda, \text{if } x < -\lambda \\
                     0,  \text{otherwise}
               \end{cases}
-       
+
 参数：
-        - **x** - Softshrink算子的输入 
+        - **x** - Softshrink算子的输入
         - **lambda** （FLOAT）- 非负偏移量。
 
 返回：       Softshrink算子的输出
@@ -10421,6 +10422,7 @@ sqrt
     out = \sqrt{x}
 
 参数:
+
     - **x** - Sqrt算子的输入 
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn
 
@@ -10479,6 +10481,7 @@ tanh 激活函数。
 
 
 参数:
+
     - **x** - Tanh算子的输入  
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn
 
@@ -10507,6 +10510,7 @@ tanh_shrink激活函数。
     out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
 
 参数:
+
     - **x** - TanhShrink算子的输入 
     - **use_cudnn** (BOOLEAN) – （bool，默认为false）是否仅用于cudnn核，需要安装cudnn
 
@@ -10532,8 +10536,8 @@ ThresholdedRelu激活函数
 .. math::
 
 	out = \left\{\begin{matrix}
-	    x, if&x > threshold\\ 
-	    0, &otherwise 
+	    x, if&x > threshold\\
+	    0, &otherwise
 	    \end{matrix}\right.
 
 参数：
@@ -10590,7 +10594,7 @@ uniform_random
 
 
 ============
- tensor 
+ tensor
 ============
 
 
@@ -10600,7 +10604,7 @@ argmax
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.argmax(x,axis=0)
-    
+
 **argmax**
 
 该功能计算输入张量元素中最大元素的索引，张量的元素在提供的轴上。
@@ -10634,7 +10638,7 @@ argmin
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.argmin(x,axis=0)
-    
+
 **argmin**
 
 该功能计算输入张量元素中最小元素的索引，张量元素在提供的轴上。
@@ -10653,7 +10657,7 @@ argmin
 
     out = fluid.layers.argmin(x=in, axis=0)
     out = fluid.layers.argmin(x=in, axis=-1)
-    
+
 
 
 
@@ -10673,7 +10677,7 @@ argsort
 
 .. code-block:: text
 
-    例如： 
+    例如：
 	给定 input 并指定 axis=-1
 
         input = [[0.15849551, 0.45865775, 0.8563702 ],
@@ -10683,7 +10687,7 @@ argsort
 
         out = [[0.15849551, 0.45865775, 0.8563702 ],
             [0.12070083, 0.18776911, 0.28766365]],
-	
+
 	根据指定axis排序后的数据indices变为:
 
         indices = [[0, 1, 2],
@@ -10750,7 +10754,7 @@ assign
 
 .. _cn_api_fluid_layers_cast:
 
-cast 
+cast
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.cast(x,dtype)
@@ -10787,7 +10791,7 @@ concat
 
 .. py:function:: paddle.fluid.layers.concat(input,axis=0,name=None)
 
-**Concat** 
+**Concat**
 
 这个函数将输入连接在前面提到的轴上，并将其作为输出返回。
 
@@ -11067,7 +11071,7 @@ isfinite
 
 .. _cn_api_fluid_layers_ones:
 
-ones 
+ones
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.ones(shape,dtype,force_cpu=False)
@@ -11292,13 +11296,13 @@ zeros
 
 
 ============
- learning_rate_scheduler 
+ learning_rate_scheduler
 ============
 
 
 .. _cn_api_fluid_layers_append_LARS:
 
-append_LARS 
+append_LARS
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.append_LARS(params_grads,learning_rate,weight_decay)
@@ -11357,7 +11361,7 @@ cosine_decay
 
 .. _cn_api_fluid_layers_exponential_decay:
 
-exponential_decay 
+exponential_decay
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.exponential_decay(learning_rate,decay_steps,decay_rate,staircase=False)
@@ -11370,7 +11374,7 @@ exponential_decay
     if staircase == True:
         decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
     else:
-        decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)    
+        decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
 
 参数：
     - **learning_rate** (Variable|float)-初始学习率
@@ -11539,7 +11543,7 @@ Noam衰减方法。noam衰减的numpy实现如下。
                            np.power(current_steps, -0.5),
                            np.power(warmup_steps, -1.5) * current_steps])
 
-请参照 `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_ 
+请参照 `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_
 
 参数：
     - **d_model** (Variable)-模型的输入和输出维度
@@ -11593,7 +11597,7 @@ piecewise_decay
 
 .. _cn_api_fluid_layers_polynomial_decay:
 
-polynomial_decay 
+polynomial_decay
 -------------------------------
 
 .. py:function:: paddle.fluid.layers.polynomial_decay(learning_rate,decay_steps,end_learning_rate=0.0001,power=1.0,cycle=False)
@@ -11631,7 +11635,7 @@ polynomial_decay
 
 
 ============
- detection 
+ detection
 ============
 
 
@@ -11657,7 +11661,7 @@ anchor_generator
 
 返回：
     - Anchors(Varibale): 输出anchor，布局[H,W,num_anchors,4] , ``H``  是输入的高度， ``W`` 是输入的宽度， ``num_priors`` 是输入每位的框数,每个anchor格式（未归一化）为(xmin,ymin,xmax,ymax)
-    
+
     - Variances(Variable): anchor的扩展变量布局为 [H,W,num_priors,4]。 ``H`` 是输入的高度， ``W`` 是输入的宽度， ``num_priors`` 是输入每个位置的框数,每个变量的格式为(xcenter,ycenter,w,h)。
 
 返回类型：Anchors(Variable),Variances(Variable)
@@ -11683,7 +11687,7 @@ anchor_generator
 
 
 .. _cn_api_fluid_layers_bipartite_match:
-        
+
 bipartite_match
 -------------------------------
 
@@ -11725,7 +11729,7 @@ bipartite_match
 
 
 .. _cn_api_fluid_layers_box_clip:
-        
+
 box_clip
 -------------------------------
 
@@ -11753,7 +11757,7 @@ box_clip
     - **im_info (variable)**  – 具有（高度height，宽度width，比例scale）排列的形为[N，3]的图像的信息。高度和宽度是输入大小，比例是输入大小和原始大小的比率
     - **name (str)**  – 该层的名称。 为可选项
 
-返回：剪切后的tensor 
+返回：剪切后的tensor
 
 返回类型： Variable
 
@@ -12002,7 +12006,7 @@ density prior box的量由fixed_sizes and fixed_ratios决定。显然地，fixed
 
 
 .. _cn_api_fluid_layers_detection_map:
-        
+
 detection_map
 -------------------------------
 
@@ -12011,7 +12015,7 @@ detection_map
 检测mAP评估算子。一般步骤如下：首先，根据检测输入和标签计算TP（true positive）和FP（false positive），然后计算mAP评估值。支持'11 point'和积分mAP算法。请从以下文章中获取更多信息：
 
         https://sanchom.wordpress.com/tag/average-precision/
-        
+
         https://arxiv.org/abs/1512.02325
 
 参数：
@@ -12025,19 +12029,19 @@ detection_map
         - **input_states** - 如果不是None，它包含3个元素：
 
             1、pos_count（Tensor）是一个shape为[Ncls，1]的张量，存储每类的输入正例的数量，Ncls是输入分类的数量。此输入用于在执行多个小批量累积计算时传递最初小批量生成的AccumPosCount。当输入（PosCount）为空时，不执行累积计算，仅计算当前小批量的结果。
-        
+
             2、true_pos（LoDTensor）是一个shape为[Ntp，2]的2-D LoDTensor，存储每个类输入的正实例。此输入用于在执行多个小批量累积计算时传递最初小批量生成的AccumPosCount。
-        
+
             3、false_pos（LoDTensor）是一个shape为[Nfp，2]的2-D LoDTensor，存储每个类输入的负实例。此输入用于在执行多个小批量累积计算时传递最初小批量生成的AccumPosCount。
-        
+
         - **out_states** - 如果不是None，它包含3个元素：
 
-            1、accum_pos_count（Tensor）是一个shape为[Ncls，1]的Tensor，存储每个类的实例数。它结合了输入（PosCount）和从输入中的（Detection）和（label）计算的正例数。 
-        
-            2、accum_true_pos（LoDTensor）是一个shape为[Ntp'，2]的LoDTensor，存储每个类的正实例。它结合了输入（TruePos）和从输入中（Detection）和（label）计算的正实例数。 。 
-        
+            1、accum_pos_count（Tensor）是一个shape为[Ncls，1]的Tensor，存储每个类的实例数。它结合了输入（PosCount）和从输入中的（Detection）和（label）计算的正例数。
+
+            2、accum_true_pos（LoDTensor）是一个shape为[Ntp'，2]的LoDTensor，存储每个类的正实例。它结合了输入（TruePos）和从输入中（Detection）和（label）计算的正实例数。
+
             3、accum_false_pos（LoDTensor）是一个shape为[Nfp'，2]的LoDTensor，存储每个类的负实例。它结合了输入（FalsePos）和从输入中（Detection）和（label）计算的负实例数。
-        
+
         - **ap_version** （string，默认'integral'）- AP算法类型，'integral'或'11 point'。
 
 返回：        具有形状[1]的（Tensor），存储mAP的检测评估结果。
@@ -12207,7 +12211,7 @@ generate_mask_labels
     feeder.feed(batch_masks)
 
 
-参数： 
+参数：
     - **im_info**  (Variable) – 具有形状[N，3]的2-D张量。 N是批量大小，其每个元素是图像的[高度，宽度，比例]，对应第二维中的3。图像比例是 :math:`\frac{target\_size}{original\_size}` 。
     - **gt_classes**  (Variable) – 形为[M，1]的2-D LoDTensor。 M是真实值的总数，其每个元素都是一个类标签，对应第二维中的1。
     - **is_crowd**  (Variable) – 一个形为 ``gt_classes`` 的2-D LoDTensor，每个元素都是一个标志，指示一个groundtruth是否为crowd（群）。
@@ -12219,7 +12223,7 @@ generate_mask_labels
 
 返回：
     - 形为[P，4]的2D LoDTensor。 P是采样出的RoI总数。每个元素都是在原始图像大小范围内具有[xmin，ymin，xmax，ymax]格式的边界框(bounding box)。
-    - mask_rois_has_mask_int32（Variable）：形状为[P，1]的2D LoDTensor，其中每个元素为对于输入的RoI进行输出的mask RoI 索引 
+    - mask_rois_has_mask_int32（Variable）：形状为[P，1]的2D LoDTensor，其中每个元素为对于输入的RoI进行输出的mask RoI 索引
     - mask_int32（Variable）：形状为[P，K * M * M]的2D LoDTensor，K为种类数，M为mask预测的分辨率，每个元素都是二进制目标mask值。
 
 返回类型：mask_rois (Variable)
@@ -12263,7 +12267,7 @@ generate_proposal_labels
 
 该函数可以根据 ``GenerateProposals`` 的输出结果，即bounding boxes（区域框），groundtruth（正确标记数据）来对foreground boxes和background boxes进行采样，并计算loss值。
 
-RpnRois 是RPN的输出box， 并由 ``GenerateProposals`` 来进一步处理, 这些box将与groundtruth boxes合并， 并根据 ``batch_size_per_im`` 和 ``fg_fraction`` 进行采样。 
+RpnRois 是RPN的输出box， 并由 ``GenerateProposals`` 来进一步处理, 这些box将与groundtruth boxes合并， 并根据 ``batch_size_per_im`` 和 ``fg_fraction`` 进行采样。
 
 如果一个实例具有大于 ``fg_thresh`` (前景重叠阀值)的正确标记重叠，那么它会被认定为一个前景样本。
 如果一个实例具有的正确标记重叠大于 ``bg_thresh_lo`` 且小于 ``bg_thresh_hi`` (详见参数说明)，那么它将被认定为一个背景样本。
@@ -12271,7 +12275,7 @@ RpnRois 是RPN的输出box， 并由 ``GenerateProposals`` 来进一步处理, 
 
 对Rois中的每个box, 我们给它分配类标签和回归目标(box label)。最后 ``bboxInsideWeights`` 和 ``BboxOutsideWeights`` 用来指明是否它将影响训练loss值。
 
-参数:	
+参数:
   - **rpn_rois** (Variable) – 形为[N, 4]的二维LoDTensor。 N 为 ``GenerateProposals`` 的输出结果, 其中各元素为 :math:`[x_{min}, y_{min}, x_{max}, y_{max}]` 格式的边界框
   - **gt_classes** (Variable) – 形为[M, 1]的二维LoDTensor。 M 为正确标记数据数目, 其中各元素为正确标记数据的类别标签
   - **is_crowd** (Variable) – 形为[M, 1]的二维LoDTensor。M 为正确标记数据数目, 其中各元素为一个标志位，表明一个正确标记数据是不是crowd
@@ -12304,7 +12308,7 @@ RpnRois 是RPN的输出box， 并由 ``GenerateProposals`` 来进一步处理, 
 generate_proposals
 -------------------------------
 
-.. py:function:: paddle.fluid.layers.generate_proposals(scores, bbox_deltas, im_info, anchors, variances, pre_nms_top_n=6000, post_nms_top_n=1000, nms_thresh=0.5, min_size=0.1, eta=1.0, name=None) 
+.. py:function:: paddle.fluid.layers.generate_proposals(scores, bbox_deltas, im_info, anchors, variances, pre_nms_top_n=6000, post_nms_top_n=1000, nms_thresh=0.5, min_size=0.1, eta=1.0, name=None)
 
 生成proposal的Faster-RCNN
 
@@ -12313,15 +12317,15 @@ generate_proposals
 为了生成 ``proposals`` ，此操作执行以下步骤：
 
         1、转置和调整bbox_deltas的分数和大小为（H * W * A，1）和（H * W * A，4）。
-        
+
         2、计算方框位置作为 ``proposals`` 候选框。
-        
+
         3、剪辑框图像。
-        
+
         4、删除小面积的预测框。
-        
+
         5、应用NMS以获得最终 ``proposals`` 作为输出。
-        
+
 参数：
         - **scores** (Variable)- 是一个shape为[N，A，H，W]的4-D张量，表示每个框成为object的概率。N是批量大小，A是anchor数，H和W是feature map的高度和宽度。
         - **bbox_deltas** （Variable）- 是一个shape为[N，4 * A，H，W]的4-D张量，表示预测框位置和anchor位置之间的差异。
@@ -12329,9 +12333,9 @@ generate_proposals
         - **anchors** （Variable）- 是一个shape为[H，W，A，4]的4-D Tensor。H和W是 ``feature map`` 的高度和宽度，
         - **num_anchors** - 是每个位置的框的数量。每个anchor都是以非标准化格式（xmin，ymin，xmax，ymax）定义的。
         - **variances** （Variable）- anchor的方差，shape为[H，W，num_priors，4]。每个方差都是（xcenter，ycenter，w，h）这样的格式。
-        - **pre_nms_top_n** （float）- 每个图在NMS之前要保留的总框数。默认为6000。 
-        - **post_nms_top_n** （float）- 每个图在NMS后要保留的总框数。默认为1000。 
-        - **nms_thresh** （float）- NMS中的阈值，默认为0.5。 
+        - **pre_nms_top_n** （float）- 每个图在NMS之前要保留的总框数。默认为6000。
+        - **post_nms_top_n** （float）- 每个图在NMS后要保留的总框数。默认为1000。
+        - **nms_thresh** （float）- NMS中的阈值，默认为0.5。
         - **min_size** （float）- 删除高度或宽度小于min_size的预测框。默认为0.1。
         - **eta** （float）- 在自适应NMS中应用，如果自适应阈值> 0.5，则在每次迭代中使用adaptive_threshold = adaptive_treshold * eta。
 
@@ -12376,7 +12380,7 @@ iou_similarity
 
 
 .. _cn_api_fluid_layers_multi_box_head:
-        
+
 multi_box_head
 -------------------------------
 
@@ -12418,7 +12422,7 @@ multi_box_head
     - **variances** ： ``PriorBox`` 的方差。布局是[num_priors，4]。 ``num_priors`` 是每个输入位置的总窗口数。
 
 返回类型：元组（tuple）
-        
+
 **代码示例**
 
 ..  code-block:: python
@@ -12443,7 +12447,7 @@ multi_box_head
 multiclass_nms
 -------------------------------
 
-.. py:function:: paddle.fluid.layers.multiclass_nms(bboxes, scores, score_threshold, nms_top_k, keep_top_k, nms_threshold=0.3, normalized=True, nms_eta=1.0, background_label=0, name=None)  
+.. py:function:: paddle.fluid.layers.multiclass_nms(bboxes, scores, score_threshold, nms_top_k, keep_top_k, nms_threshold=0.3, normalized=True, nms_eta=1.0, background_label=0, name=None)
 
 **多分类NMS**
 
@@ -12462,10 +12466,10 @@ multiclass_nms
 
     - **scores**  (Variable) – 支持两种类型的分数：
 
-      1. （tensor）具有形状[N，C，M]的3-D张量表示预测的置信度。 N是批量大小 batch size，C是种类数目，M是边界框bounding box的数量。对于每个类别，存在对应于M个边界框的总M个分数。请注意，M等于bboxes的第二维。 
+      1. （tensor）具有形状[N，C，M]的3-D张量表示预测的置信度。 N是批量大小 batch size，C是种类数目，M是边界框bounding box的数量。对于每个类别，存在对应于M个边界框的总M个分数。请注意，M等于bboxes的第二维。
       2. （LoDTensor）具有形状[M，C]的2-D LoDTensor。 M是bbox的数量，C是种类数目。在这种情况下，输入bboxes应该是形为[M，C，4]的第二种情况。
-            
-    - **background_label**  (int) – 背景标签（类别）的索引，背景标签（类别）将被忽略。如果设置为-1，则将考虑所有类别。默认值：0 
+
+    - **background_label**  (int) – 背景标签（类别）的索引，背景标签（类别）将被忽略。如果设置为-1，则将考虑所有类别。默认值：0
     - **score_threshold**  (float) – 过滤掉低置信度分数的边界框的阈值。如果没有提供，请考虑所有边界框。
     - **nms_top_k**  (int) – 根据通过score_threshold的过滤后而得的检测(detection)的置信度，所需要保留的最大检测数。
     - **nms_threshold**  (float) – 在NMS中使用的阈值。默认值：0.3 。
@@ -12502,7 +12506,7 @@ multiclass_nms
 polygon_box_transform
 -------------------------------
 
-.. py:function:: paddle.fluid.layers.polygon_box_transform(input, name=None)  
+.. py:function:: paddle.fluid.layers.polygon_box_transform(input, name=None)
 
 PolygonBoxTransform 算子。
 
@@ -12513,7 +12517,7 @@ PolygonBoxTransform 算子。
 参数：
     - **input** （Variable） - shape 为[batch_size，geometry_channels，height，width]的张量
 
-返回：与输入 shpae 相同
+返回：与输入 shape 相同
 
 返回类型：output（Variable）
 
@@ -12529,7 +12533,7 @@ PolygonBoxTransform 算子。
 
 .. _cn_api_fluid_layers_prior_box:
 
-prior_box 
+prior_box
 -------------------------------
 .. py:function:: paddle.fluid.layers.prior_box(input,image,min_sizes=None,max_sizes=None,aspect_ratios=[1.0],variance=[0.1,0.1,0.2,0.2],flip=False,clip=False,steps=[0.0,0.0],offset=0.5,name=None,min_max_aspect_ratios_order=False)
 
@@ -12654,10 +12658,10 @@ rpn_target_assign
 
 返回:
 
-返回元组 (predicted_scores, predicted_location, target_label, target_bbox, bbox_inside_weight) : 
-   - **predicted_scores** 和 **predicted_location** 是RPN的预测结果。 **target_label** 和 **target_bbox** 分别是真实准确数据(ground-truth)。 
-   - **predicted_location** 是一个形为[F，4]的2D Tensor， **target_bbox** 的形与 **predicted_location** 相同，F是foreground anchors的数量。 
-   - **predicted_scores** 是一个shape为[F + B，1]的2D Tensor， **target_label** 的形与 **predict_scores** 的形相同，B是background anchors的数量，F和B取决于此算子的输入。 
+返回元组 (predicted_scores, predicted_location, target_label, target_bbox, bbox_inside_weight) :
+   - **predicted_scores** 和 **predicted_location** 是RPN的预测结果。 **target_label** 和 **target_bbox** 分别是真实准确数据(ground-truth)。
+   - **predicted_location** 是一个形为[F，4]的2D Tensor， **target_bbox** 的形与 **predicted_location** 相同，F是foreground anchors的数量。
+   - **predicted_scores** 是一个shape为[F + B，1]的2D Tensor， **target_label** 的形与 **predicted_scores** 的形相同，B是background anchors的数量，F和B取决于此算子的输入。
    - **Bbox_inside_weight** 标志着predicted_loction是否为fake_fg（假前景），其形为[F,4]。
 
 返回类型：        元组(tuple)
@@ -12678,9 +12682,9 @@ rpn_target_assign
         loc_pred, score_pred, loc_target, score_target, bbox_inside_weight=
                 fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
                         cls_logits=cls_logits, anchor_box=anchor_box, gt_boxes=gt_boxes)
-        
-        
-        
+
+
+
 
 
 
@@ -12690,11 +12694,11 @@ rpn_target_assign
 
 
 .. _cn_api_fluid_layers_ssd_loss:
-        
+
 ssd_loss
 -------------------------------
 
-.. py:function:: paddle.fluid.layers.ssd_loss(location, confidence, gt_box, gt_label, prior_box, prior_box_var=None, background_label=0, overlap_threshold=0.5, neg_pos_ratio=3.0, neg_overlap=0.5, loc_loss_weight=1.0, conf_loss_weight=1.0, match_type='per_prediction', mining_type='max_negative', normalize=True, sample_size=None) 
+.. py:function:: paddle.fluid.layers.ssd_loss(location, confidence, gt_box, gt_label, prior_box, prior_box_var=None, background_label=0, overlap_threshold=0.5, neg_pos_ratio=3.0, neg_overlap=0.5, loc_loss_weight=1.0, conf_loss_weight=1.0, match_type='per_prediction', mining_type='max_negative', normalize=True, sample_size=None)
 
 用于SSD的对象检测算法的多窗口损失层
 
@@ -12703,13 +12707,13 @@ ssd_loss
 1、通过二分匹配算法查找匹配的边界框。
 
         1.1、计算真实框与先验框之间的IOU相似度。
-        
+
         1.2、通过二分匹配算法计算匹配的边界框。
 
 2、计算难分样本的置信度
 
         2.1、根据匹配的索引获取目标标签。
-        
+
         2.2、计算置信度损失。
 
 3、应用实例挖掘来获取负示例索引并更新匹配的索引。
@@ -12717,19 +12721,19 @@ ssd_loss
 4、分配分类和回归目标
 
         4.1、根据前面的框编码bbox。
-        
+
         4.2、分配回归目标。
-        
+
         4.3、分配分类目标。
-        
+
 5、计算总体客观损失。
 
         5.1计算置信度损失。
-        
+
         5.1计算本地化损失。
-        
+
         5.3计算总体加权损失。
-        
+
 参数：
         - **location** （Variable）- 位置预测是具有形状[N，Np，4]的3D张量，N是批量大小，Np是每个实例的预测总数。 4是坐标值的数量，布局是[xmin，ymin，xmax，ymax]。
         - **confidence**  (Variable) - 置信度预测是具有形状[N，Np，C]，N和Np的3D张量，它们与位置相同，C是类号。
@@ -12773,7 +12777,7 @@ ssd_loss
          gt_label = fluid.layers.data(
                  name='gt_label', shape=[1], lod_level=1, dtype='float32')
          loss = fluid.layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
-        
+
 
 
 
@@ -12951,7 +12955,7 @@ yolov3_loss
          $$
          loss = (loss_{xy} + loss_{wh}) * weight_{box} + loss_{conf} + loss_{class}
          $$
-         
+
 
 当 ``use_label_smooth`` 设置为 ``True`` 时，在计算分类损失时将平滑分类目标，将正样本的目标平滑到1.0-1.0 / class_num，并将负样本的目标平滑到1.0 / class_num。
 
@@ -12977,7 +12981,7 @@ yolov3_loss
 
 返回类型:   变量（Variable）
 
-抛出异常: 
+抛出异常:
     - ``TypeError``  – yolov3_loss的输入x必须是Variable
     - ``TypeError``  – 输入yolov3_loss的gtbox必须是Variable
     - ``TypeError``  – 输入yolov3_loss的gtlabel必须是None或Variable
@@ -13010,7 +13014,7 @@ yolov3_loss
 
 
 ============
- metric_op 
+ metric_op
 ============
 
 
diff --git a/doc/fluid/api_guides/high_low_level_api.md b/doc/fluid/api_guides/high_low_level_api.md
deleted file mode 100644
index b44836540..000000000
--- a/doc/fluid/api_guides/high_low_level_api.md
+++ /dev/null
@@ -1,14 +0,0 @@
-## High/Low-level API简介
-
-PaddlePaddle Fluid目前有2套API接口：
-
-- Low-level（底层） API：
-	
-	- 灵活性强并且已经相对成熟，使用它训练的模型，能直接支持C++预测上线。
-	- 提供了大量的模型作为使用示例，包括[Book](https://github.com/PaddlePaddle/book)中的全部章节，以及[models](https://github.com/PaddlePaddle/models)中的所有章节。
-	- 适用人群：对深度学习有一定了解，需要自定义网络进行训练/预测/上线部署的用户。
-
-- High-level（高层）API：
-	
-	- 使用简单
-	- 尚未成熟，接口暂时在[paddle.fluid.contrib](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/contrib)下面。
diff --git a/doc/fluid/api_guides/high_low_level_api_en.md b/doc/fluid/api_guides/high_low_level_api_en.md
deleted file mode 100644
index 035a5b29f..000000000
--- a/doc/fluid/api_guides/high_low_level_api_en.md
+++ /dev/null
@@ -1,14 +0,0 @@
-## Introduction to High/Low-level API
-
-Currently PaddlePaddle Fluid has 2 branches of API interfaces:
-
-- Low-level API:
-
-	- It is highly flexible and relatively mature. The model trained by it can directly support C++ inference deployment and release.
-	- There are a large number of models as examples, including all chapters in [book](https://github.com/PaddlePaddle/book), and [models](https://github.com/PaddlePaddle/models).
-	- Recommended for users who have a certain understanding of deep learning and need to customize a network for training/inference/online deployment.
-
-- High-level API:
-
-	- Simple to use
-    - Still under development. the interface is temporarily in [paddle.fluid.contrib](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/contrib).
\ No newline at end of file
diff --git a/doc/fluid/api_guides/index.rst b/doc/fluid/api_guides/index.rst
index f8ce32af9..2b10c21e3 100755
--- a/doc/fluid/api_guides/index.rst
+++ b/doc/fluid/api_guides/index.rst
@@ -1,23 +1,22 @@
 ===========
-API使用指南
+API快速检索
 ===========
 
-API使用指南分功能向您介绍PaddlePaddle Fluid的API体系和用法，帮助您快速了解PaddlePaddle Fluid API的全貌，包括以下几个模块：
+API快速检索分功能向您介绍PaddlePaddle Fluid的API体系和用法，帮助您快速了解PaddlePaddle Fluid API的全貌，包括以下几个模块：
 
 ..  toctree::
     :maxdepth: 1
 
-    high_low_level_api.md
+    low_level/program.rst
     low_level/layers/index.rst
-    low_level/executor.rst
+    low_level/nets.rst
     low_level/optimizer.rst
+    low_level/backward.rst
     low_level/metrics.rst
     low_level/model_save_reader.rst
     low_level/inference.rst
-    low_level/distributed/index.rst
     low_level/memory_optimize.rst
-    low_level/nets.rst
+    low_level/executor.rst
     low_level/parallel_executor.rst
-    low_level/backward.rst
     low_level/parameter.rst
-    low_level/program.rst
+    low_level/distributed/index.rst
diff --git a/doc/fluid/api_guides/index_en.rst b/doc/fluid/api_guides/index_en.rst
index fa24aa36d..f358f887a 100755
--- a/doc/fluid/api_guides/index_en.rst
+++ b/doc/fluid/api_guides/index_en.rst
@@ -1,24 +1,22 @@
-===========
-API Guides
-===========
+=================
+API Quick Search
+=================
 
 This section introduces the Fluid API structure and usage, to help you quickly get the full picture of the PaddlePaddle Fluid API. This section is divided into the following modules:
 
 ..  toctree::
     :maxdepth: 1
 
-    high_low_level_api_en.md
+    low_level/program_en.rst
     low_level/layers/index_en.rst
-    low_level/executor_en.rst
+    low_level/nets_en.rst
     low_level/optimizer_en.rst
+    low_level/backward_en.rst
     low_level/metrics_en.rst
     low_level/model_save_reader_en.rst
     low_level/inference_en.rst
-    low_level/distributed/index_en.rst
     low_level/memory_optimize_en.rst
-    low_level/nets_en.rst
+    low_level/executor_en.rst
     low_level/parallel_executor_en.rst
-    low_level/compiled_program_en.rst
-    low_level/backward_en.rst
     low_level/parameter_en.rst
-    low_level/program_en.rst
+    low_level/distributed/index_en.rst
diff --git a/doc/fluid/api_guides/low_level/distributed/async_training.rst b/doc/fluid/api_guides/low_level/distributed/async_training.rst
index 07ad3ec91..0a360e40c 100644
--- a/doc/fluid/api_guides/low_level/distributed/async_training.rst
+++ b/doc/fluid/api_guides/low_level/distributed/async_training.rst
@@ -20,13 +20,13 @@ API详细使用方法参考 :ref:`cn_api_fluid_DistributeTranspiler` ，简单
     # 配置策略config
     config.slice_var_up = False
     t = fluid.DistributedTranspiler(config=config)
-    t.transpile(trainer_id, 
+    t.transpile(trainer_id,
                 program=main_program,
                 pservers="192.168.0.1:6174,192.168.0.2:6174",
                 trainers=1,
                 sync_mode=False)
 
-以上参数说明请参考 `同步训练 <../distributed/sync_training.html>`_ 
+以上参数说明请参考 `同步训练 <../distributed/sync_training.html>`_
 
 需要注意的是：进行异步训练时，请修改 :code:`sync_mode` 的值
 
diff --git a/doc/fluid/api_guides/low_level/distributed/async_training_en.rst b/doc/fluid/api_guides/low_level/distributed/async_training_en.rst
index d8d1b917d..d26465591 100644
--- a/doc/fluid/api_guides/low_level/distributed/async_training_en.rst
+++ b/doc/fluid/api_guides/low_level/distributed/async_training_en.rst
@@ -4,21 +4,21 @@
 Asynchronous Distributed Training
 ####################################
 
-Fluid supports parallelism asynchronous distributed training. :code:`DistributedTranspiler` converts a single node network configuration into a :code:`pserver` side program and the :code:`trainer` side program that can be executed on multiple machines. The user executes the same piece of code on different nodes. Depending on the environment variables or startup parameters, the corresponding :code:`pserver` or :code:`trainer` role can be executed. 
+Fluid supports data-parallel asynchronous distributed training. :code:`DistributeTranspiler` converts a single-node network configuration into a :code:`pserver` side program and a :code:`trainer` side program that can be executed on multiple machines. The user executes the same piece of code on different nodes. Depending on the environment variables or startup parameters, the corresponding :code:`pserver` or :code:`trainer` role can be executed.
 
 **Asynchronous distributed training in Fluid only supports the pserver mode** . The main difference between asynchronous training and `synchronous training <../distributed/sync_training_en.html>`_ is that the gradients of each trainer are asynchronously applied on the parameters, but in synchronous training, the gradients of all trainers must be combined first and then they are used to update the parameters. Therefore, the hyperparameters of synchronous training and asynchronous training need to be adjusted separately.
 
-Asynchronous distributed training in Pserver mode 
+Asynchronous distributed training in Pserver mode
 ==================================================
 
 For detailed API, please refer to :ref:`api_fluid_transpiler_DistributeTranspiler` . A simple example:
 
 .. code-block:: python
 
-	config = fluid.DistributedTranspilerConfig()
-	#Configuring config policy 
+	config = fluid.DistributeTranspilerConfig()
+	#Configuring config policy
 	config.slice_var_up = False
-	t = fluid.DistributedTranspiler(config=config)
+	t = fluid.DistributeTranspiler(config=config)
 	t.transpile(trainer_id,
 				program=main_program,
 				pservers="192.168.0.1:6174,192.168.0.2:6174",
diff --git a/doc/fluid/api_guides/low_level/distributed/index.rst b/doc/fluid/api_guides/low_level/distributed/index.rst
index d689921f1..40026707b 100644
--- a/doc/fluid/api_guides/low_level/distributed/index.rst
+++ b/doc/fluid/api_guides/low_level/distributed/index.rst
@@ -7,8 +7,5 @@
 
     sync_training.rst
     async_training.rst
-    cpu_train_best_practice.rst
     large_scale_sparse_feature_training.rst
     cluster_train_data_cn.rst
-
-
diff --git a/doc/fluid/api_guides/low_level/distributed/index_en.rst b/doc/fluid/api_guides/low_level/distributed/index_en.rst
index 593899fc2..a77fb4737 100644
--- a/doc/fluid/api_guides/low_level/distributed/index_en.rst
+++ b/doc/fluid/api_guides/low_level/distributed/index_en.rst
@@ -7,7 +7,6 @@ Distributed Training
 
     sync_training_en.rst
     async_training_en.rst
-    cpu_train_best_practice_en.rst
     large_scale_sparse_feature_training_en.rst
     cluster_train_data_en.rst
 
diff --git a/doc/fluid/api_guides/low_level/distributed/sync_training.rst b/doc/fluid/api_guides/low_level/distributed/sync_training.rst
index b64c37ac8..1ea58c397 100644
--- a/doc/fluid/api_guides/low_level/distributed/sync_training.rst
+++ b/doc/fluid/api_guides/low_level/distributed/sync_training.rst
@@ -4,7 +4,7 @@
 分布式同步训练
 ############
 
-Fluid支持数据并行的分布式同步训练，API使用 :code:`DistributedTranspiler` 将单机网络配置转换成可以多机执行的
+Fluid支持数据并行的分布式同步训练，API使用 :code:`DistributeTranspiler` 将单机网络配置转换成可以多机执行的
 :code:`pserver` 端程序和 :code:`trainer` 端程序。用户在不同的节点执行相同的一段代码，根据环境变量或启动参数，
 可以执行对应的 :code:`pserver` 或 :code:`trainer` 角色。Fluid分布式同步训练同时支持pserver模式和NCCL2模式，
 在API使用上有差别，需要注意。
@@ -16,11 +16,11 @@ API详细使用方法参考 :ref:`DistributeTranspiler` ，简单实例用法：
 
 .. code-block:: python
 
-    config = fluid.DistributedTranspilerConfig()
+    config = fluid.DistributeTranspilerConfig()
     # 配置策略config
     config.slice_var_up = False
-    t = fluid.DistributedTranspiler(config=config)
-    t.transpile(trainer_id, 
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id,
                 program=main_program,
                 pservers="192.168.0.1:6174,192.168.0.2:6174",
                 trainers=1,
@@ -68,8 +68,8 @@ NCCL2模式分布式训练
 
     config = fluid.DistributeTranspilerConfig()
     config.mode = "nccl2"
-    t = fluid.DistributedTranspiler(config=config)
-    t.transpile(trainer_id, 
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id,
                 program=main_program,
                 startup_program=startup_program,
                 trainers="192.168.0.1:6174,192.168.0.2:6174",
diff --git a/doc/fluid/api_guides/low_level/distributed/sync_training_en.rst b/doc/fluid/api_guides/low_level/distributed/sync_training_en.rst
index dd26664e5..6b8cd7343 100644
--- a/doc/fluid/api_guides/low_level/distributed/sync_training_en.rst
+++ b/doc/fluid/api_guides/low_level/distributed/sync_training_en.rst
@@ -4,19 +4,19 @@
 Synchronous Distributed Training
 ####################################
 
-Fluid supports parallelism distributed synchronous training, the API uses the :code:`DistributedTranspiler` to convert a single node network configuration into a :code:`pserver` side and :code:`trainer` side program that can be executed on multiple machines. The user executes the same piece of code on different nodes. Depending on the environment variables or startup parameters, you can execute the corresponding :code:`pserver` or :code:`trainer` role. Fluid distributed synchronous training supports both pserver mode and NCCL2 mode. There are differences in the use of the API, to which you need to pay attention.
+Fluid supports data-parallel distributed synchronous training. The API uses :code:`DistributeTranspiler` to convert a single-node network configuration into :code:`pserver` side and :code:`trainer` side programs that can be executed on multiple machines. The user executes the same piece of code on different nodes. Depending on the environment variables or startup parameters, you can execute the corresponding :code:`pserver` or :code:`trainer` role. Fluid distributed synchronous training supports both pserver mode and NCCL2 mode. There are differences in the use of the API, to which you need to pay attention.
 
-Distributed training in pserver mode 
+Distributed training in pserver mode
 ======================================
 
 For API Reference, please refer to :ref:`DistributeTranspiler`. A simple example :
 
 .. code-block:: python
 
-	config = fluid.DistributedTranspilerConfig()
+	config = fluid.DistributeTranspilerConfig()
 	#Configuring policy config
 	config.slice_var_up = False
-	t = fluid.DistributedTranspiler(config=config)
+	t = fluid.DistributeTranspiler(config=config)
 	t.transpile(trainer_id,
 				program=main_program,
 				pservers="192.168.0.1:6174,192.168.0.2:6174",
@@ -51,7 +51,7 @@ Configuration for general environment variables:
 - :code:`FLAGS_rpc_deadline` : int, the longest waiting time for RPC communication, in milliseconds, default 180000
 
 
-Distributed training in NCCL2 mode 
+Distributed training in NCCL2 mode
 ====================================
 
 The multi-node synchronous training mode based on NCCL2 (Collective Communication) is only supported in the GPU cluster.
@@ -65,7 +65,7 @@ Use the following code to convert the current :code:`Program` to a Fluid :code:`
 
 	Config = fluid.DistributeTranspilerConfig()
 	Config.mode = "nccl2"
-	t = fluid.DistributedTranspiler(config=config)
+	t = fluid.DistributeTranspiler(config=config)
 	t.transpile(trainer_id,
 				program=main_program,
 				startup_program=startup_program,
diff --git a/doc/fluid/api_guides/low_level/program.rst b/doc/fluid/api_guides/low_level/program.rst
index 41e571c67..4372bde27 100644
--- a/doc/fluid/api_guides/low_level/program.rst
+++ b/doc/fluid/api_guides/low_level/program.rst
@@ -1,8 +1,8 @@
 .. _api_guide_Program:
 
-###############################
-Program/Block/Operator/Variable
-###############################
+#########
+基础概念
+#########
 
 ==================
 Program
@@ -13,13 +13,13 @@ Program
 
 总得来说：
 
-* 一个模型是一个 Fluid :code:`Program` ,一个模型可以含有多于一个 :code:`Program` ； 
+* 一个模型是一个 Fluid :code:`Program` ,一个模型可以含有多于一个 :code:`Program` ；
 
 * :code:`Program` 由嵌套的 :code:`Block` 构成，:code:`Block` 的概念可以类比到 C++ 或是 Java 中的一对大括号，或是 Python 语言中的一个缩进块；
 
 * :code:`Block` 中的计算由顺序执行、条件选择或者循环执行三种方式组合，构成复杂的计算逻辑；
 
-* :code:`Block` 中包含对计算和计算对象的描述。计算的描述称之为 Operator；计算作用的对象（或者说 Operator 的输入和输出）被统一为 Tensor，在Fluid中，Tensor 用层级为0的 `LoD-Tensor <http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/prepare_data/lod_tensor.html#permalink-4-lod-tensor>`_ 表示。 
+* :code:`Block` 中包含对计算和计算对象的描述。计算的描述称之为 Operator；计算作用的对象（或者说 Operator 的输入和输出）被统一为 Tensor，在Fluid中，Tensor 用层级为0的 `LoD-Tensor <http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/prepare_data/lod_tensor.html#permalink-4-lod-tensor>`_ 表示。
 
 
 
@@ -37,7 +37,7 @@ Block
 +----------------------+-------------------------+
 | if-else, switch      | IfElseOp, SwitchOp      |
 +----------------------+-------------------------+
-| 顺序执行              | 一系列 layers            | 
+| 顺序执行              | 一系列 layers            |
 +----------------------+-------------------------+
 
 如上文所说，Fluid 中的 :code:`Block` 描述了一组以顺序、选择或是循环执行的 Operator 以及 Operator 操作的对象：Tensor。
@@ -54,7 +54,7 @@ Operator
 这是因为一些常见的对 Tensor 的操作可能是由更多基础操作构成，为了提高使用的便利性，框架内部对基础 Operator 进行了一些封装，包括创建 Operator 依赖可学习参数，可学习参数的初始化细节等，减少用户重复开发的成本。
 
 
-更多内容可参考阅读 `Fluid设计思想 <../../advanced_usage/design_idea/fluid_design_idea.html>`_ 
+更多内容可参考阅读 `Fluid设计思想 <../../advanced_usage/design_idea/fluid_design_idea.html>`_
 
 
 =========
@@ -78,4 +78,4 @@ Fluid 中的 :code:`Variable` 可以包含任何类型的值———在大多
 
 * 用户还可以使用 :ref:`cn_api_fluid_program_guard` 配合 :code:`with` 语句，修改配置好的 :ref:`cn_api_fluid_default_startup_program` 和 :ref:`cn_api_fluid_default_main_program` 。
 
-* 在Fluid中，Block内部执行顺序由控制流决定，如 :ref:`cn_api_fluid_layers_IfElse` , :ref:`cn_api_fluid_layers_While`, :ref:`cn_api_fluid_layers_Switch` 等，更多内容可参考： :ref:`api_guide_control_flow` 
+* 在Fluid中，Block内部执行顺序由控制流决定，如 :ref:`cn_api_fluid_layers_IfElse` , :ref:`cn_api_fluid_layers_While`, :ref:`cn_api_fluid_layers_Switch` 等，更多内容可参考： :ref:`api_guide_control_flow`
diff --git a/doc/fluid/api_guides/low_level/program_en.rst b/doc/fluid/api_guides/low_level/program_en.rst
index f4627a9cc..d926336a8 100644
--- a/doc/fluid/api_guides/low_level/program_en.rst
+++ b/doc/fluid/api_guides/low_level/program_en.rst
@@ -1,8 +1,8 @@
 .. _api_guide_Program_en:
 
-###############################
-Program/Block/Operator/Variable
-###############################
+###############
+Basic Concepts
+###############
 
 ==================
 Program
@@ -36,7 +36,7 @@ Block
 +----------------------+-------------------------+
 | if-else, switch      | IfElseOp, SwitchOp      |
 +----------------------+-------------------------+
-| execute sequentially | a series of layers      | 
+| execute sequentially | a series of layers      |
 +----------------------+-------------------------+
 
 As mentioned above,  :code:`Block` in Fluid describes a set of Operators that include sequential execution, conditional selection or loop execution, and the operating object of Operator: Tensor.
@@ -53,7 +53,7 @@ This is because some common operations on Tensor may consist of more basic opera
 
 
 
-More information can be read for reference. `Fluid Design Idea <../../advanced_usage/design_idea/fluid_design_idea.html>`_ 
+For more information, please refer to `Fluid Design Idea <../../advanced_usage/design_idea/fluid_design_idea.html>`_
 
 
 =========
@@ -75,4 +75,4 @@ Related API
 * Users can also use :ref:`api_fluid_program_guard` with :code:`with` to modify the configured :ref:`api_fluid_default_startup_program` and :ref:`api_fluid_default_main_program` .
 
 
-* In Fluid，the execution order in a Block is determined by control flow，such as :ref:`api_fluid_layers_IfElse` , :ref:`api_fluid_layers_While` and :ref:`api_fluid_layers_Switch` . For more information, please refer to： :ref:`api_guide_control_flow_en` 
+* In Fluid, the execution order in a Block is determined by control flow, such as :ref:`api_fluid_layers_IfElse` , :ref:`api_fluid_layers_While` and :ref:`api_fluid_layers_Switch` . For more information, please refer to: :ref:`api_guide_control_flow_en`
diff --git a/doc/fluid/beginners_guide/install/compile/compile_MacOS.md b/doc/fluid/beginners_guide/install/compile/compile_MacOS.md
index 1ca671b86..39b4f5bb6 100644
--- a/doc/fluid/beginners_guide/install/compile/compile_MacOS.md
+++ b/doc/fluid/beginners_guide/install/compile/compile_MacOS.md
@@ -186,6 +186,7 @@
 			For Python2: cmake .. -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF  -DCMAKE_BUILD_TYPE=Release
 			For Python3: cmake .. -DPY_VERSION=3.5 -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIRS} \
 			 -DPYTHON_LIBRARY=${PYTHON_LIBRARY} -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF  -DCMAKE_BUILD_TYPE=Release
+
 	>`-DPY_VERSION=3.5`请修改为安装环境的Python版本
 
 10. 使用以下命令来编译：
diff --git a/doc/fluid/beginners_guide/install/compile/compile_MacOS_en.md b/doc/fluid/beginners_guide/install/compile/compile_MacOS_en.md
index 4705a9bed..ffef18edd 100644
--- a/doc/fluid/beginners_guide/install/compile/compile_MacOS_en.md
+++ b/doc/fluid/beginners_guide/install/compile/compile_MacOS_en.md
@@ -121,7 +121,7 @@ Congratulations, you have now completed the process of compiling PaddlePaddle us
 
 4. (Only For Python3) Set Python-related environment variables:
 
-	- a. First use 
+	- a. First use
 			```find `dirname $(dirname
 			  $(which python3))` -name "libpython3.*.dylib"```
 			to find the path to Pythonlib (the first one it prompts is the dylib path for the python you need to use), then (below [python-lib-path] is replaced by finding the file path)
@@ -148,7 +148,7 @@ Congratulations, you have now completed the process of compiling PaddlePaddle us
 		Since we are using CMake3.4 please follow the steps below:
 
 		1. Download the CMake image from the [official CMake website](https://cmake.org/files/v3.4/cmake-3.4.3-Darwin-x86_64.dmg) and install it.
-	
+
 		2. Enter `sudo "/Applications/CMake.app/Contents/bin/cmake-gui" –install` in the console
 
 	- b. If you do not want to use the system default blas and want to use your own installed OPENBLAS please read [FAQ](../FAQ.html/#OPENBLAS)
diff --git a/doc/fluid/beginners_guide/programming_guide/programming_guide.md b/doc/fluid/beginners_guide/programming_guide/programming_guide.md
index 86d5eeee7..5baa35490 100644
--- a/doc/fluid/beginners_guide/programming_guide/programming_guide.md
+++ b/doc/fluid/beginners_guide/programming_guide/programming_guide.md
@@ -236,7 +236,7 @@ Fluid的设计思想类似于高级编程语言C++和JAVA等。程序的执行
 #定义Exector
 cpu = fluid.core.CPUPlace() #定义运算场所，这里选择在CPU下训练
 exe = fluid.Executor(cpu) #创建执行器
-exe.run(fluid.default_startup_program()) #初始化Program
+exe.run(fluid.default_startup_program()) #用来进行初始化的program
 
 #训练Program，开始计算
 #feed以字典的形式定义了数据传入网络的顺序
-- 
GitLab