Merge pull request #1 from PaddlePaddle/develop

update

Merge pull request #1 from PaddlePaddle/develop
update
cb4f4929 · lujun · GitHub · 37fd49da · 97dcf6e5 · cb4f4929
176 changed file
--- a/.gitignore
+++ b/.gitignore
+.vscode/
--- a/.gitmodules
+++ b/.gitmodules
-[submodule "external/Paddle"]
-	path = external/Paddle
-	url = https://github.com/PaddlePaddle/Paddle
 [submodule "external/book"]
 	path = external/book
 	url = https://github.com/PaddlePaddle/book
+[submodule "external/Anakin"]
+	path = external/Anakin
+	url = https://github.com/PaddlePaddle/Anakin
+[submodule "external/paddle-mobile"]
+	path = external/paddle-mobile
+	url = https://github.com/PaddlePaddle/paddle-mobile
+[submodule "external/Paddle"]
+	path = external/Paddle
+	url = https://github.com/PaddlePaddle/Paddle
 [submodule "external/models"]
 	path = external/models
 	url = https://github.com/PaddlePaddle/models
--- a/doc/fluid/advanced_usage/deploy/anakin_arm_benchmark.md
+++ b/doc/fluid/advanced_usage/deploy/anakin_arm_benchmark.md
--- a/doc/fluid/advanced_usage/deploy/anakin_example.md
+++ b/doc/fluid/advanced_usage/deploy/anakin_example.md
--- a/doc/fluid/advanced_usage/deploy/anakin_gpu_benchmark.md
+++ b/doc/fluid/advanced_usage/deploy/anakin_gpu_benchmark.md
@@ -30,21 +30,19 @@

 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 8.8690 | 8.2815 |
-| 2 | 15.5344 | 13.9116 |
-| 4 | 26.6000 | 21.8747 |
-| 8 | 49.8279 | 40.4076 |
-| 32 | 188.6270 | 163.7660 |
+| 1 | 8.53945 | 8.18737 |
+| 2 | 14.2269 | 13.8976 |
+| 4 | 24.2803 | 21.7976 |
+| 8 | 45.6003 | 40.319 |

 - GPU Memory Used (`MB`)

 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 963 | 997 |
-| 2 | 965 | 1039 |
-| 4 | 991 | 1115 |
-| 8 | 1067 | 1269 |
-| 32 | 1715 | 2193 |
+| 1 | 1053.88 | 762.73 |
+| 2 | 1055.71 | 762.41 |
+| 4 | 1003.22 | 832.75 |
+| 8 | 1108.77 | 926.9 |


 ### <span id = '2'>Yolo </span>
@@ -53,21 +51,19 @@

 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 16.4596| 15.2124 |
-| 2 | 26.6347| 25.0442 |
-| 4 | 43.3695| 43.5017 |
-| 8 | 80.9139 | 80.9880 |
-| 32 | 293.8080| 310.8810 |
+| 1 | 8.41606| 7.07977 |
+| 2 | 16.6588| 15.2216 |
+| 4 | 31.9955| 30.5102 |
+| 8 | 66.1107 | 64.3658 |

 - GPU Memory Used (`MB`)

 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 1569 | 1775 |
-| 2 | 1649 | 1815 |
-| 4 | 1709 | 1887 |
-| 8 | 1731 | 2031 |
-| 32 | 2253 | 2907 |
+| 1 | 1054.71  | 299.8 |
+| 2 | 951.51  | 347.47 |
+| 4 | 846.9  | 438.47 |
+| 8 | 1042.31  | 515.15 |

 ### <span id = '3'> Resnet50 </span>

@@ -75,21 +71,19 @@

 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 4.2459   |  4.1061 |
-| 2 |  6.2627  |  6.5159 |
-| 4 | 10.1277  | 11.3327 |
-| 8 | 17.8209  | 20.6680 |
-| 32 | 65.8582 | 77.8858 |
+| 1 | 4.10063  |  3.33845 |
+| 2 |  6.10941 |  5.54814 |
+| 4 | 9.90233  | 10.2763 |
+| 8 | 17.3287  |   20.0783 |

 - GPU Memory Used (`MB`)

 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 531  | 503 |
-| 2 | 543  | 517 |
-| 4 | 583 | 541 |
-| 8 | 611 | 589 |
-| 32 |  809 | 879 |
+| 1 | 1059.15 | 299.86 |
+| 2 | 1077.8  | 340.78 |
+| 4 | 903.04  | 395 |
+| 8 | 832.53  | 508.86 |

 ### <span id = '4'> Resnet101 </span>

@@ -97,21 +91,19 @@

 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 7.5562 | 7.0837 |
-| 2 | 11.6023 | 11.4079 |
-| 4 | 18.3650 | 20.0493 |
-| 8 | 32.7632 | 36.0648 |
-| 32 | 123.2550 | 135.4880 |
+| 1 | 7.29828 | 5.672 |
+| 2 | 11.2037 | 9.42352 |
+| 4 | 17.9306 | 18.0936 |
+| 8 | 31.4804 | 35.7439 |

 - GPU Memory Used (`MB)`

 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 701  | 683 |
-| 2 | 713  | 697 |
-| 4 | 793 | 721 |
-| 8 | 819 | 769 |
-| 32 | 1043 | 1059 |
+| 1 | 1161.94 | 429.22 |
+| 2 | 1190.92 | 531.92 |
+| 4 | 994.11  | 549.7 |
+| 8 | 945.47  | 653.06 |

 ###  <span id = '5'> MobileNet V1 </span>

@@ -119,21 +111,19 @@

 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 45.5156  |  1.3947 |
-| 2 |  46.5585  |  2.5483 |
-| 4 | 48.4242  | 4.3404 |
-| 8 |  52.7957 |  8.1513 |
-| 32 | 83.2519 | 31.3178 |
+| 1 | 1.52692  |  1.39282 |
+| 2 |  1.98091  |  2.05788 |
+| 4 | 3.2705  | 4.03476 |
+| 8 |  5.15652 |  7.06651 |

 - GPU Memory Used (`MB`)

 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 329  | 283 |
-| 2 | 345  | 289 |
-| 4 | 371 | 299 |
-| 8 | 393 | 319 |
-| 32 |  531 | 433 |
+| 1 | 1144.35   | 99.6 |
+| 2 | 1160.03    | 199.75 |
+| 4 | 1098  | 184.33 |
+| 8 | 990.71  | 232.11 |

 ###  <span id = '6'> MobileNet V2</span>

@@ -141,21 +131,20 @@

 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 65.6861 | 2.9842 |
-| 2 | 66.6814 | 4.7472 |
-| 4 | 69.7114 | 7.4163 |
-| 8 | 76.1092 | 12.8779 |
-| 32 | 124.9810 | 47.2142 |
+| 1 | 1.95961 | 1.78249 |
+| 2 | 2.8709 | 3.01144 |
+| 4 | 4.46131 | 5.43946 |
+| 8 | 7.161 | 10.2081 |

 - GPU Memory Used (`MB`)

 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 341 | 293 |
-| 2 | 353 | 301 |
-| 4 | 385 | 319 |
-| 8 | 421 | 351 |
-| 32 | 637 | 551 |
+| 1 | 1154.69 | 195.25 |
+| 2 | 1187.25 | 227.6 |
+| 4 | 1053 | 241.75 |
+| 8 | 1062.48 | 352.18 |
+

 ## How to run those Benchmark models


--- a/doc/fluid/advanced_usage/deploy/anakin_parser_design.md
+++ b/doc/fluid/advanced_usage/deploy/anakin_parser_design.md
--- a/doc/fluid/advanced_usage/deploy/anakin_run_on_arm.md
+++ b/doc/fluid/advanced_usage/deploy/anakin_run_on_arm.md
--- a/doc/fluid/advanced_usage/deploy/anakin_tutorial.md
+++ b/doc/fluid/advanced_usage/deploy/anakin_tutorial.md
@@ -114,64 +114,67 @@ Anakin中数据类型与基本数据类型的对应如下:

  理论上，Anakin支持申明1维以上的tensor，但是对于Anakin中的Op来说，只支持NW、NHW、NCHW、NCHW_C4这四种LayOut，其中NCHW是默认的LayOuteType，NCHW_C4是专门针对于int8这种数据类型的。

-  **例子：**
+  例子

-下面的代码将展示如何使用tensor， 我们建议先看看这些示例。
+    下面的代码将展示如何使用tensor， 我们建议先看看这些示例。

-要想获得更多关于tensor的信息， 请参考 *soure_path/core/tensor.h*
+    要想获得更多关于tensor的信息， 请参考 *source_path/core/tensor.h*

-1. 使用shape对象初始化tensor
+    > 1. 使用shape对象初始化tensor

-    ```cpp
-    //create a null tensor. A null tensor holds for nothing.
-    //tensor's buffer  is resident at CPU and its datatype is AK_FLOAT.
-    //tensor's Layout is NCHW(default)
-    Tensor<X86, AK_FLOAT> mytensor;
+    ```c++
+      //create a null tensor. A null tensor holds for nothing.
+      //tensor's buffer  is resident at CPU and its datatype is AK_FLOAT.
+      //tensor's Layout is NCHW(default)
+      Tensor<X86, AK_FLOAT> mytensor;

-    //1. using shape object to create a tensor.
-    Shape shape1(NUM); //1-D shape. NUM is the number of dimention.
-    Tensor<X86, AK_FLOAT, W> mytensor1(shape1); //1-D tensor.
+      //1. using shape object to create a tensor.
+      Shape shape1(NUM); //1-D shape. NUM is the number of dimension.
+      Tensor<X86, AK_FLOAT, W> mytensor1(shape1); //1-D tensor.

-    // A 4-D shape
-    Shape shape2(N, C, H, W); // batch x channel x height x width
+      // A 4-D shape
+      Shape shape2(N, C, H, W); // batch x channel x height x width
    ```

-    `注意：Shape的维度必须和tensor的`[LayoutType](#layout)`相同，比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW，否则会出错。如下列代码所示`
+    >`注意：Shape的维度必须和tensor的`[LayoutType](#layout)`相同，比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW，否则会出错。如下列代码所示`

    ```c++
-    // A 4-D tensor.
-    Tensor<X86, AK_FLOAT> mytensor2(shape2);  //right
+       // A 4-D tensor.
+       Tensor<X86, AK_FLOAT> mytensor2(shape2);  //right
+
+       //A 4-D tensor which is resident at GPU and its datatype is AK_INT8
+       Tensor<NV, AK_INT8> mytensor3(shape2);   //right

-    //A 4-D tensor which is resident at GPU and its datatype is AK_INT8
-    Tensor<NV, AK_INT8> mytensor3(shape2);   //right
+       Tensor<X86, AK_FLOAT, NHW> mytensor4(shape2); //wrong!! shape's dimension must be equal to tensor's Layout.
+       Tensor<NV, AK_FLOAT, NCHW_C4> mytensor5(shape2); //wrong!!!!

-    Tensor<X86, AK_FLOAT, NHW> mytensor4(shape2); //wrong!! shape's dimetion must be equal to tensor's Layout.
-    Tensor<NV, AK_FLOAT, NCHW_C4> mytensor5(shape2); //wrong!!!!
    ```

-2. 使用现有的数据和shape初始化tensor
+    > 2. 使用现有的数据和shape初始化tensor

    ```c++
-    /**
-    *  A construtor of Tensor.
-    *  data_ptr is a pointer to any data type of data
-    *  TargetType is type of a platform [Anakin TargetType]
-    *  id : device id
-    *  shape: a Anakin shape
-    */
-    Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape);

-    //using existing data feed to a tensor
-    Tensor<X86, AK_FLOAT> mytensor(data_ptr, TargetType, device_id, shape); //shape must has dimention (N, C, H, W).
+       /**
+       *  A constructor of Tensor.
+       *  data_ptr is a pointer to any data type of data
+       *  TargetType is type of a platform [Anakin TargetType]
+       *  id : device id
+       *  shape: a Anakin shape
+       */
+       Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape);
+
+       //using existing data feed to a tensor
+       Tensor<X86, AK_FLOAT> mytensor(data_ptr, TargetType, device_id, shape); //shape must have dimension (N, C, H, W).
+
    ```

-3. 使用tensor初始化tensor
+    > 3. 使用tensor初始化tensor

    ```c++
-    Tensor<NV, AK_FLOAT> tensor(exist_tensor);
+       Tensor<NV, AK_FLOAT> tensor(exist_tensor);
    ```

->提示： 你可以用` typedef Tensor<X86, AK_FLOAT> Tensor4d_X86 `方便定义tensor
+    > 提示： 你可以用` typedef Tensor<X86, AK_FLOAT> Tensor4d_X86 `方便定义tensor

 #### 填充tensor数据区


--- a/doc/fluid/advanced_usage/deploy/convert_paddle_to_anakin.md
+++ b/doc/fluid/advanced_usage/deploy/convert_paddle_to_anakin.md
--- a/doc/fluid/advanced_usage/deploy/how_to_add_anakin_op.md
+++ b/doc/fluid/advanced_usage/deploy/how_to_add_anakin_op.md
--- a/doc/fluid/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
+++ b/doc/fluid/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
--- a/doc/fluid/advanced_usage/deploy/index_anakin.rst
+++ b/doc/fluid/advanced_usage/deploy/index_anakin.rst
@@ -13,6 +13,7 @@ Anakin 预测引擎
   anakin_tutorial.md
   anakin_run_on_arm.md
   anakin_example.md
+   int8_design_anakin.md
   anakin_gpu_benchmark.md
   anakin_arm_benchmark.md


--- a/doc/fluid/advanced_usage/deploy/install_anakin.md
+++ b/doc/fluid/advanced_usage/deploy/install_anakin.md
--- a/doc/fluid/advanced_usage/deploy/run_anakin_on_arm.md
+++ b/doc/fluid/advanced_usage/deploy/run_anakin_on_arm.md
--- a/doc/fluid/advanced_usage/deploy/index_cn.rst
+++ b/doc/fluid/advanced_usage/deploy/index_cn.rst
+########
+预测部署
+########
+
+- `原生预测引擎 <inference/index_cn.html>`_ ：介绍了支持模型部署上线的Fluid C++ API
+
+- `移动端部署 <mobile/index_cn.html>`_：介绍了 PaddlePaddle组织下的嵌入式平台深度学习框架Paddle-Mobile
+
+..  toctree::
+    :hidden:
+
+    inference/index_cn.rst
+    mobile/index_cn.rst
--- a/doc/fluid/advanced_usage/deploy/index_mobile.rst
+++ b/doc/fluid/advanced_usage/deploy/index_mobile.rst
-移动端部署
-##########
-
-.. toctree::
-   :maxdepth: 2
-
-   mobile_readme.md
-   mobile_build.md
--- a/doc/fluid/user_guides/howto/inference/build_and_install_lib_cn.rst
+++ b/doc/fluid/user_guides/howto/inference/build_and_install_lib_cn.rst
@@ -22,18 +22,18 @@ cuda9.0_cudnn7_avx_mkl   `fluid_inference.tgz <https://guest:@paddleci.ngrok.io/
 ----------
 用户也可以从 PaddlePaddle 核心代码编译C++预测库，只需在编译时配制下面这些编译选项：

-=================   =========
-选项                 值   
-=================   =========
-CMAKE_BUILD_TYPE    Release
+============================  =========
+选项                           值   
+============================  =========
+CMAKE_BUILD_TYPE              Release
 FLUID_INFERENCE_INSTALL_DIR   安装路径    
-WITH_FLUID_ONLY     ON（推荐）
-WITH_SWIG_PY        OFF（推荐
-WITH_PYTHON         OFF（推荐）
-WITH_GPU            ON/OFF
-WITH_MKL            ON/OFF
-ON_INFER            ON（预测优化）
-=================   =========
+WITH_FLUID_ONLY               ON（推荐）
+WITH_SWIG_PY                  OFF（推荐）
+WITH_PYTHON                   OFF（推荐）
+ON_INFER                      ON（推荐）
+WITH_GPU                      ON/OFF
+WITH_MKL                      ON/OFF
+============================  =========

 建议按照推荐值设置，以避免链接不必要的库。其它可选编译选项按需进行设定。

@@ -67,7 +67,12 @@ ON_INFER            ON（预测优化）
     ├── CMakeCache.txt
     ├── paddle
     │   ├── include
-     │   │   └── paddle_inference_api.h
+     │   │   ├── paddle_anakin_config.h
+     │   │   ├── paddle_analysis_config.h
+     │   │   ├── paddle_api.h
+     │   │   ├── paddle_inference_api.h
+     │   │   ├── paddle_inference_pass.h
+     │   │   └── paddle_pass_builder.h
     │   └── lib
     │       ├── libpaddle_fluid.a
     │       └── libpaddle_fluid.so
@@ -80,10 +85,12 @@ ON_INFER            ON（预测优化）
     │   └── install
     │       ├── gflags
     │       ├── glog
+     │       ├── mkldnn
     │       ├── mklml
     │       ├── protobuf
     │       ├── snappy
     │       ├── snappystream
+     │       ├── xxhash
     │       └── zlib
     └── version.txt
     
@@ -91,9 +98,9 @@ version.txt 中记录了该预测库的版本信息，包括Git Commit ID、使

  .. code-block:: text

-     GIT COMMIT ID: 23da8defc8314b0c711130c1d9536e2cf2fb8414
+     GIT COMMIT ID: cc9028b90ef50a825a722c55e5fda4b7cd26b0d6
     WITH_MKL: ON
-     WITH_MKLDNN: OFF
+     WITH_MKLDNN: ON
     WITH_GPU: ON
     CUDA version: 8.0
     CUDNN version: v5
--- a/doc/fluid/user_guides/howto/inference/image/image1.png
+++ b/doc/fluid/user_guides/howto/inference/image/image1.png
--- a/doc/fluid/user_guides/howto/inference/image/image2.png
+++ b/doc/fluid/user_guides/howto/inference/image/image2.png
--- a/doc/fluid/user_guides/howto/inference/image/image3.png
+++ b/doc/fluid/user_guides/howto/inference/image/image3.png
--- a/doc/fluid/user_guides/howto/inference/image/image4.png
+++ b/doc/fluid/user_guides/howto/inference/image/image4.png
--- a/doc/fluid/user_guides/howto/inference/image/image5.png
+++ b/doc/fluid/user_guides/howto/inference/image/image5.png
--- a/doc/fluid/user_guides/howto/inference/image/image6.png
+++ b/doc/fluid/user_guides/howto/inference/image/image6.png
--- a/doc/fluid/user_guides/howto/inference/image/image7.png
+++ b/doc/fluid/user_guides/howto/inference/image/image7.png
--- a/doc/fluid/user_guides/howto/inference/image/image8.png
+++ b/doc/fluid/user_guides/howto/inference/image/image8.png
--- a/doc/fluid/user_guides/howto/inference/image/image9.png
+++ b/doc/fluid/user_guides/howto/inference/image/image9.png
--- a/doc/fluid/user_guides/howto/inference/image/model_graph_original.png
+++ b/doc/fluid/user_guides/howto/inference/image/model_graph_original.png
--- a/doc/fluid/user_guides/howto/inference/image/model_graph_trt.png
+++ b/doc/fluid/user_guides/howto/inference/image/model_graph_trt.png
--- a/doc/fluid/user_guides/howto/inference/index.rst
+++ b/doc/fluid/user_guides/howto/inference/index.rst
 ############
-预测部署
+原生预测引擎
 ############

 PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线

 .. toctree::
-   :maxdepth: 2
+   :hidden:

   build_and_install_lib_cn.rst
   native_infer.md
   paddle_tensorrt_infer.md
+   paddle_gpu_benchmark.md
   windows_cpp_inference.md
--- a/doc/fluid/user_guides/howto/inference/native_infer.md
+++ b/doc/fluid/user_guides/howto/inference/native_infer.md
-# Paddle 预测 API
+# C++ 预测 API介绍

 为了更简单方便的预测部署，Fluid 提供了一套高层 API 用来隐藏底层不同的优化实现。


--- a/doc/fluid/advanced_usage/deploy/inference/paddle_gpu_benchmark.md
+++ b/doc/fluid/advanced_usage/deploy/inference/paddle_gpu_benchmark.md
+# TensorRT库性能测试
+
+## 测试环境
+- CPU:Intel(R) Xeon(R) Gold 5117 CPU @ 2.00GHz GPU:Tesla P4
+- TensorRT4.0, CUDA8.0, CUDNNV7
+- 测试模型 ResNet50，MobileNet，ResNet101, Inception V3.
+
+## 测试对象
+**PaddlePaddle, Pytorch, Tensorflow**   
+
+- 在测试中，PaddlePaddle使用子图优化的方式集成了TensorRT, 模型[地址](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification/models)。
+- Pytorch使用了原生的实现, 模型[地址1](https://github.com/pytorch/vision/tree/master/torchvision/models)、[地址2](https://github.com/marvis/pytorch-mobilenet)。
+- 对TensorFlow测试包括了对TF的原生的测试，和对TF-TRT的测试，**对TF-TRT的测试并没有达到预期的效果，后期会对其进行补充**， 模型[地址](https://github.com/tensorflow/models)。
+
+
+### ResNet50 
+ 
+|batch_size|PaddlePaddle(ms)|Pytorch(ms)|TensorFlow(ms)|
+|---|---|---|---|
+|1|4.64117 |16.3|10.878|
+|5|6.90622| 22.9 |20.62|
+|10|7.9758 |40.6|34.36|
+
+### MobileNet
+|batch_size|PaddlePaddle(ms)|Pytorch(ms)|TensorFlow(ms)|
+|---|---|---|---|
+|1| 1.7541 | 7.8 |2.72|
+|5| 3.04666 | 7.8 |3.19|
+|10|4.19478 | 14.47 |4.25|
+
+### ResNet101
+|batch_size|PaddlePaddle(ms)|Pytorch(ms)|TensorFlow(ms)|
+|---|---|---|---|
+|1|8.95767| 22.48 |18.78|
+|5|12.9811 | 33.88 |34.84|
+|10|14.1463| 61.97 |57.94|
+
+
+### Inception v3
+|batch_size|PaddlePaddle(ms)|Pytorch(ms)|TensorFlow(ms)|
+|---|---|---|---|
+|1|15.1613 | 24.2 |19.1|
+|5|18.5373 | 34.8 |27.2|
+|10|19.2781| 54.8 |36.7|
+
+
+
+
--- a/doc/fluid/user_guides/howto/inference/paddle_tensorrt_infer.md
+++ b/doc/fluid/user_guides/howto/inference/paddle_tensorrt_infer.md
-# 使用Paddle TensorRT预测
+# 使用TensorRT库预测

 NVIDIA TensorRT 是一个高性能的深度学习预测库，可为深度学习推理应用程序提供低延迟和高吞吐量。Paddle 1.0 采用了子图的形式对TensorRT进行了初步集成，即我们可以使用该模块来提升Paddle模型的预测性能。该模块依旧在持续开发中，目前已支持的模型有：AlexNet, MobileNet, ResNet50, VGG19, ResNext, Se-ReNext, GoogleNet, DPN, ICNET, MobileNet-SSD等。在这篇文档中，我们将会对Paddle-TensorRT库的获取、使用和原理进行介绍。


--- a/doc/fluid/user_guides/howto/inference/windows_cpp_inference.md
+++ b/doc/fluid/user_guides/howto/inference/windows_cpp_inference.md
-Windows环境模型预测使用说明
+Windows环境模型预测
 ===========================

 环境部署
@@ -8,79 +8,50 @@ Windows环境模型预测使用说明

 测试环境硬件配置：

-| CPU   |      I7-8700K      |
-|----------|:-------------:|
-| 内存 |  16G |
-| 硬盘 |  1T hdd + 256G ssd |
-| 显卡 |  GTX1080 8G |
+| CPU      |      I7-8700K      |
+|:---------|:-------------------|
+| 内存 | 16G               |
+| 硬盘 | 1T hdd + 256G ssd |
+| 显卡 | GTX1080 8G        |

-测试环境操作系统使用win10 Version 18.03 版本。下载地址：
+测试环境操作系统使用 win10 家庭版本。

 ### 环境配置步骤

-**一定要严格按照安装步骤顺序，否则会安装失败！**
+**请您严格按照以下步骤进行安装，否则可能会导致安装失败！**

-**安装vs2015**
+**安装Visual Studio 2015 update3**

-安装vs2015，安装选项中选择安装内容时勾选自定义，把关于c，c++，vc++的功能都安装上。下载地址：
+安装Visual Studio 2015，安装选项中选择安装内容时勾选自定义，选择安装全部关于c，c++，vc++的功能。

-**安装CUDA8**
-
-需要去NVIDIA官网[https://www.geforce.cn/drivers](https://www.geforce.cn/drivers)
-下载显卡对应的驱动。推荐391版本
-<p align="center">
- <img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image1.png" >
-</p>
-安装时需要勾选自定义，勾选安装全部。
-
-验证安装需要进入cmd中，输入nvcc -V查看。
-<p align="center">
-<img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image2.png">
-</p>
-
-如果有显卡安装驱动，也可以选择直接安装CUDA8.0，[https://developer.nvidia.com/cuda-80-ga2-download-archive](https://developer.nvidia.com/cuda-80-ga2-download-archive)
-
-**安装CUDNN**
-
-安装CUDNN只需要将文件中CUDNN
-7下的文件复制到对应的CUDA安装目录下。文件名，cudnn-8.0-windows10-x64-v7.zip。这里提供了cudnn
-7
-64位的版本。需要其他版本可在[https://developer.nvidia.com/cudnn](https://developer.nvidia.com/cudnn)
-下载。

 预测demo使用
 ------------

-解压Paddle，Release，fluid\_install\_dir压缩包。
+解压Paddle，Release，fluid_install_dir压缩包。

-进入Paddle/paddle/fluid/inference/api/demo\_ci目录，新建build目录并进入，然后使用cmake生成vs2015的solution文件。
+进入Paddle/paddle/fluid/inference/api/demo_ci目录，新建build目录并进入，然后使用cmake生成vs2015的solution文件。
 指令为：
-```cmake
-cmake .. -G \"Visual Studio 14 2015 Win64\" -DWITH\_GPU=ON
-DWITH\_MKL=OFF -DWITH\_STATIC\_LIB=ON -DCMAKE\_BUILD\_TYPE=Release
-DDEMO\_NAME=simple\_on\_word2vec
-DPADDLE\_LIB=D:\\to\_the\_paddle\_fluid.lib
-DCUDA\_LIB=D:\\CUDA\\v8.0\\lib\\x64
-```
+
+`cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=simple_on_word2vec -DPADDLE_LIB=path_to_the_paddle\paddle_fluid.lib`

 注：

-DDEMO\_NAME 是要编译的文件
+-DDEMO_NAME 是要编译的文件

-DPADDLE\_LIB 是fluid\_install\_dir路径，例如
-DPADDLE\_LIB=D:\\fluid\_install\_dir
+-DPADDLE_LIB 是fluid_install_dir路径，例如
+-DPADDLE_LIB=D:\fluid_install_dir

-DCUDA\_LIB 是CUDA安装目录对应的文件夹

-Cmake可以在官网进行下载，并添加到环境变量中。[[https://cmake.org/download/]{.underline}](https://cmake.org/download/)
+Cmake可以在[官网进行下载](https://cmake.org/download/)，并添加到环境变量中。

-执行完毕后，build目录如图所示，打开 箭头指向的solution文件：
+执行完毕后，build 目录如图所示，打开箭头指向的 solution 文件：

 <p align="center">
 <img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image3.png">
 </p>

-修改编译属性为/MT：
+修改编译属性为 `/MT` ：

 <p align="center">
 <img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image4.png">
@@ -90,7 +61,7 @@ Cmake可以在官网进行下载，并添加到环境变量中。[[https://cmake
 <img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image5.png">
 </p>

-编译生成选项改成Release。
+编译生成选项改成 `Release` 。

 <p align="center">
 <img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image6.png">
@@ -110,17 +81,13 @@ Cmake可以在官网进行下载，并添加到环境变量中。[[https://cmake

  1.  开启GLOG

-  	set GLOG\_v=3
+  	`set GLOG_v=100`

  2.  进行预测

-  	simple\_on\_word2vec.exe \--dirname=.\\word2vec.inference.model
+  	`simple_on_word2vec.exe --dirname=.\word2vec.inference.model`

 <p align="center">
 <img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image9.png">
 </p>

-**FAQ：**
-
-路径中尽量不要包含空格，例如发现CUDA\_LIB路径是Program
-Files(x86)可能会出错。可以将CUDA拷贝到一个新位置（这里直接拷贝就行）
--- a/doc/fluid/advanced_usage/deploy/mobile/index_cn.rst
+++ b/doc/fluid/advanced_usage/deploy/mobile/index_cn.rst
+##########
+移动端部署
+##########
+
+本模块介绍了 PaddlePaddle 组织下的嵌入式平台深度学习框架——Paddle-Mobile，包括：
+
+* `项目简介 <mobile_readme.html>`_：简要介绍了 Paddle-Mobile 的应用效果，特点以及使用说明
+
+* `环境搭建 <mobile_build.html>`_：分别介绍如何在Docker和非Docker下搭建环境
+
+.. toctree::
+   :hidden:
+
+   mobile_readme.md
+   mobile_build.md
--- a/doc/fluid/advanced_usage/deploy/mobile_build.md
+++ b/doc/fluid/advanced_usage/deploy/mobile_build.md
 # 环境搭建
 ## 使用 docker
 ### 1. 安装 docker
-安装 docker 的方式，参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/)
+安装 docker 的方式，参考 [官方文档](https://docs.docker.com/install/)
 ### 2. 使用 docker 搭建构建环境
 首先进入 paddle-mobile 的目录下，执行 `docker build`
 以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行)

--- a/doc/fluid/advanced_usage/deploy/mobile_readme.md
+++ b/doc/fluid/advanced_usage/deploy/mobile_readme.md
@@ -33,13 +33,11 @@

 开发文档主要是关于编译、运行等问题。作为开发者，它可以和贡献文档共同结合使用

-[iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
-
-[Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
-
-[FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md)
-
-[ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md)
+* [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
+* [Android_CPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
+* [Android_GPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android_GPU.md)
+* [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md)
+* [ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md)

 ### 贡献代码

@@ -71,10 +69,6 @@ ONNX全称为“Open Neural Network Exchange”，即“开放的神经网络切

 [下载链接](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)

-如下gif是简单搜索app的线上主体检测应用效果
-
-![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)
-
 ## 问题解决

 欢迎提出或解决我们的问题，有疑问可以发[Issue](https://github.com/PaddlePaddle/paddle-mobile/issues)
@@ -85,3 +79,4 @@ Paddle-Mobile 提供相对宽松的Apache-2.0开源协议 [Apache-2.0 license](L

 ## 旧版 Mobile-Deep-Learning
 原MDL(Mobile-Deep-Learning)工程被迁移到了这里 [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning) 
+
--- a/doc/fluid/user_guides/design_idea/fluid_design_idea.md
+++ b/doc/fluid/user_guides/design_idea/fluid_design_idea.md
-# Fluid设计思想
+# 设计思想

 ## 简介

@@ -17,7 +17,7 @@ Fluid使用一种编译器式的执行流程，分为编译时和运行时两个

 本地训练任务执行流程图如下所示：
 <p align="center">
-	<img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/design_idea/image/fluid_process.png" width=800>
+	<img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/advanced_usage/design_idea/image/fluid_process.png" width=800>
 </p>

 1. 编译时，用户编写一段python程序，通过调用 Fluid 提供的算子，向一段 Program 中添加变量（Tensor）以及对变量的操作（Operators 或者 Layers）。用户只需要描述核心的前向计算，不需要关心反向计算、分布式下以及异构设备下如何计算。
@@ -153,7 +153,7 @@ Executor 在运行时将接受一个`ProgramDesc`、一个`block_id`和一个`Sc
 完成的编译执行的具体过程如下图所示：

 <p align="center">
-	<img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/design_idea/image/executor_design.png" width=600>
+	<img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/advanced_usage/design_idea/image/executor_design.png" width=600>
 </p>

 1. Executor 为每一个block创建一个Scope，Block是可嵌套的，因此Scope也是可嵌套的
@@ -359,5 +359,5 @@ Fluid使用Executor.run来运行一段Program。
       [6.099215 ]], dtype=float32), array([1.6935859], dtype=float32)]
 ```

-至此您已经了解了Fluid 内部的执行流程的核心概念，更多框架使用细节请参考[使用指南](../../user_guides/index.html)相关内容，[模型库](../../user_guides/models/index.html
+至此您已经了解了Fluid 内部的执行流程的核心概念，更多框架使用细节请参考[使用指南](../../user_guides/index.html)相关内容，[模型库](../../user_guides/models/index_cn.html
 )中也为您提供了丰富的模型示例以供参考。
--- a/doc/fluid/user_guides/design_idea/image/executor_design.png
+++ b/doc/fluid/user_guides/design_idea/image/executor_design.png
--- a/doc/fluid/user_guides/design_idea/image/fluid_process.png
+++ b/doc/fluid/user_guides/design_idea/image/fluid_process.png
--- a/doc/fluid/advanced_usage/development/contribute_to_paddle/index_cn.rst
+++ b/doc/fluid/advanced_usage/development/contribute_to_paddle/index_cn.rst
-########
+############
 如何贡献代码
-########
+############

 ..  toctree::
-    :maxdepth: 2
+    :maxdepth: 1

    local_dev_guide.md
    submit_pr_guide.md
--- a/doc/fluid/advanced_usage/development/new_op.md
+++ b/doc/fluid/advanced_usage/development/new_op.md
-../../dev/new_op_cn.md
\ No newline at end of file
--- a/doc/fluid/advanced_usage/development/new_op/index_cn.rst
+++ b/doc/fluid/advanced_usage/development/new_op/index_cn.rst
+#############
+新增operator
+#############
+
+- `如何写新的operator <../../../advanced_usage/development/new_op.html>`_ ：介绍如何在 Fluid 中添加新的 Operator
+
+- `op相关的一些注意事项 <../../../advanced_usage/development/op_notes.html>`_ ：介绍op相关的一些注意事项
+
+.. toctree::
+   :hidden:
+
+   new_op_cn.md
+   op_notes.md
--- a/doc/fluid/advanced_usage/development/new_op/new_op_cn.md
+++ b/doc/fluid/advanced_usage/development/new_op/new_op_cn.md
+../../../dev/new_op_cn.md
\ No newline at end of file
--- a/doc/fluid/advanced_usage/development/new_op/op_notes.md
+++ b/doc/fluid/advanced_usage/development/new_op/op_notes.md
+# op相关的一些注意事项
+
+## Fluid中Op的构建逻辑
+### 1.Fluid中Op的构建逻辑
+Fluid中所有的Op都继承自`OperatorBase`，且所有的Op都是无状态的，每个Op包含的成员变量只有四个：type、inputs、outputs、attribute。
+
+Op的核心方法是Run，Run方法需要两方面的资源：数据资源和计算资源，这两个资源分别通过`Scope`和`Place`获取。框架内部有一个全局的`DeviceContextPool`，用来记录`Place`和`DeviceContext`之间的对应的关系，即每个`Place`有且仅有一个`DeviceContext`与之对应，`DeviceContext`中存放了当前设备的计算资源。比如对于GPU，这些资源包括`cudnn_handle`、`cublas_handle`、`stream`等，Op内部所有的计算（数据拷贝和CUDA Kernel等）都必须在`DeviceContext`中进行。
+
+Fluid框架的设计理念是可以在多种设备及第三方库上运行，有些Op的实现可能会因为设备或者第三方库的不同而不同。为此，Fluid引入了OpKernel的方式，即一个Op可以有多个OpKernel，这类Op继承自`OperatorWithKernel`，这类Op的代表是conv，conv_op的OpKernel有：`GemmConvKernel`、`CUDNNConvOpKernel`、`ConvMKLDNNOpKernel`，且每个OpKernel都有double和float两种数据类型。不需要OpKernel的代表有`WhileOp`等。
+
+Operator继承关系图： 
+![op_inheritance_relation_diagram](../../pics/op_inheritance_relation_diagram.png)
+
+进一步了解可参考：[multi_devices](https://github.com/PaddlePaddle/FluidDoc/tree/develop/doc/fluid/design/multi_devices)，[scope](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/scope.md)，[Developer's_Guide_to_Paddle_Fluid](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md)    
+
+### 2.Op的注册逻辑
+每个Operator的注册项包括：
+
+```C++
+OpCreator creator_;
+GradOpMakerFN grad_op_maker_;
+proto::OpProto* proto_{nullptr};
+OpAttrChecker* checker_{nullptr};
+InferVarTypeFN infer_var_type_;
+InferShapeFN infer_shape_;
+```
+
+<table>
+<thead>
+<tr>
+<th>注册项</th>
+<th>类型</th>
+<th>说明</th>
+<th>调用</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>proto::OpProto </td>
+<td>Class </td>
+<td>存放Op的输入/输出/属性/Op类型 </td>
+<td>编译时调用 </td>
+</tr>
+<tr>
+<td>GradOpMakerFN </td>
+<td>Functor </td>
+<td>返回当前Op对应的反向Op的一组OpDesc，因为正向Op的反向可能有多个Op构成 </td>
+<td>编译时调用 </td>
+</tr>
+<tr>
+<td>OpAttrChecker </td>
+<td>Class </td>
+<td>对Op的attr进行check </td>
+<td>编译时调用</td>
+</tr>
+<tr>
+<td>InferVarTypeFN </td>
+<td>Functor </td>
+<td>用于推断输出Var的Type，比如是LoDTensor还是SelectedRows，或者其他 </td>
+<td>编译时调用 </td>
+</tr>
+<tr>
+<td>InferShapeFN </td>
+<td>Functor </td>
+<td>用于推断Output的Shape </td>
+<td>分为编译时和运行时，编译时是在Python端调用；如果Op继承自OperatorWithKernel，运行时是在op.run时调用 </td>
+</tr>
+<tr>
+<td>OpCreator </td>
+<td>Functor </td>
+<td>每次调用都会创建一个新的OperatorBase </td>
+<td>运行时调用 </td>
+</tr>
+</tbody>
+</table>
+
+通常Op注册时需要调用REGISTER_OPERATOR，即：
+
+```
+REGISTER_OPERATOR(op_type,
+                  OperatorBase,
+                  op_maker_and_checker_maker,
+                  op_grad_opmaker,
+                  op_infer_var_shape,
+                  op_infer_var_type)
+```
+
+**注意：**   
+
+1. 对于所有Op，前三个参数是必须的，op_type指明op的名字，OperatorBase是该Op的对象，op_maker_and_checker_maker是op的maker和op中attr的checker。
+2. 如果该Op有反向，则必须要有op_grad_opmaker，因为在backward会根据正向的Op中获取反向Op的Maker。
+3. 框架提供了一个默认的op_grad_opmaker：`DefaultGradOpDescMaker`，这个Maker会将前向Op的输入和输出都作为反向Op的输入，将前向Op的输入的梯度作为反向Op的输出，并将前向Op的属性拷贝过来。**注意：** DefaultGradOpDescMaker会将前向Op的所有输入输出都作为反向Op的输入，即使这个输入是没有必要的，这将会导致无法对没有用到的变量做内存优化。
+4. 框架没有提供默认的op_infer_var_shape方法。如果该Op是无OpKernel的，通常需要用户添加对应的op_infer_var_shape方法；如果该Op是有OpKernel的，需要实现`OperatorWithKernel`中的`InferShape`方法，此时不需要提供op_infer_var_shape方法。具体实现可参考[while_op.cc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/controlflow/while_op.cc)，[conv_op.cc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/conv_op.cc)。
+5. 框架没有提供默认的op_infer_var_type方法，用户需要根据实际情况添加op_infer_var_type。严格来说每个Op都应该注册一个InferVarType，op_infer_var_type根据输入的Var的type和dtype推断输出Var的type和dtype。**注意：** 在Python端的LayerHelper中create_variable_for_type_inference操作返回的Variable里面是LoDTensor，C++端的InferVarType可以修改`Variable`的type和dtype。
+
+
+更多内容请参考: [如何写新的Op](../new_op.html)
+
+## 写Op注意事项
+### 1.Op可以支持输入输出类型
+Fluid的Op的输入输出都是`Variable`，从设计上讲，`Variable`中可以存放任意类型，Op的输入输出`Variable`可能是任意类型，通常情况下`Variable`中存放的是`LoDTensor`、`SelectedRows`。
+
+**注意：**
+
+- 代码中经常出现`context.Input<Tensor>("Input")`，并不表示"Input"的`Variable`是`Tensor`，而是从"Input"的`Variable`的`LoDTensor`中获取`Tensor`。如果"Input"的`Variable`是`SelectedRows`，则会报错。
+- 如果"Input"是`SelectedRows`，`context->GetInputDim("Input")`返回的是`var->Get<SelectedRows>().GetCompleteDims()`，而不是`SelectedRows`中`Tensor`的Dim。
+
+### 2.在Op内部不能对输入的数据做任何的改写
+在Op内部绝不允许对输入数据做任何改写，因为可能存在其他Op需要读这个数据。
+
+### 3.OpKernel需要注册的数据类型
+目前要求所有OpKernel都要注册double和float数据类型。
+
+### 4.Op兼容性问题
+对Op的修改需要考虑兼容性问题，要保证Op修改之后，之前的模型都能够正常加载及运行。<font color="#FF0000">**所以现在不允许对已有的Op新增输入或者输出，不允许减去Op的已有属性及修改默认值**</font> 。
+
+### 5.ShareDataWith的调用
+ShareDataWith的功能是使两个Tensor共享底层buffer，在调用这个操作的时候需要特别注意，在Op内部不能将ShareDataWith作用在Op的输出上，即Op输出的Tensor必须是Malloc出来的。
+
+### 6.稀疏梯度参数更新方法
+目前稀疏梯度在做更新的时候会先对梯度做merge，即对相同参数的梯度做累加，然后做参数以及附加参数（如velocity）的更新。
+
+### 7.显存优化
+如果Op的反向不需要将前向op的所有输入输出作为其输入，则不要用`DefaultGradOpDescMaker`，这将会导致无法对没有用到的变量做内存/显存优化。
+
+### 8.混合设备调用
+由于GPU是异步执行的，当CPU调用返回之后，GPU端可能还没有真正的执行，所以如果在Op中创建了GPU运行时需要用到的临时变量，当GPU开始运行的时候，该临时变量可能在CPU端已经被释放，这样可能会导致GPU计算出错。
+
+关于GPU中的一些同步和异步操作：
+```
+The following device operations are asynchronous with respect to the host:
+    Kernel launches;
+    Memory copies within a single device's memory;
+    Memory copies from host to device of a memory block of 64 KB or less;
+    Memory copies performed by functions that are suffixed with Async;
+    Memory set function calls.
+```
+
+关于cudaMemcpy和cudaMemcpyAsync注意事项：
+
+- 如果数据传输是从GPU端到非页锁定的CPU端，数据传输将是同步，即使调用的是异步拷贝操作。
+- 如果数据传输是从CPU端到CPU端，数据传输将是同步的，即使调用的是异步拷贝操作。
+
+更多内容可参考：[Asynchronous Concurrent Execution](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#asynchronous-concurrent-execution)，[API synchronization behavior](https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html#api-sync-behavior)  
+
+## Op性能优化
+### 1.第三方库的选择
+在写Op过程中优先使用高性能库（如cudnn、mkldnn、mklml、eigen等）中提供的操作，但是一定要做benchmark，有些库中的操作在深度学习任务中可能会比较慢。因为高性能库（如eigen等）中提供的操作为了更为通用，在性能方面可能并不是很好，通常深度学习模型中数据量较小，所以有些情况下可能高性能库中提供的某些操作速度较慢。比如Elementwise系列的所有Op（前向和反向），Elementwise操作在模型中调用的次数比较多，尤其是Elementwise_add，在很多操作之后都需要添加偏置项。在之前的实现中Elementwise_op直接调用Eigen库，由于Elementwise操作在很多情况下需要对数据做Broadcast，而实验发现Eigen库做Broadcast的速度比较慢，慢的原因在这个PR[#6229](https://github.com/PaddlePaddle/Paddle/pull/6229)中有描述。
+
+### 2.Op性能优化
+Op的计算速度与输入的数据量有关，对于某些Op可以根据输入数据的Shape和Op的属性参数来选择不同的计算方式。比如concat_op，当axis>=1时，在对多个tensor做拼接过程中需要对每个tensor做很多次拷贝，如果是在GPU上，需要调用cudaMemcpy。相对CPU而言，GPU属于外部设备，所以每次调用GPU的操作都会有一定的额外开销，并且当需要拷贝的次数较多时，这种开销就更为凸现。目前concat_op的实现会根据输入数据的Shape以及axis值来选择不同的调用方式，如果输入的tensor较多，且axis不等于0，则将多次拷贝操作转换成一个CUDA Kernel来完成；如果输入tensor较少，且axis等于0，则直接进行拷贝。相关实验过程在该PR（[#8669](https://github.com/PaddlePaddle/Paddle/pull/8669)）中有介绍。
+
+由于CUDA Kernel的调用有一定的额外开销，所以如果Op中出现多次调用CUDA Kernel，可能会影响Op的执行速度。比如之前的sequence_expand_op中包含很多CUDA Kernel，通常这些CUDA Kernel处理的数据量较小，所以频繁调用这样的Kernel会影响Op的计算速度，这种情况下最好将这些小的CUDA Kernel合并成一个。在优化sequence_expand_op过程（相关PR[#9289](https://github.com/PaddlePaddle/Paddle/pull/9289)）中就是采用这种思路，优化后的sequence_expand_op比之前的实现平均快出约1倍左右，相关实验细节在该PR（[#9289](https://github.com/PaddlePaddle/Paddle/pull/9289)）中有介绍。
+
+减少CPU与GPU之间的拷贝和同步操作的次数。比如fetch操作，在每个迭代之后都会对模型参数进行更新并得到一个loss，并且数据从GPU端到没有页锁定的CPU端的拷贝是同步的，所以频繁的fetch多个参数会导致模型训练速度变慢。
+
+## Op数值稳定性问题
+### 1.有些Op存在数值稳定性问题
+出现数值稳定性问题的主要原因是程序在多次运行时，对浮点型数据施加操作的顺序可能不同，进而导致最终计算结果不同。而GPU是通过多线程并行计算的方式来加速计算的，所以很容易出现对浮点数施加操作的顺序不固定现象。
+
+目前发现cudnn中的卷积操作、cudnn中的MaxPooling、CUDA中CudaAtomicXX、ParallelExecutor的Reduce模式下参数梯度的聚合等操作运行结果是非确定的。
+
+为此Fluid中添加了一些FLAGS，比如使用FLAGS_cudnn_deterministic来强制cudnn使用确定性算法、FLAGS_cpu_deterministic强制CPU端的计算使用确定性方法。
+
+### 2.WITH_FAST_MATH的开与关
+如果WITH_FAST_MATH是ON，NVCC在编译Paddle和Eigen的时候会使用--use_fast_math，这样可能会使CUDA中的一些操作在损失一定精度的情况下变快，比如log、exp、tanh等，但也会使一些操作的计算结果是错的，比如pow操作，具体原因请查看[torch/DEPRECEATED-torch7-distro#132](https://github.com/torch/DEPRECEATED-torch7-distro/issues/132)。
+
+## 其他
+### 1.报错信息
+Enforce提示信息不能为空，并且需要写清楚，因为明确的报错信息有助于更快更方便地分析出错误的原因。
+
+### 2.Op的数学公式
+如果Op有数学公式，一定要在代码中将数学公式写明，并在Python API的Doc中显示，因为用户在对比不同框架的计算结果时可能需要了解Paddle对Op是怎么实现的。
+
+**注意：**在merge到develop分支之前一定要进行公式预览。可参考[dynamic_lstmp](http://paddlepaddle.org/documentation/docs/zh/1.1/api/layers.html#dynamic-lstmp)。
+
+### 3.Python端Op接口中参数的顺序
+Python API中参数的顺序一般按照重要性来排，以fc为例：
+```
+def fc(input,
+       size,
+       num_flatten_dims=1,
+       param_attr=None,
+       bias_attr=None,
+       act=None,
+       is_test=False,
+       name=None)
+```
--- a/doc/fluid/advanced_usage/development/profiling/index.rst
+++ b/doc/fluid/advanced_usage/development/profiling/index.rst
-##########
-性能调优
-##########
-..  toctree::
-    
-    benchmark.rst
-    cpu_profiling_cn.md
-    gpu_profiling_cn.rst
-    host_memory_profiling_cn.md
-    timeline_cn.md
--- a/doc/fluid/advanced_usage/development/profiling/index_cn.rst
+++ b/doc/fluid/advanced_usage/development/profiling/index_cn.rst
+##########
+性能调优
+##########
+
+本模块介绍 Fluid 使用过程中的调优方法，包括：
+
+- `如何进行基准测试 <benchmark.html>`_：介绍如何选择基准模型，从而验证模型的精度和性能
+- `CPU性能调优 <cpu_profiling_cn.html>`_：介绍如何使用 cProfile 包、yep库、Google perftools 进行性能分析与调优
+- `GPU性能调优 <gpu_profiling_cn.html>`_：介绍如何使用 Fluid 内置的定时工具、nvprof 或 nvvp 进行性能分析和调优
+- `堆内存分析和优化 <host_memory_profiling_cn.html>`_：介绍如何使用 gperftool 进行堆内存分析和优化，以解决内存泄漏的问题
+- `Timeline工具简介 <timeline_cn.html>`_ ：介绍如何使用 Timeline 工具进行性能分析和调优
+
+
+..  toctree::
+    :hidden:
+
+    benchmark.rst
+    cpu_profiling_cn.md
+    gpu_profiling_cn.rst
+    host_memory_profiling_cn.md
+    timeline_cn.md
--- a/doc/fluid/advanced_usage/index.rst
+++ b/doc/fluid/advanced_usage/index.rst
@@ -2,48 +2,30 @@
 进阶使用
 ########

-=====================
-        概览
-=====================
 ..  todo::

 如果您非常熟悉 Fluid，期望获得更高效的模型或者定义自己的Operator，请阅读：

-	- `移动端部署 <../advanced_usage/deploy/index_mobile.html>`_：介绍了 PaddlePaddle 组织下的嵌入式平台深度学习框架——Paddle-Mobile，包括：
+    	- `Fluid 设计思想 <../advanced_usage/design_idea/fluid_design_idea.html>`_：介绍 Fluid 底层的设计思想，帮助您更好的理解框架运作过程

-	- `简介 <../advanced_usage/deploy/mobile_readme.html>`_：简要介绍了 Paddle-Mobile 的应用效果，特点以及使用说明
-	- `环境搭建 <../advanced_usage/deploy/mobile_build.html>`_：从使用 Docker 和不使用 Docker 两种方法下分别介绍如何搭建环境
-	- `ios开发文档 <../advanced_usage/deploy/mobile_dev.html>`_：介绍如何在 ios 系统下运用 Paddle-Mobile 进行开发
+	- `预测部署 <../advanced_usage/deploy/index_cn.html>`_ ：介绍如何应用训练好的模型进行预测

-	- `Anakin预测引擎 <../advanced_usage/deploy/index_anakin.html>`_：介绍如何使用 Anakin 在不同硬件平台实现深度学习的高速预测
-
-	- `如何写新的Operator <../advanced_usage/development/new_op.html>`_ ：介绍如何在 Fluid 中添加新的 Operator
-
-	- `性能调优 <../advanced_usage/development/profiling/index.html>`_ ：介绍 Fluid 使用过程中的调优方法，包括：
-
-	  - `如何进行基准测试 <../advanced_usage/development/profiling/benchmark.html>`_：介绍如何选择基准模型，从而验证模型的精度和性能
-	  - `CPU性能调优 <../advanced_usage/development/profiling/cpu_profiling_cn.html>`_：介绍如何使用 cProfile 包、yep库、Google perftools 进行性能分析与调优
-	  - `GPU性能调优 <../advanced_usage/development/profiling/gpu_profiling_cn.html>`_：介绍如何使用 Fluid 内置的定时工具、nvprof 或 nvvp 进行性能分析和调优
-	  - `堆内存分析和优化 <../advanced_usage/development/profiling/host_memory_profiling_cn.html>`_：介绍如何使用 gperftool 进行堆内存分析和优化，以解决内存泄漏的问题
-	  - `Timeline工具简介 <../advanced_usage/development/profiling/timeline_cn.html>`_ ：介绍如何使用 Timeline 工具进行性能分析和调优
+	- `新增operator <../advanced_usage/development/new_op/index_cn.html>`_ ：介绍新增operator的方法及注意事项

+	- `性能调优 <../advanced_usage/development/profiling/index_cn.html>`_ ：介绍 Fluid 使用过程中的调优方法

 非常欢迎您为我们的开源社区做出贡献，关于如何贡献您的代码或文档，请阅读：

-	- `如何贡献代码 <../advanced_usage/development/contribute_to_paddle.html>`_：介绍如何向 PaddlePaddle 开源社区贡献代码
+	- `如何贡献代码 <../advanced_usage/development/contribute_to_paddle/index_cn.html>`_：介绍如何向 PaddlePaddle 开源社区贡献代码

 	- `如何贡献文档 <../advanced_usage/development/write_docs_cn.html>`_：介绍如何向 PaddlePaddle 开源社区贡献文档

-=====================
-        目录
-=====================
-
 ..  toctree::
-    :maxdepth: 2
+    :hidden:

-    deploy/index_mobile.rst
-    deploy/index_anakin.rst
+    design_idea/fluid_design_idea.md
+    deploy/index_cn.rst
+    development/new_op/index_cn.rst
+    development/profiling/index_cn.rst
    development/contribute_to_paddle/index_cn.rst
    development/write_docs_cn.md
-    development/new_op.md
-    development/profiling/index.rst
--- a/doc/fluid/advanced_usage/pics/int8_design.png
+++ b/doc/fluid/advanced_usage/pics/int8_design.png
--- a/doc/fluid/advanced_usage/pics/op_inheritance_relation_diagram.png
+++ b/doc/fluid/advanced_usage/pics/op_inheritance_relation_diagram.png
--- a/doc/fluid/api/api_guides/index.rst
+++ b/doc/fluid/api/api_guides/index.rst
@@ -12,3 +12,4 @@ API使用指南
    low_level/metrics.rst
    low_level/model_save_reader.rst
    low_level/inference.rst
+    low_level/distributed/index.rst
--- a/doc/fluid/api/api_guides/low_level/cluster/cluster_train_data_cn.rst
+++ b/doc/fluid/api/api_guides/low_level/cluster/cluster_train_data_cn.rst
+..  _api_guide_cluster_train_data:
+
+####################
+分布式训练reader准备
+####################
+
+一个数据并行的分布式训练任务通常会含有多个训练进程，每个训练进程处理整个数据集中的一部分，根据当前进程的唯一序号(trainer_id)以及训练进程总数(trainers)可以决定当前训练进程应该读取哪一部分数据。
+
+实现 cluster_reader 来读取分布式训练数据集
+----------------------------------------
+
+比较通用的方法，可以实现一个 cluster_reader, 根据训练进程数量以及进程序号决定读取哪些 example:
+
+    .. code-block:: python
+        
+        def cluster_reader(reader, trainers, trainer_id):
+            def reader_creator():
+                for idx, data in enumerate(reader()):
+                    if idx % trainers == trainer_id:
+                        yield data
+            return reader_creator
+
+        trainers = int(os.getenv("PADDLE_TRAINERS", "1"))
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        train_reader = cluster_reader(paddle.dataset.mnist.train(), trainers, trainer_id)
+
+上述代码中，`trainers` 和 `trainer_id` 分别是训练进程总数和当前训练进程的序号，可以通过环境变量或者参数的方式传递给 Python 程序。
+
+预先切分训练文件
+-----------------
+
+由于使用 `cluster_reader` 依然会读取全量数据，对于训练进程比较多的任务，会造成IO资源的浪费、影响训练性能。另一种方法是可以将训练数据切分成多个小文件，每个进程处理其中的一部分文件,
+例如在 Linux 系统中可以使用 `split <http://man7.org/linux/man-pages/man1/split.1.html>`_ 命令将训练数据切分成多个小文件：
+
+  .. code-block:: bash
+
+    $ split -d -a 4 -l 100 housing.data cluster/housing.data.
+    $ find ./cluster
+    cluster/
+    cluster/housing.data.0002
+    cluster/housing.data.0003
+    cluster/housing.data.0004
+    cluster/housing.data.0000
+    cluster/housing.data.0001
+    cluster/housing.data.0005
+
+数据切分好以后, 可以实现一个 file_dispatcher 函数，根据训练进程数量以及序号决定需要读取哪些文件：
+
+    .. code-block:: python
+
+        def file_dispatcher(files_pattern, trainers, trainer_id):
+            file_list = glob.glob(files_pattern)
+            ret_list = []
+            for idx, f in enumerate(file_list):
+                if (idx + trainers) % trainers == trainer_id:
+                    ret_list.append(f)
+            return ret_list
+        
+        trainers = int(os.getenv("PADDLE_TRAINERS", "1"))
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        files_pattern = "cluster/housing.data.*"
+
+        my_files = file_dispatcher(files_pattern, trainers, trainer_id)
+
+在上述例子中，`files_pattern` 是训练文件的 `glob 表达式 <https://docs.python.org/2.7/library/glob.html>`_，一般可以用通配符来表示。
--- a/doc/fluid/api/api_guides/low_level/distributed/async_training.rst
+++ b/doc/fluid/api/api_guides/low_level/distributed/async_training.rst
+.. _api_guide_async_training:
+
+############
+分布式异步训练
+############
+
+Fluid支持数据并行的分布式异步训练，API使用 :code:`DistributeTranspiler` 将单机网络配置转换成可以多机执行的
+:code:`pserver` 端程序和 :code:`trainer` 端程序。用户在不同的节点执行相同的一段代码，根据环境变量或启动参数，
+可以执行对应的 :code:`pserver` 或 :code:`trainer` 角色。Fluid异步训练只支持pserver模式，异步训练和 `同步训练 <../distributed/sync_training.html>`_ 的主要差异在于：异步训练每个trainer的梯度是单独更新到参数上的，
+而同步训练是所有trainer的梯度合并之后统一更新到参数上，因此，同步训练和异步训练的超参数需要分别调节。
+
+pserver模式分布式异步训练
+======================
+
+API详细使用方法参考 :ref:`api_fluid_DistributeTranspiler` ，简单示例用法：
+
+.. code-block:: python
+
+    config = fluid.DistributeTranspilerConfig()
+    # 配置策略config
+    config.slice_var_up = False
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id, 
+                program=main_program,
+                pservers="192.168.0.1:6174,192.168.0.2:6174",
+                trainers=1,
+                sync_mode=False)
+
+以上参数说明请参考 `同步训练 <../distributed/sync_training.html>`_ 
+
+需要注意的是：进行异步训练时，请修改 :code:`sync_mode` 的值
+
+- :code:`sync_mode` ： 是否是同步训练模式，默认为True，不传此参数也默认是同步训练模式，设置为False则为异步训练
--- a/doc/fluid/api/api_guides/low_level/distributed/cpu_train_best_practice.rst
+++ b/doc/fluid/api/api_guides/low_level/distributed/cpu_train_best_practice.rst
+.. _api_guide_cpu_training_best_practice:
+
+##################
+分布式CPU训练最佳实践
+##################
+
+提高CPU分布式训练的训练速度，主要从两个方面来考虑：
+1）提高计算速度，主要是提高CPU的使用率；2）提高通信速度，主要是减少通信传输的数据量。
+
+提高CPU的使用率
+=============
+
+提高CPU使用率主要依赖 :code:`ParallelExecutor`，可以充分利用多个CPU的计算能力来加速计算。
+
+API详细使用方法参考 :ref:`api_fluid_ParallelExecutor` ，简单实例用法：
+
+.. code-block:: python
+
+    # 配置执行策略，主要是设置线程数
+    exec_strategy = fluid.ExecutionStrategy()
+    exec_strategy.num_threads = 8
+
+    # 配置构图策略，对于CPU训练而言，应该使用Reduce模式进行训练
+    build_strategy = fluid.BuildStrategy()
+    if int(os.getenv("CPU_NUM")) > 1:
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+
+    pe = fluid.ParallelExecutor(
+        use_cuda=False,
+        loss_name=avg_cost.name,
+        main_program=main_program,
+        build_strategy=build_strategy,
+        exec_strategy=exec_strategy)
+
+以上参数中：
+
+- :code:`num_threads` ： 模型训练使用的线程数，最好和训练所在机器的物理CPU核数接近
+- :code:`reduce_strategy` ： 对于CPU训练而言，应该选择 fluid.BuildStrategy.ReduceStrategy.Reduce
+
+
+通用环境变量配置：
+
+- :code:`CPU_NUM` ：模型副本replica的个数，最好和num_threads一致
+
+
+提高通信速度
+==========
+
+要减少通信数据量，提高通信速度，主要是使用稀疏更新，目前支持 `稀疏更新 <../layers/sparse_update.html>`_  的主要是  :ref:`api_fluid_layers_embedding` 。
+
+.. code-block:: python
+
+    data = fluid.layers.data(name='ids', shape=[1], dtype='int64')
+    fc = fluid.layers.embedding(input=data, size=[dict_size, 16], is_sparse=True)
+
+以上参数中：
+
+- :code:`is_sparse` ： 配置embedding使用稀疏更新，如果embedding的dict_size很大，而每次数据data很少，建议使用sparse更新方式。
--- a/doc/fluid/api/api_guides/low_level/distributed/index.rst
+++ b/doc/fluid/api/api_guides/low_level/distributed/index.rst
+=============
+分布式训练
+=============
+
+..  toctree::
+    :maxdepth: 1
+
+    sync_training.rst
+    async_training.rst
+    cpu_train_best_practice.rst
+    large_scale_sparse_feature_training.rst
+
--- a/doc/fluid/api/api_guides/low_level/distributed/large_scale_sparse_feature_training.rst
+++ b/doc/fluid/api/api_guides/low_level/distributed/large_scale_sparse_feature_training.rst
+.. _api_guide_large_scale_sparse_feature_training:
+
+###################
+大规模稀疏特征模型训练
+###################
+
+
+模型配置和训练
+=============
+
+embedding被广泛应用在各种网络结构中，尤其是文本处理相关的模型。在某些场景，例如推荐系统或者搜索引擎中，
+embedding的feature id可能会非常多，当feature id达到一定数量时，embedding参数会变得很大，
+会带来两个问题：
+1）单机内存由于无法存放如此巨大的embedding参数，导致无法训练；
+2）普通的训练模式每一轮迭代都需要同步完整的参数，参数太大会让通信变得非常慢，进而影响训练速度。
+
+Fluid支持千亿量级超大规模稀疏特征embedding的训练，embedding参数只会保存在parameter server上，通过
+参数prefetch和梯度稀疏更新的方法，大大减少通信量，提高通信速度。
+
+该功能只对分布式训练有效，单机无法使用。
+需要配合 `稀疏更新 <../layers/sparse_update.html>`_ 一起使用。
+
+使用方法：在配置embedding的时候，加上参数 :code:`is_distributed=True` 以及 :code:`is_sparse=True` 即可。
+参数 :code:`dict_size` 定义数据中总的id的数量，id可以是int64范围内的任意值，只要总id个数小于等于dict_size就可以支持。
+所以配置之前需要预估一下数据中总的feature id的数量。
+
+.. code-block:: python
+
+  emb = fluid.layers.embedding(
+      input=input,
+      size=[dict_size, embedding_width],
+      is_sparse=True,
+      is_distributed=True)
+
+
+模型存储和预测
+=============
+
+当特征数量达到千亿的时候，参数量很大，单机已经无法存下，所以模型的存储和加载都和普通模式不同：
+1）普通模式下，参数是在trainer端保存和加载的；
+2）分布式模式下，参数的保存和加载，都是在pserver端进行，每个pserver只保存和加载该pserver自身对应部分的参数
--- a/doc/fluid/api/api_guides/low_level/distributed/sync_training.rst
+++ b/doc/fluid/api/api_guides/low_level/distributed/sync_training.rst
+.. _api_guide_sync_training:
+
+############
+分布式同步训练
+############
+
+Fluid支持数据并行的分布式同步训练，API使用 :code:`DistributeTranspiler` 将单机网络配置转换成可以多机执行的
+:code:`pserver` 端程序和 :code:`trainer` 端程序。用户在不同的节点执行相同的一段代码，根据环境变量或启动参数，
+可以执行对应的 :code:`pserver` 或 :code:`trainer` 角色。Fluid分布式同步训练同时支持pserver模式和NCCL2模式，
+在API使用上有差别，需要注意。
+
+pserver模式分布式训练
+===================
+
+API详细使用方法参考 :ref:`api_fluid_DistributeTranspiler` ，简单实例用法：
+
+.. code-block:: python
+
+    config = fluid.DistributeTranspilerConfig()
+    # 配置策略config
+    config.slice_var_up = False
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id, 
+                program=main_program,
+                pservers="192.168.0.1:6174,192.168.0.2:6174",
+                trainers=1,
+                sync_mode=True)
+
+以上参数中：
+
+- :code:`trainer_id` ： trainer节点的id，从0到n-1，n为当前训练任务中trainer节点的个数
+- :code:`program` ： 被转换的 :code:`program` 默认使用 :code:`fluid.default_main_program()`
+- :code:`pservers` ： 当前训练任务中pserver节点的IP端口列表
+- :code:`trainers` ： int类型，当前训练任务中trainer节点的个数。注意：
+    * pserver模式下，trainer节点个数可以和pserver节点个数不一致，比如使用20个pserver和50个trainer。在实际训练任务中，您可以通过调整pserver节点和trainer节点个数找到最佳性能
+    * NCCL2模式中，此项参数是字符串，指定trainer节点的IP端口列表
+- :code:`sync_mode` ： 是否是同步训练模式，默认为True，不传此参数也默认是同步训练模式
+
+
+其中，支持的config包括：
+
+- :code:`slice_var_up` ： 配置是否切分一个参数到多个pserver上进行优化，默认开启。此选项适用于模型参数个数少，但需要使用大量节点的场景，有利于提升pserver端计算并行度
+- :code:`split_method` ： 配置transpiler分配参数（或参数的切片）到多个pserver的方式，默认为"RoundRobin"，也可以使用"HashName"
+- :code:`min_block_size` ： 如果配置了参数切分，指定最小Tensor的切分大小，防止RPC请求包过小，默认为8192，一般情况不需要调整此项参数
+- :code:`enable_dc_asgd` ： 是否开启 :code:`DC-ASGD` 此选项在异步训练中生效，启用异步训练补偿算法
+- :code:`mode` : 可以选择"pserver"或"nccl2"，指定使用pserver模式或NCCL2模式分布式训练
+- :code:`print_log` ： 是否开启transpiler debug日志，此项为开发调试使用
+
+通用环境变量配置：
+
+- :code:`FLAGS_rpc_send_thread_num` ：int，指定RPC通信发送时线程的个数
+- :code:`FLAGS_rpc_get_thread_num` ： int，指定RPC通信接收时线程的个数
+- :code:`FLAGS_rpc_prefetch_thread_num` ： int，分布式lookup table执行RPC通信时，prefetch线程的个数
+- :code:`FLAGS_rpc_deadline` ： int，RPC通信最长等待时间，单位为毫秒，默认180000
+
+
+NCCL2模式分布式训练
+=================
+
+基于NCCL2 (Collective Communication) 的多机同步训练模式，仅支持在GPU集群下进行。
+此部分详细API说明可以参考 :ref:`api_fluid_DistributeTranspiler` 。
+
+注意：NCCL2模式下，集群不需要启动pserver，只需要启动多个trainer节点即可。
+
+使用以下代码，将当前 :code:`Program` 转化成适用于NCCL2分布式计算的Fluid :code:`Program` ：
+
+.. code-block:: python
+
+    config = fluid.DistributeTranspilerConfig()
+    config.mode = "nccl2"
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id, 
+                program=main_program,
+                startup_program=startup_program,
+                trainers="192.168.0.1:6174,192.168.0.2:6174",
+                current_endpoint="192.168.0.1:6174")
+
+其中：
+
+- :code:`trainer_id` : trainer节点的id，从0到n-1，n为当前训练任务中trainer节点的个数
+- :code:`program` 和 :code:`startup_program` : 分别为Fluid 模型的主配置program和初始化startup_program
+- :code:`trainers` : 字符串类型，指定当前任务所有trainer的IP和端口号，仅用于NCCL2初始化（pserver模式中，此参数为int，指定trainer节点的个数）
+- :code:`current_endpoint` : 当前任务的当前节点的IP和端口号
--- a/doc/fluid/api/api_guides/low_level/layers/index.rst
+++ b/doc/fluid/api/api_guides/low_level/layers/index.rst
@@ -14,4 +14,5 @@
    loss_function.rst
    data_in_out.rst
    control_flow.rst
+    sparse_update.rst
    
--- a/doc/fluid/api/api_guides/low_level/layers/sparse_update.rst
+++ b/doc/fluid/api/api_guides/low_level/layers/sparse_update.rst
+.. _api_guide_sparse_update:
+
+#####
+稀疏更新
+#####
+
+Fluid的 :ref:`api_fluid_layers_embedding`  层在单机训练和分布式训练时，均可以支持“稀疏更新”，即梯度以sparse tensor 结构存储，只保存梯度不为0的行。
+在分布式训练中，对于较大的embedding层，开启稀疏更新有助于减少通信数据量，提升训练速度。
+
+在paddle内部，我们用lookup_table来实现embedding。下边这张图说明了embedding在正向和反向计算的过程：
+
+如图所示：一个Tensor中有两行不为0，正向计算的过程中，我们使用ids存储不为0的行，并使用对应的两行数据来进行计算；反向更新的过程也只更新这两行。
+
+.. image:: ../../../../images/lookup_table_training.png
+   :scale: 50 %
+
+embedding使用例子:
+---------------------
+
+API详细使用方法参考 :ref:`api_fluid_layers_embedding` ，以下是一个简单的例子：
+
+.. code-block:: python
+
+   DICT_SIZE = 10000 * 10
+   EMBED_SIZE = 64
+   IS_SPARSE = False
+   def word_emb(word, dict_size=DICT_SIZE, embed_size=EMBED_SIZE):
+       embed = fluid.layers.embedding(
+           input=word,
+           size=[dict_size, embed_size],
+           dtype='float32',
+           param_attr=fluid.ParamAttr(
+               initializer=fluid.initializer.Normal(scale=1/math.sqrt(dict_size))),
+           is_sparse=IS_SPARSE,
+           is_distributed=False)
+       return embed
+
+以上参数中：
+
+- :code:`is_sparse` ： 反向计算的时候梯度是否为sparse tensor。如果不设置，梯度是一个 `LodTensor <https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/user_guides/howto/prepare_data/lod_tensor.md>`_  。默认为False。
+
+- :code:`is_distributed` ： 标志是否是用在分布式的场景下。一般大规模稀疏更新（embedding的第0维维度很大，比如几百万以上）才需要设置。具体可以参考大规模稀疏的API guide  :ref:`api_guide_large_scale_sparse_feature_training`  。默认为False。
+
+- API汇总:
+
+  - :ref:`api_fluid_layers_embedding`
--- a/doc/fluid/api/average.rst
+++ b/doc/fluid/api/average.rst
@@ -14,3 +14,5 @@ WeightedAverage
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_average_WeightedAverage`
+
--- a/doc/fluid/api/backward.rst
+++ b/doc/fluid/api/backward.rst
@@ -13,3 +13,5 @@ append_backward
 ..  autofunction:: paddle.fluid.backward.append_backward
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_backward_append_backward`
+
--- a/doc/fluid/api/clip.rst
+++ b/doc/fluid/api/clip.rst
@@ -14,6 +14,8 @@ ErrorClipByValue
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_clip_ErrorClipByValue`
+
 .. _api_fluid_clip_GradientClipByGlobalNorm:

 GradientClipByGlobalNorm
@@ -23,6 +25,8 @@ GradientClipByGlobalNorm
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_clip_GradientClipByGlobalNorm`
+
 .. _api_fluid_clip_GradientClipByNorm:

 GradientClipByNorm
@@ -32,6 +36,8 @@ GradientClipByNorm
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_clip_GradientClipByNorm`
+
 .. _api_fluid_clip_GradientClipByValue:

 GradientClipByValue
@@ -41,3 +47,5 @@ GradientClipByValue
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_clip_GradientClipByValue`
+
--- a/doc/fluid/api/data_feeder.rst
+++ b/doc/fluid/api/data_feeder.rst
@@ -14,3 +14,5 @@ DataFeeder
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_data_feeder_DataFeeder`
+
--- a/doc/fluid/api/executor.rst
+++ b/doc/fluid/api/executor.rst
@@ -5,14 +5,6 @@
 fluid.executor
 ==============

-.. _api_fluid_executor__switch_scope:
-
-_switch_scope
-------------
-
-..  autofunction:: paddle.fluid.executor._switch_scope
-    :noindex:
-
 .. _api_fluid_executor_Executor:

 Executor
@@ -22,6 +14,8 @@ Executor
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_executor_Executor`
+
 .. _api_fluid_executor_global_scope:

 global_scope
@@ -30,6 +24,8 @@ global_scope
 ..  autofunction:: paddle.fluid.executor.global_scope
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_executor_global_scope`
+
 .. _api_fluid_executor_scope_guard:

 scope_guard
@@ -38,3 +34,5 @@ scope_guard
 ..  autofunction:: paddle.fluid.executor.scope_guard
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_executor_scope_guard`
+
--- a/doc/fluid/api/fluid.rst
+++ b/doc/fluid/api/fluid.rst
@@ -5,14 +5,17 @@
 fluid
 =====

-.. _api_fluid__switch_scope:
+.. _api_fluid_AsyncExecutor:

-_switch_scope
+AsyncExecutor
 -------------

-..  autofunction:: paddle.fluid._switch_scope
+..  autoclass:: paddle.fluid.AsyncExecutor
+    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_AsyncExecutor`
+
 .. _api_fluid_BuildStrategy:

 BuildStrategy
@@ -22,6 +25,8 @@ BuildStrategy
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_BuildStrategy`
+
 .. _api_fluid_CPUPlace:

 CPUPlace
@@ -31,6 +36,8 @@ CPUPlace
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_CPUPlace`
+
 .. _api_fluid_create_lod_tensor:

 create_lod_tensor
@@ -39,6 +46,8 @@ create_lod_tensor
 ..  autofunction:: paddle.fluid.create_lod_tensor
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_create_lod_tensor`
+
 .. _api_fluid_create_random_int_lodtensor:

 create_random_int_lodtensor
@@ -47,6 +56,8 @@ create_random_int_lodtensor
 ..  autofunction:: paddle.fluid.create_random_int_lodtensor
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_create_random_int_lodtensor`
+
 .. _api_fluid_CUDAPinnedPlace:

 CUDAPinnedPlace
@@ -56,6 +67,8 @@ CUDAPinnedPlace
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_CUDAPinnedPlace`
+
 .. _api_fluid_CUDAPlace:

 CUDAPlace
@@ -65,6 +78,19 @@ CUDAPlace
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_CUDAPlace`
+
+.. _api_fluid_DataFeedDesc:
+
+DataFeedDesc
+------------
+
+..  autoclass:: paddle.fluid.DataFeedDesc
+    :members:
+    :noindex:
+
+Read Chinese Version: :ref:`cn_api_fluid_DataFeedDesc`
+
 .. _api_fluid_DataFeeder:

 DataFeeder
@@ -74,6 +100,8 @@ DataFeeder
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_DataFeeder`
+
 .. _api_fluid_default_main_program:

 default_main_program
@@ -82,6 +110,8 @@ default_main_program
 ..  autofunction:: paddle.fluid.default_main_program
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_default_main_program`
+
 .. _api_fluid_default_startup_program:

 default_startup_program
@@ -90,6 +120,8 @@ default_startup_program
 ..  autofunction:: paddle.fluid.default_startup_program
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_default_startup_program`
+
 .. _api_fluid_DistributeTranspiler:

 DistributeTranspiler
@@ -99,6 +131,8 @@ DistributeTranspiler
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_DistributeTranspiler`
+
 .. _api_fluid_DistributeTranspilerConfig:

 DistributeTranspilerConfig
@@ -108,6 +142,8 @@ DistributeTranspilerConfig
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_DistributeTranspilerConfig`
+
 .. _api_fluid_ExecutionStrategy:

 ExecutionStrategy
@@ -117,6 +153,8 @@ ExecutionStrategy
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_ExecutionStrategy`
+
 .. _api_fluid_Executor:

 Executor
@@ -126,6 +164,8 @@ Executor
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_Executor`
+
 .. _api_fluid_global_scope:

 global_scope
@@ -134,6 +174,8 @@ global_scope
 ..  autofunction:: paddle.fluid.global_scope
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_global_scope`
+
 .. _api_fluid_LoDTensor:

 LoDTensor
@@ -143,6 +185,8 @@ LoDTensor
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_LoDTensor`
+
 .. _api_fluid_LoDTensorArray:

 LoDTensorArray
@@ -152,6 +196,8 @@ LoDTensorArray
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_LoDTensorArray`
+
 .. _api_fluid_memory_optimize:

 memory_optimize
@@ -160,6 +206,8 @@ memory_optimize
 ..  autofunction:: paddle.fluid.memory_optimize
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_memory_optimize`
+
 .. _api_fluid_name_scope:

 name_scope
@@ -168,6 +216,8 @@ name_scope
 ..  autofunction:: paddle.fluid.name_scope
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_name_scope`
+
 .. _api_fluid_ParallelExecutor:

 ParallelExecutor
@@ -177,6 +227,8 @@ ParallelExecutor
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_ParallelExecutor`
+
 .. _api_fluid_ParamAttr:

 ParamAttr
@@ -186,6 +238,8 @@ ParamAttr
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_ParamAttr`
+
 .. _api_fluid_Program:

 Program
@@ -195,6 +249,8 @@ Program
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_Program`
+
 .. _api_fluid_program_guard:

 program_guard
@@ -203,6 +259,8 @@ program_guard
 ..  autofunction:: paddle.fluid.program_guard
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_program_guard`
+
 .. _api_fluid_release_memory:

 release_memory
@@ -211,6 +269,8 @@ release_memory
 ..  autofunction:: paddle.fluid.release_memory
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_release_memory`
+
 .. _api_fluid_Scope:

 Scope
@@ -220,6 +280,8 @@ Scope
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_Scope`
+
 .. _api_fluid_scope_guard:

 scope_guard
@@ -228,6 +290,8 @@ scope_guard
 ..  autofunction:: paddle.fluid.scope_guard
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_scope_guard`
+
 .. _api_fluid_Tensor:

 Tensor
@@ -237,6 +301,8 @@ Tensor
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_Tensor`
+
 .. _api_fluid_WeightNormParamAttr:

 WeightNormParamAttr
@@ -246,3 +312,5 @@ WeightNormParamAttr
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_WeightNormParamAttr`
+
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
 #!/bin/bash
 python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op > layers.rst

-for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average profiler
+for module in data_feeder clip metrics executor initializer io nets optimizer profiler regularizer transpiler recordio_writer backward average profiler
 do
  python gen_doc.py ${module} > ${module}.rst
 done

--- a/doc/fluid/api/index_cn.rst
+++ b/doc/fluid/api/index_cn.rst
 =============
-API 说明文档
+API Reference
 =============

 ..  toctree::
    :maxdepth: 1

-    api_guides/index.rst
-
    fluid.rst
    average.rst
    backward.rst

--- a/doc/fluid/api/index_en.rst
+++ b/doc/fluid/api/index_en.rst
@@ -5,8 +5,6 @@ API Reference
 ..  toctree::
    :maxdepth: 1

-    api_guides/index.rst
-
    fluid.rst
    average.rst
    backward.rst

--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -14,6 +14,8 @@ Bilinear
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_Bilinear`
+
 .. _api_fluid_initializer_BilinearInitializer:

 BilinearInitializer
@@ -23,6 +25,8 @@ BilinearInitializer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_BilinearInitializer`
+
 .. _api_fluid_initializer_Constant:

 Constant
@@ -32,6 +36,8 @@ Constant
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_Constant`
+
 .. _api_fluid_initializer_ConstantInitializer:

 ConstantInitializer
@@ -41,6 +47,8 @@ ConstantInitializer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_ConstantInitializer`
+
 .. _api_fluid_initializer_force_init_on_cpu:

 force_init_on_cpu
@@ -49,6 +57,8 @@ force_init_on_cpu
 ..  autofunction:: paddle.fluid.initializer.force_init_on_cpu
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_force_init_on_cpu`
+
 .. _api_fluid_initializer_init_on_cpu:

 init_on_cpu
@@ -57,6 +67,8 @@ init_on_cpu
 ..  autofunction:: paddle.fluid.initializer.init_on_cpu
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_init_on_cpu`
+
 .. _api_fluid_initializer_MSRA:

 MSRA
@@ -66,6 +78,8 @@ MSRA
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_MSRA`
+
 .. _api_fluid_initializer_MSRAInitializer:

 MSRAInitializer
@@ -75,6 +89,8 @@ MSRAInitializer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_MSRAInitializer`
+
 .. _api_fluid_initializer_Normal:

 Normal
@@ -84,6 +100,8 @@ Normal
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_Normal`
+
 .. _api_fluid_initializer_NormalInitializer:

 NormalInitializer
@@ -93,6 +111,8 @@ NormalInitializer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_NormalInitializer`
+
 .. _api_fluid_initializer_TruncatedNormal:

 TruncatedNormal
@@ -102,6 +122,8 @@ TruncatedNormal
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_TruncatedNormal`
+
 .. _api_fluid_initializer_TruncatedNormalInitializer:

 TruncatedNormalInitializer
@@ -111,6 +133,8 @@ TruncatedNormalInitializer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_TruncatedNormalInitializer`
+
 .. _api_fluid_initializer_Uniform:

 Uniform
@@ -120,6 +144,8 @@ Uniform
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_Uniform`
+
 .. _api_fluid_initializer_UniformInitializer:

 UniformInitializer
@@ -129,6 +155,8 @@ UniformInitializer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_UniformInitializer`
+
 .. _api_fluid_initializer_Xavier:

 Xavier
@@ -138,6 +166,8 @@ Xavier
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_Xavier`
+
 .. _api_fluid_initializer_XavierInitializer:

 XavierInitializer
@@ -147,3 +177,5 @@ XavierInitializer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_initializer_XavierInitializer`
+
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -13,6 +13,8 @@ load_inference_model
 ..  autofunction:: paddle.fluid.io.load_inference_model
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_io_load_inference_model`
+
 .. _api_fluid_io_load_params:

 load_params
@@ -21,6 +23,8 @@ load_params
 ..  autofunction:: paddle.fluid.io.load_params
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_io_load_params`
+
 .. _api_fluid_io_load_persistables:

 load_persistables
@@ -29,6 +33,8 @@ load_persistables
 ..  autofunction:: paddle.fluid.io.load_persistables
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_io_load_persistables`
+
 .. _api_fluid_io_load_vars:

 load_vars
@@ -37,6 +43,8 @@ load_vars
 ..  autofunction:: paddle.fluid.io.load_vars
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_io_load_vars`
+
 .. _api_fluid_io_save_inference_model:

 save_inference_model
@@ -45,6 +53,8 @@ save_inference_model
 ..  autofunction:: paddle.fluid.io.save_inference_model
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_io_save_inference_model`
+
 .. _api_fluid_io_save_params:

 save_params
@@ -53,6 +63,8 @@ save_params
 ..  autofunction:: paddle.fluid.io.save_params
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_io_save_params`
+
 .. _api_fluid_io_save_persistables:

 save_persistables
@@ -61,6 +73,8 @@ save_persistables
 ..  autofunction:: paddle.fluid.io.save_persistables
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_io_save_persistables`
+
 .. _api_fluid_io_save_vars:

 save_vars
@@ -69,3 +83,5 @@ save_vars
 ..  autofunction:: paddle.fluid.io.save_vars
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_io_save_vars`
+
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
--- a/doc/fluid/api/metrics.rst
+++ b/doc/fluid/api/metrics.rst
@@ -14,6 +14,8 @@ Accuracy
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_metrics_Accuracy`
+
 .. _api_fluid_metrics_Auc:

 Auc
@@ -23,6 +25,8 @@ Auc
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_metrics_Auc`
+
 .. _api_fluid_metrics_ChunkEvaluator:

 ChunkEvaluator
@@ -32,6 +36,8 @@ ChunkEvaluator
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_metrics_ChunkEvaluator`
+
 .. _api_fluid_metrics_CompositeMetric:

 CompositeMetric
@@ -41,6 +47,8 @@ CompositeMetric
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_metrics_CompositeMetric`
+
 .. _api_fluid_metrics_DetectionMAP:

 DetectionMAP
@@ -50,6 +58,8 @@ DetectionMAP
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_metrics_DetectionMAP`
+
 .. _api_fluid_metrics_EditDistance:

 EditDistance
@@ -59,6 +69,8 @@ EditDistance
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_metrics_EditDistance`
+
 .. _api_fluid_metrics_MetricBase:

 MetricBase
@@ -68,6 +80,8 @@ MetricBase
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_metrics_MetricBase`
+
 .. _api_fluid_metrics_Precision:

 Precision
@@ -77,6 +91,8 @@ Precision
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_metrics_Precision`
+
 .. _api_fluid_metrics_Recall:

 Recall
@@ -86,3 +102,5 @@ Recall
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_metrics_Recall`
+
--- a/doc/fluid/api/nets.rst
+++ b/doc/fluid/api/nets.rst
@@ -13,6 +13,8 @@ glu
 ..  autofunction:: paddle.fluid.nets.glu
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_nets_glu`
+
 .. _api_fluid_nets_img_conv_group:

 img_conv_group
@@ -21,6 +23,8 @@ img_conv_group
 ..  autofunction:: paddle.fluid.nets.img_conv_group
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_nets_img_conv_group`
+
 .. _api_fluid_nets_scaled_dot_product_attention:

 scaled_dot_product_attention
@@ -29,6 +33,8 @@ scaled_dot_product_attention
 ..  autofunction:: paddle.fluid.nets.scaled_dot_product_attention
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_nets_scaled_dot_product_attention`
+
 .. _api_fluid_nets_sequence_conv_pool:

 sequence_conv_pool
@@ -37,6 +43,8 @@ sequence_conv_pool
 ..  autofunction:: paddle.fluid.nets.sequence_conv_pool
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_nets_sequence_conv_pool`
+
 .. _api_fluid_nets_simple_img_conv_pool:

 simple_img_conv_pool
@@ -45,3 +53,5 @@ simple_img_conv_pool
 ..  autofunction:: paddle.fluid.nets.simple_img_conv_pool
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_nets_simple_img_conv_pool`
+
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -14,6 +14,8 @@ Adadelta
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_Adadelta`
+
 .. _api_fluid_optimizer_Adagrad:

 Adagrad
@@ -23,6 +25,8 @@ Adagrad
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_Adagrad`
+
 .. _api_fluid_optimizer_AdagradOptimizer:

 AdagradOptimizer
@@ -32,6 +36,8 @@ AdagradOptimizer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_AdagradOptimizer`
+
 .. _api_fluid_optimizer_Adam:

 Adam
@@ -41,6 +47,8 @@ Adam
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_Adam`
+
 .. _api_fluid_optimizer_Adamax:

 Adamax
@@ -50,6 +58,8 @@ Adamax
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_Adamax`
+
 .. _api_fluid_optimizer_AdamaxOptimizer:

 AdamaxOptimizer
@@ -59,6 +69,8 @@ AdamaxOptimizer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_AdamaxOptimizer`
+
 .. _api_fluid_optimizer_AdamOptimizer:

 AdamOptimizer
@@ -68,6 +80,8 @@ AdamOptimizer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_AdamOptimizer`
+
 .. _api_fluid_optimizer_DecayedAdagrad:

 DecayedAdagrad
@@ -77,6 +91,8 @@ DecayedAdagrad
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_DecayedAdagrad`
+
 .. _api_fluid_optimizer_DecayedAdagradOptimizer:

 DecayedAdagradOptimizer
@@ -86,6 +102,8 @@ DecayedAdagradOptimizer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_DecayedAdagradOptimizer`
+
 .. _api_fluid_optimizer_Ftrl:

 Ftrl
@@ -95,6 +113,8 @@ Ftrl
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_Ftrl`
+
 .. _api_fluid_optimizer_FtrlOptimizer:

 FtrlOptimizer
@@ -104,6 +124,30 @@ FtrlOptimizer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_FtrlOptimizer`
+
+.. _api_fluid_optimizer_LarsMomentum:
+
+LarsMomentum
+------------
+
+..  autoclass:: paddle.fluid.optimizer.LarsMomentum
+    :members:
+    :noindex:
+
+Read Chinese Version: :ref:`cn_api_fluid_optimizer_LarsMomentum`
+
+.. _api_fluid_optimizer_LarsMomentumOptimizer:
+
+LarsMomentumOptimizer
+---------------------
+
+..  autoclass:: paddle.fluid.optimizer.LarsMomentumOptimizer
+    :members:
+    :noindex:
+
+Read Chinese Version: :ref:`cn_api_fluid_optimizer_LarsMomentumOptimizer`
+
 .. _api_fluid_optimizer_ModelAverage:

 ModelAverage
@@ -113,6 +157,8 @@ ModelAverage
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_ModelAverage`
+
 .. _api_fluid_optimizer_Momentum:

 Momentum
@@ -122,6 +168,8 @@ Momentum
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_Momentum`
+
 .. _api_fluid_optimizer_MomentumOptimizer:

 MomentumOptimizer
@@ -131,14 +179,7 @@ MomentumOptimizer
    :members:
    :noindex:

-.. _api_fluid_optimizer_RMSPropOptimizer:
-
-RMSPropOptimizer
----------------
-
-..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
-    :members:
-    :noindex:
+Read Chinese Version: :ref:`cn_api_fluid_optimizer_MomentumOptimizer`

 .. _api_fluid_optimizer_RMSPropOptimizer:

@@ -149,6 +190,8 @@ RMSPropOptimizer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_RMSPropOptimizer`
+
 .. _api_fluid_optimizer_SGD:

 SGD
@@ -158,6 +201,8 @@ SGD
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_SGD`
+
 .. _api_fluid_optimizer_SGDOptimizer:

 SGDOptimizer
@@ -167,3 +212,5 @@ SGDOptimizer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_optimizer_SGDOptimizer`
+
--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -13,6 +13,8 @@ cuda_profiler
 ..  autofunction:: paddle.fluid.profiler.cuda_profiler
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_profiler_cuda_profiler`
+
 .. _api_fluid_profiler_profiler:

 profiler
@@ -21,6 +23,8 @@ profiler
 ..  autofunction:: paddle.fluid.profiler.profiler
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_profiler_profiler`
+
 .. _api_fluid_profiler_reset_profiler:

 reset_profiler
@@ -29,6 +33,8 @@ reset_profiler
 ..  autofunction:: paddle.fluid.profiler.reset_profiler
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_profiler_reset_profiler`
+
 .. _api_fluid_profiler_start_profiler:

 start_profiler
@@ -37,6 +43,8 @@ start_profiler
 ..  autofunction:: paddle.fluid.profiler.start_profiler
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_profiler_start_profiler`
+
 .. _api_fluid_profiler_stop_profiler:

 stop_profiler
@@ -45,3 +53,5 @@ stop_profiler
 ..  autofunction:: paddle.fluid.profiler.stop_profiler
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_profiler_stop_profiler`
+
--- a/doc/fluid/api/recordio_writer.rst
+++ b/doc/fluid/api/recordio_writer.rst
@@ -13,6 +13,8 @@ convert_reader_to_recordio_file
 ..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_recordio_writer_convert_reader_to_recordio_file`
+
 .. _api_fluid_recordio_writer_convert_reader_to_recordio_files:

 convert_reader_to_recordio_files
@@ -21,3 +23,5 @@ convert_reader_to_recordio_files
 ..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_recordio_writer_convert_reader_to_recordio_files`
+
--- a/doc/fluid/api/regularizer.rst
+++ b/doc/fluid/api/regularizer.rst
@@ -14,6 +14,8 @@ L1Decay
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_regularizer_L1Decay`
+
 .. _api_fluid_regularizer_L1DecayRegularizer:

 L1DecayRegularizer
@@ -23,6 +25,8 @@ L1DecayRegularizer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_regularizer_L1DecayRegularizer`
+
 .. _api_fluid_regularizer_L2Decay:

 L2Decay
@@ -32,6 +36,8 @@ L2Decay
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_regularizer_L2Decay`
+
 .. _api_fluid_regularizer_L2DecayRegularizer:

 L2DecayRegularizer
@@ -41,3 +47,5 @@ L2DecayRegularizer
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_regularizer_L2DecayRegularizer`
+
--- a/doc/fluid/api/transpiler.rst
+++ b/doc/fluid/api/transpiler.rst
@@ -14,6 +14,8 @@ DistributeTranspiler
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_transpiler_DistributeTranspiler`
+
 .. _api_fluid_transpiler_DistributeTranspilerConfig:

 DistributeTranspilerConfig
@@ -23,6 +25,8 @@ DistributeTranspilerConfig
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_transpiler_DistributeTranspilerConfig`
+
 .. _api_fluid_transpiler_HashName:

 HashName
@@ -32,6 +36,8 @@ HashName
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_transpiler_HashName`
+
 .. _api_fluid_transpiler_memory_optimize:

 memory_optimize
@@ -40,6 +46,8 @@ memory_optimize
 ..  autofunction:: paddle.fluid.transpiler.memory_optimize
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_transpiler_memory_optimize`
+
 .. _api_fluid_transpiler_release_memory:

 release_memory
@@ -48,6 +56,8 @@ release_memory
 ..  autofunction:: paddle.fluid.transpiler.release_memory
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_transpiler_release_memory`
+
 .. _api_fluid_transpiler_RoundRobin:

 RoundRobin
@@ -57,3 +67,5 @@ RoundRobin
    :members:
    :noindex:

+Read Chinese Version: :ref:`cn_api_fluid_transpiler_RoundRobin`
+
--- a/doc/fluid/api_cn/api_guides/high_low_level_api.md
+++ b/doc/fluid/api_cn/api_guides/high_low_level_api.md
+## High/Low-level API简介
+
+Paddle目前有2套API接口：
+
+- Low-level（底层） API：
+	
+	- 灵活性强并且已经相对成熟，使用它训练的模型，能直接支持C++预测上线。
+	- 提供了大量的模型作为使用示例，包括[Book](https://github.com/PaddlePaddle/book)中的第7和8章，以及[models](https://github.com/PaddlePaddle/models)中的所有章节。
+	- 适用人群：对深度学习有一定了解，需要自定义网络进行训练/预测/上线部署的用户。
+
+- High-level（高层）API：
+	
+	- 使用简单，[Book](https://github.com/PaddlePaddle/book)中前六章提供了示例。
+	- 尚未成熟，接口暂时在[paddle.fluid.contrib](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/contrib)下面。
+	- 适用人群：想通过Book课程进行深度学习基础知识学习的初级用户。
--- a/doc/fluid/api_cn/api_guides/index.rst
+++ b/doc/fluid/api_cn/api_guides/index.rst
+===========
+API使用指南
+===========
+
+..  toctree::
+    :hidden:
+
+    high_low_level_api.md
+    low_level/layers/index.rst
+    low_level/executor.rst
+    low_level/optimizer.rst
+    low_level/metrics.rst
+    low_level/model_save_reader.rst
+    low_level/inference.rst
+    low_level/distributed/index.rst
+    low_level/memory_optimize.rst
+    low_level/nets.rst
+    low_level/parallel_executor.rst
--- a/doc/fluid/api_cn/api_guides/low_level/distributed/async_training.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/distributed/async_training.rst
+.. _api_guide_async_training:
+
+############
+分布式异步训练
+############
+
+Fluid支持数据并行的分布式异步训练，API使用 :code:`DistributeTranspiler` 将单机网络配置转换成可以多机执行的
+:code:`pserver` 端程序和 :code:`trainer` 端程序。用户在不同的节点执行相同的一段代码，根据环境变量或启动参数，
+可以执行对应的 :code:`pserver` 或 :code:`trainer` 角色。Fluid异步训练只支持pserver模式，异步训练和 `同步训练 <../distributed/sync_training.html>`_ 的主要差异在于：异步训练每个trainer的梯度是单独更新到参数上的，
+而同步训练是所有trainer的梯度合并之后统一更新到参数上，因此，同步训练和异步训练的超参数需要分别调节。
+
+pserver模式分布式异步训练
+======================
+
+API详细使用方法参考 :ref:`api_fluid_DistributeTranspiler` ，简单示例用法：
+
+.. code-block:: python
+
+    config = fluid.DistributeTranspilerConfig()
+    # 配置策略config
+    config.slice_var_up = False
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id, 
+                program=main_program,
+                pservers="192.168.0.1:6174,192.168.0.2:6174",
+                trainers=1,
+                sync_mode=False)
+
+以上参数说明请参考 `同步训练 <../distributed/sync_training.html>`_
+
+需要注意的是：进行异步训练时，请修改 :code:`sync_mode` 的值
+
+- :code:`sync_mode` ： 是否是同步训练模式，默认为True，不传此参数也默认是同步训练模式，设置为False则为异步训练
--- a/doc/fluid/api_cn/api_guides/low_level/distributed/cluster_train_data_cn.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/distributed/cluster_train_data_cn.rst
+..  _api_guide_cluster_train_data:
+
+####################
+分布式训练reader准备
+####################
+
+一个数据并行的分布式训练任务通常会含有多个训练进程，每个训练进程处理整个数据集中的一部分，根据当前进程的唯一序号(trainer_id)以及训练进程总数(trainers)可以决定当前训练进程应该读取哪一部分数据。
+
+实现 cluster_reader 来读取分布式训练数据集
+----------------------------------------
+
+比较通用的方法，可以实现一个 cluster_reader, 根据训练进程数量以及进程序号决定读取哪些 example:
+
+    .. code-block:: python
+        
+        def cluster_reader(reader, trainers, trainer_id):
+            def reader_creator():
+                for idx, data in enumerate(reader()):
+                    if idx % trainers == trainer_id:
+                        yield data
+            return reader_creator
+
+        trainers = int(os.getenv("PADDLE_TRAINERS", "1"))
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        train_reader = cluster_reader(paddle.dataset.mnist.train(), trainers, trainer_id)
+
+上述代码中，`trainers` 和 `trainer_id` 分别是训练进程总数和当前训练进程的序号，可以通过环境变量或者参数的方式传递给 Python 程序。
+
+预先切分训练文件
+-----------------
+
+由于使用 `cluster_reader` 依然会读取全量数据，对于训练进程比较多的任务，会造成IO资源的浪费、影响训练性能。另一种方法是可以将训练数据切分成多个小文件，每个进程处理其中的一部分文件,
+例如在 Linux 系统中可以使用 `split <http://man7.org/linux/man-pages/man1/split.1.html>`_ 命令将训练数据切分成多个小文件：
+
+  .. code-block:: bash
+
+    $ split -d -a 4 -l 100 housing.data cluster/housing.data.
+    $ find ./cluster
+    cluster/
+    cluster/housing.data.0002
+    cluster/housing.data.0003
+    cluster/housing.data.0004
+    cluster/housing.data.0000
+    cluster/housing.data.0001
+    cluster/housing.data.0005
+
+数据切分好以后, 可以实现一个 file_dispatcher 函数，根据训练进程数量以及序号决定需要读取哪些文件：
+
+    .. code-block:: python
+
+        def file_dispatcher(files_pattern, trainers, trainer_id):
+            file_list = glob.glob(files_pattern)
+            ret_list = []
+            for idx, f in enumerate(file_list):
+                if (idx + trainers) % trainers == trainer_id:
+                    ret_list.append(f)
+            return ret_list
+        
+        trainers = int(os.getenv("PADDLE_TRAINERS", "1"))
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        files_pattern = "cluster/housing.data.*"
+
+        my_files = file_dispatcher(files_pattern, trainers, trainer_id)
+
+在上述例子中，`files_pattern` 是训练文件的 `glob 表达式 <https://docs.python.org/2.7/library/glob.html>`_，一般可以用通配符来表示。
--- a/doc/fluid/api_cn/api_guides/low_level/distributed/cpu_train_best_practice.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/distributed/cpu_train_best_practice.rst
+.. _api_guide_cpu_training_best_practice:
+
+##################
+分布式CPU训练最佳实践
+##################
+
+提高CPU分布式训练的训练速度，主要要从两个方面来考虑：
+1）提高训练速度，主要是提高CPU的使用率；2）提高通信速度，主要是减少通信传输的数据量。
+
+提高CPU的使用率
+=============
+
+提高CPU使用率主要依赖 :code:`ParallelExecutor`，可以充分利用多个CPU的计算能力来加速计算。
+
+API详细使用方法参考 :ref:`api_fluid_ParallelExecutor` ，简单实例用法：
+
+.. code-block:: python
+
+    # 配置执行策略，主要是设置线程数
+    exec_strategy = fluid.ExecutionStrategy()
+    exec_strategy.num_threads = 8
+
+    # 配置构图策略，对于CPU训练而言，应该使用Reduce模式进行训练
+    build_strategy = fluid.BuildStrategy()
+    if int(os.getenv("CPU_NUM")) > 1:
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+
+    pe = fluid.ParallelExecutor(
+        use_cuda=False,
+        loss_name=avg_cost.name,
+        main_program=main_program,
+        build_strategy=build_strategy,
+        exec_strategy=exec_strategy)
+
+以上参数中：
+
+- :code:`num_threads` ： 模型训练使用的线程数，最好和训练所在机器的物理CPU核数接近
+- :code:`reduce_strategy` ： 对于CPU训练而言，应该选择 fluid.BuildStrategy.ReduceStrategy.Reduce
+
+
+通用环境变量配置：
+
+- :code:`CPU_NUM` ：模型副本replica的个数，最好和num_threads一致
+
+
+提高通信速度
+==========
+
+要减少通信数据量，提高通信速度，主要是使用稀疏更新 ，目前支持 `稀疏更新 <../distributed/sparse_update.html>`_  的主要是  :ref:`api_fluid_layers_embedding` 。
+
+.. code-block:: python
+
+    data = fluid.layers.data(name='ids', shape=[1], dtype='int64')
+    fc = fluid.layers.embedding(input=data, size=[dict_size, 16], is_sparse=True)
+
+以上参数中：
+
+- :code:`is_sparse` ： 配置embedding使用稀疏更新，如果embedding的dict_size很大，而每次数据data很少，建议使用sparse更新方式。
--- a/doc/fluid/api_cn/api_guides/low_level/distributed/index.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/distributed/index.rst
+=============
+分布式训练
+=============
+
+..  toctree::
+    :maxdepth: 1
+
+    sync_training.rst
+    async_training.rst
+    cpu_train_best_practice.rst
+    large_scale_sparse_feature_training.rst
+    cluster_train_data_cn.rst
+
+
--- a/doc/fluid/api_cn/api_guides/low_level/distributed/large_scale_sparse_feature_training.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/distributed/large_scale_sparse_feature_training.rst
+.. _api_guide_large_scale_sparse_feature_training:
+
+###################
+大规模稀疏特征模型训练
+###################
+
+
+模型配置和训练
+=============
+
+embedding被广泛应用在各种网络结构中，尤其是文本处理相关的模型。在某些场景，例如推荐系统或者搜索引擎中，
+embedding的feature id可能会非常多，当feature id达到一定数量时，embedding参数会变得很大，
+会带来两个问题：
+1）单机内存由于无法存放如此巨大的embedding参数，导致无法训练；
+2）普通的训练模式每一轮迭代都需要同步完整的参数，参数太大会让通信变得非常慢，进而影响训练速度。
+
+Fluid支持千亿量级超大规模稀疏特征embedding的训练，embedding参数只会保存在parameter server上，通过
+参数prefetch和梯度稀疏更新的方法，大大减少通信量，提高通信速度。
+
+该功能只对分布式训练有效，单机无法使用。
+需要配合 `稀疏更新 <../distributed/sparse_update.html>`_ 一起使用。
+
+使用方法：在配置embedding的时候，加上参数 :code:`is_distributed=True` 以及 :code:`is_sparse=True` 即可。
+参数 :code:`dict_size` 定义数据中总的id的数量，id可以是int64范围内的任意值，只要总id个数小于等于dict_size就可以支持。
+所以配置之前需要预估一下数据中总的feature id的数量。
+
+.. code-block:: python
+
+  emb = fluid.layers.embedding(
+      input=input,
+      size=[dict_size, embedding_width],
+      is_sparse=True,
+      is_distributed=True)
+
+
+模型存储和预测
+=============
+
+当特征数量达到千亿的时候，参数量很大，单机已经无法存下，所以模型的存储和加载都和普通模式不同：
+1）普通模式下，参数是在trainer端保存和加载的；
+2）分布式模式下，参数的保存和加载，都是在pserver端进行，每个pserver只保存和加载该pserver自身对应部分的参数
--- a/doc/fluid/api_cn/api_guides/low_level/distributed/sync_training.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/distributed/sync_training.rst
+.. _api_guide_sync_training:
+
+############
+分布式同步训练
+############
+
+Fluid支持数据并行的分布式同步训练，API使用 :code:`DistributeTranspiler` 将单机网络配置转换成可以多机执行的
+:code:`pserver` 端程序和 :code:`trainer` 端程序。用户在不同的节点执行相同的一段代码，根据环境变量或启动参数，
+可以执行对应的 :code:`pserver` 或 :code:`trainer` 角色。Fluid分布式同步训练同时支持pserver模式和NCCL2模式，
+在API使用上有差别，需要注意。
+
+pserver模式分布式训练
+===================
+
+API详细使用方法参考 :ref:`api_fluid_transpiler_DistributeTranspiler` ，简单实例用法：
+
+.. code-block:: python
+
+    config = fluid.DistributeTranspilerConfig()
+    # 配置策略config
+    config.slice_var_up = False
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id, 
+                program=main_program,
+                pservers="192.168.0.1:6174,192.168.0.2:6174",
+                trainers=1,
+                sync_mode=True)
+
+以上参数中：
+
+- :code:`trainer_id` ： trainer节点的id，从0到n-1，n为当前训练任务中trainer节点的个数
+- :code:`program` ： 被转换的 :code:`program` 默认使用 :code:`fluid.default_main_program()`
+- :code:`pservers` ： 当前训练任务中pserver节点的IP端口列表
+- :code:`trainers` ： int类型，当前训练任务中trainer节点的个数。注意：
+    * pserver模式下，trainer节点个数可以和pserver节点个数不一致，比如使用20个pserver和50个trainer。在实际训练任务中，您可以通过调整pserver节点和trainer节点个数找到最佳性能
+    * NCCL2模式中，此项参数是字符串，指定trainer节点的IP端口列表
+- :code:`sync_mode` ： 是否是同步训练模式，默认为True，不传此参数也默认是同步训练模式
+
+
+其中，支持的config包括：
+
+- :code:`slice_var_up` ： 配置是否切分一个参数到多个pserver上进行优化，默认开启。此选项适用于模型参数个数少，但需要使用大量节点的场景，有利于提升pserver端计算并行度
+- :code:`split_method` ： 配置transpiler分配参数（或参数的切片）到多个pserver的方式，默认为"RoundRobin"，也可以使用"HashName"
+- :code:`min_block_size` ： 如果配置了参数切分，指定最小Tensor的切分大小，防止RPC请求包过小，默认为8192，一般情况不需要调整此项参数
+- :code:`enable_dc_asgd` ： 是否开启 :code:`DC-ASGD` 此选项在异步训练中生效，启用异步训练补偿算法
+- :code:`mode` : 可以选择"pserver"或"nccl2"，指定使用pserver模式或NCCL2模式分布式训练
+- :code:`print_log` ： 是否开启transpiler debug日志，此项为开发调试使用
+
+通用环境变量配置：
+
+- :code:`FLAGS_rpc_send_thread_num` ：int，指定RPC通信发送时线程的个数
+- :code:`FLAGS_rpc_get_thread_num` ： int，指定RPC通信接收时线程的个数
+- :code:`FLAGS_rpc_prefetch_thread_num` ： int，分布式lookup table执行RPC通信时，prefetch线程的个数
+- :code:`FLAGS_rpc_deadline` ： int，RPC通信最长等待时间，单位为毫秒，默认180000
+
+
+NCCL2模式分布式训练
+=================
+
+基于NCCL2 (Collective Communication) 的多机同步训练模式，仅支持在GPU集群下进行。
+此部分详细API说明可以参考 :ref:`api_fluid_transpiler_DistributeTranspiler` 。
+
+注意：NCCL2模式下，集群不需要启动pserver，只需要启动多个trainer节点即可。
+
+使用以下代码，将当前 :code:`Program` 转化成适用于NCCL2分布式计算的Fluid :code:`Program` ：
+
+.. code-block:: python
+
+    config = fluid.DistributeTranspilerConfig()
+    config.mode = "nccl2"
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id, 
+                program=main_program,
+                startup_program=startup_program,
+                trainers="192.168.0.1:6174,192.168.0.2:6174",
+                current_endpoint="192.168.0.1:6174")
+
+其中：
+
+- :code:`trainer_id` : trainer节点的id，从0到n-1，n为当前训练任务中trainer节点的个数
+- :code:`program` 和 :code:`startup_program` : 分别为Fluid 模型的主配置program和初始化startup_program
+- :code:`trainers` : 字符串类型，指定当前任务所有trainer的IP和端口号，仅用于NCCL2初始化（pserver模式中，此参数为int，指定trainer节点的个数）
+- :code:`current_endpoint` : 当前任务的当前节点的IP和端口号
--- a/doc/fluid/api_cn/api_guides/low_level/executor.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/executor.rst
+..  _api_guide_executor:
+
+##########
+执行引擎
+##########
+
+:code:`Executor` 即 :code:`执行器` 。PaddlePaddle Fluid中有两种执行器可以选择。
+:code:`Executor` 实现了一个简易的执行器，所有Operator会被顺序执行。用户可以使用
+Python脚本驱动 :code:`Executor` 执行。默认情况下 :code:`Executor` 是单线程的，如果
+想使用数据并行，请参考另一个执行器， :ref:`api_guide_parallel_executor` 。
+
+:code:`Executor` 的代码逻辑非常简单。建议用户在调试过程中，先使用
+:code:`Executor` 跑通模型，再切换到多设备计算，甚至多机计算。
+
+:code:`Executor` 在构造的时候接受一个 :code:`Place`， 它们可以是 :ref:`api_fluid_CPUPlace`
+或 :ref:`api_fluid_CUDAPlace` 。 :code:`Executor` 在执行的时候可以选择执行的
+:ref:`api_guide_low_level_program` 。
+
+简单的使用方法，请参考 `quick_start_fit_a_line <http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/quick_start/fit_a_line/README.cn.html>`_ , API Reference 请参考
+:ref:`api_fluid_Executor` 。
--- a/doc/fluid/api_cn/api_guides/low_level/inference.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/inference.rst
+..  _api_guide_inference:
+
+#########
+预测引擎
+#########
+
+预测引擎提供了存储预测模型 :ref:`api_fluid_io_save_inference_model` 和加载预测模型 :ref:`api_fluid_io_load_inference_model` 两个接口。
+
+预测模型的存储格式
+=================
+
+预测模型的存储格式有两种，由上述两个接口中的 :code:`model_filename` 和 :code:`params_filename` 变量控制：
+
+- 参数保存到各个独立的文件，如设置 :code:`model_filename` 为 :code:`None` 、:code:`params_filename` 为 :code:`None`
+
+  .. code-block:: bash
+
+      ls recognize_digits_conv.inference.model/*
+      __model__ conv2d_1.w_0 conv2d_2.w_0 fc_1.w_0 conv2d_1.b_0 conv2d_2.b_0 fc_1.b_0
+
+- 参数保存到同一个文件，如设置 :code:`model_filename` 为 :code:`None` 、:code:`params_filename` 为 :code:`__params__`
+
+  .. code-block:: bash
+
+      ls recognize_digits_conv.inference.model/*
+      __model__ __params__
+
+存储预测模型
+===========
+
+.. code-block:: python
+
+    exe = fluid.Executor(fluid.CPUPlace())
+    path = "./infer_model"
+    fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'], 
+        target_vars=[predict_var], executor=exe)
+
+在这个示例中，:code:`fluid.io.save_inference_model` 接口对默认的 :code:`fluid.Program` 进行裁剪，只保留预测 :code:`predict_var` 所需部分。
+裁剪后的 :code:`program` 会保存在 :code:`./infer_model/__model__` 下，参数会保存到 :code:`./infer_model` 下的各个独立文件。
+
+加载预测模型
+===========
+
+.. code-block:: python
+
+    exe = fluid.Executor(fluid.CPUPlace())
+    path = "./infer_model"
+    [inference_program, feed_target_names, fetch_targets] = (
+        fluid.io.load_inference_model(dirname=path, executor=exe))
+    results = exe.run(inference_program,
+                  feed={feed_target_names[0]: tensor_img},
+                  fetch_list=fetch_targets)
+
+在这个示例中，首先调用 :code:`fluid.io.load_inference_model` 接口，获得预测的 :code:`program` 、输入数据的 :code:`variable` 名称和输出结果的 :code:`variable` ;
+然后调用 :code:`executor` 执行预测的 :code:`program` 获得预测结果。
--- a/doc/fluid/api_cn/api_guides/low_level/layers/activations.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/activations.rst
+.. _api_guide_activations:
+
+########
+激活函数
+########
+
+激活函数将非线性的特性引入到神经网络当中。
+
+PaddlePaddle Fluid 对大部分的激活函数进行了支持，其中有:        
+
+:ref:`api_fluid_layers_relu`, :ref:`api_fluid_layers_tanh`, :ref:`api_fluid_layers_sigmoid`, :ref:`api_fluid_layers_elu`, :ref:`api_fluid_layers_relu6`, :ref:`api_fluid_layers_pow`, :ref:`api_fluid_layers_stanh`, :ref:`api_fluid_layers_hard_sigmoid`, :ref:`api_fluid_layers_swish`, :ref:`api_fluid_layers_prelu`, :ref:`api_fluid_layers_brelu`, :ref:`api_fluid_layers_leaky_relu`, :ref:`api_fluid_layers_soft_relu`, :ref:`api_fluid_layers_thresholded_relu`, :ref:`api_fluid_layers_maxout`, :ref:`api_fluid_layers_logsigmoid`, :ref:`api_fluid_layers_hard_shrink`, :ref:`api_fluid_layers_softsign`, :ref:`api_fluid_layers_softplus`, :ref:`api_fluid_layers_tanh_shrink`, :ref:`api_fluid_layers_softshrink`, :ref:`api_fluid_layers_exp`。
+ 
+
+**Fluid提供了两种使用激活函数的方式：**
+
+- 如果一个层的接口提供了 :code:`act` 变量（默认值为None），我们可以通过该变量指定该层的激活函数类型。该方式支持常见的激活函数: :code:`relu`, :code:`tanh`, :code:`sigmoid`, :code:`identity`。
+
+.. code-block:: python
+
+	conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
+
+
+- Fluid为每个Activation提供了接口，我们可以显式的对它们进行调用。
+
+.. code-block:: python
+
+	conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3)
+	relu1 = fluid.layers.relu(conv2d)
--- a/doc/fluid/api_cn/api_guides/low_level/layers/control_flow.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/control_flow.rst
+.. _api_guide_control_flow:
+
+######
+控制流
+######
+
+在程序语言中，控制流(control flow)决定了语句的执行顺序，常见的控制流包括顺序执行、分支和循环等。PaddlePaddle Fluid继承了这一概念，提供了多种控制流API, 以控制深度学习模型在训练或者预测过程中的执行逻辑。
+
+IfElse
+======
+
+条件分支，允许对同一个batch的输入，根据给定的条件，分别选择 :code:`true_block` 或 :code:`false_block` 中的逻辑进行执行，执行完成之后再将两个分支的输出合并为同一个输出。通常，条件表达式可由 :ref:`api_fluid_layers_less_than`, :ref:`api_fluid_layers_equal` 等逻辑比较 API 产生。
+
+请参考 :ref:`api_fluid_layers_IfElse`            
+
+
+Switch
+======
+
+多分支选择结构，如同程序语言中常见的 :code:`switch-case` 声明, 其根据输入表达式的取值不同，选择不同的分支执行。具体来说，Fluid 所定义的 :code:`Switch` 控制流有如下特性：
+
+* case的条件是个bool类型的值，即在Program中是一个张量类型的Variable；
+* 依次检查逐个case，选择第一个满足条件的case执行，完成执行后即退出所属的block；
+* 如果所有case均不满足条件，会选择默认的case进行执行。
+
+请参考 :ref:`api_fluid_layers_Switch`
+
+While
+=====
+
+While 循环，当条件判断为真时，循环执行 :code:`While` 控制流所属 :code:`block` 内的逻辑，条件判断为假时退出循环。与之相关的API有
+
+* :ref:`api_fluid_layers_increment` ：累加API，通常用于对循环次数进行计数；
+* :ref:`api_fluid_layers_array_read` ：从 :code:`LOD_TENSOR_ARRAY` 中指定的位置读入Variable，进行计算；
+* :ref:`api_fluid_layers_array_write` ：将 Variable 写回到 :code:`LOD_TENSOR_ARRAY` 指定的位置，存储计算结果。
+
+请参考 :ref:`api_fluid_layers_While`
+
+DynamicRNN
+==========
+
+即动态RNN，可处理一个batch不等长的序列数据，其接受 :code:`lod_level=1` 的 Variable 作为输入，在 :code:`DynamicRNN` 的 :code:`block` 内，用户需自定义RNN的单步计算逻辑。在每一个时间步，用户可将需记忆的状态写入到 :code:`DynamicRNN` 的 :code:`memory` 中，并将需要的输出写出到其 :code:`output` 中。
+
+:ref:`api_fluid_layers_sequence_last_step` 可获取 :code:`DynamicRNN` 最后一个时间步的输出。
+
+请参考 :ref:`api_fluid_layers_DynamicRNN`
+
+StaticRNN
+=========
+
+即静态RNN，只能处理固定长度的序列数据，接受 :code:`lod_level=0` 的 Variable 作为输入。与 :code:`DynamicRNN` 类似，在RNN的每单个时间步，用户需自定义计算逻辑，并可将状态和输出写出。
+
+请参考 :ref:`api_fluid_layers_StaticRNN`
--- a/doc/fluid/api_cn/api_guides/low_level/layers/conv.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/conv.rst
+.. _api_guide_conv:
+
+#####
+卷积
+#####
+
+卷积有两组输入：特征图和卷积核，依据输入特征和卷积核的形状、Layout不同、计算方式的不同，在Fluid里，有针对变长序列特征的一维卷积，有针对定长图像特征的二维(2D Conv)、三维卷积(3D Conv)，同时也有卷积计算的逆向过程，下面先介绍Fluid里的2D/3D卷积，再来介绍序列卷积。
+
+
+2D/3D卷积
+==============
+
+1. 卷积输入参数：
+---------------------
+
+卷积需要依据滑动步长(stride)、填充长度(padding)、卷积核窗口大小(filter size)、分组数(groups)、扩张系数(dilation rate)来决定如何计算。groups最早在 `AlexNet <https://www.nvidia.cn/content/tesla/pdf/machine-learning/imagenet-classification-with-deep-convolutional-nn.pdf>`_ 中引入, 可以理解为将原始的卷积分为独立若干组卷积计算。
+  
+  **注意**: 同cuDNN的方式，Fluid目前只支持在特征图上下填充相同的长度，左右也是。
+
+- 输入输出Layout: 
+
+  2D卷积输入特征的Layout为[N, C, H, W]或[N, H, W, C], N即batch size，C是通道数，H、W是特征的高度和宽度，输出特征和输入特征的Layout一致。(相应的3D卷积输入特征的Layout为[N, C, D, H, W]或[N, D, H, W, C]，但 **注意**，Fluid的卷积当前只支持[N, C, H, W]，[N, C, D, H, W]。)
+   
+- 卷积核的Layout: 
+  
+  Fluid中2D卷积的卷积核(也称权重)的Layout为[C_o, C_in / groups, f_h, f_w]，C_o、C_in表示输出、输入通道数，f_h、f_w表示卷积核窗口的高度和宽度，按行序存储。(相应的3D卷积的卷积核Layout为[C_o, C_in / groups, f_d, f_h, f_w]，同样按行序存储。)
+  
+- 深度可分离卷积(depthwise separable convolution): 
+   
+  在深度可分离卷积中包括depthwise convolution和pointwise convolution两组，这两个卷积的接口和上述普通卷积接口相同。前者可以通过给普通卷积设置groups来做，后者通过设置卷积核filters的大小为1x1，深度可分离卷积减少参数的同时减少了计算量。
+  
+  对于depthwise convolution，可以设置groups等于输入通道数，此时，2D卷积的卷积核形状为[C_o, 1, f_h, f_w]。
+  对于pointwise convolution，卷积核的形状为[C_o, C_in, 1, 1]。
+  
+  **注意**：Fluid针对depthwise convolution的GPU计算做了高度优化，您可以通过在 :code:`fluid.layers.conv2d` 接口设置 :code:`use_cudnn=False` 来使用Fluid自身优化的CUDA程序。
+   
+- 空洞卷积(dilated convolution):
+  
+  空洞卷积相比普通卷积而言，卷积核在特征图上取值时不再连续，而是间隔的，这个间隔数称作dilation，等于1时，即为普通卷积，空洞卷积相比普通卷积的感受野更大。
+  
+- API汇总:
+
+  - :ref:`api_fluid_layers_conv2d`
+  - :ref:`api_fluid_layers_conv3d`
+  - :ref:`api_fluid_layers_conv2d_transpose`
+  - :ref:`api_fluid_layers_conv3d_transpose`
+
+
+1D序列卷积
+==============
+
+Fluid可以表示变长的序列结构，这里的变长是指不同样本的时间步(step)数不一样，通常是一个2D的Tensor和一个能够区分的样本长度的辅助结构来表示。假定，2D的Tensor的形状是shape，shape[0]是所有样本的总时间步数，shape[1]是序列特征的大小。
+
+基于此数据结构的卷积在Fluid里称作序列卷积，也表示一维卷积。同图像卷积，序列卷积的输入参数有卷积核大小、填充大小、滑动步长，但与2D卷积不同的是，这些参数个数都为1。**注意**，目前仅支持stride为1的情况，输出序列的时间步数和输入序列相同。 
+
+假如：输入序列形状为(T, N)， T即该序列的时间步数，N是序列特征大小；卷积核的上下文步长为K，输出序列特征大小为M，则卷积核权重形状为(K * N, M)，输出序列形状为(T, M)。
+  
+另外，参考DeepSpeech，Fluid实现了行卷积row convolution, 或称
+`look ahead convolution <http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf>`_ ，
+该卷积相比上述普通序列卷积可以减少参数。
+ 
+
+- API汇总:
+
+  - :ref:`api_fluid_layers_sequence_conv`
+  - :ref:`api_fluid_layers_row_conv`
--- a/doc/fluid/api_cn/api_guides/low_level/layers/data_feeder.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/data_feeder.rst
+..  _api_guide_data_feeder:
+
+使用DataFeeder传入训练/预测数据
+###################################
+
+Fluid提供 :code:`DataFeeder` 类，将numpy array等数据转换为 :code:`LoDTensor` 类型传入训练/预测网络。
+
+用户创建 :code:`DataFeeder` 对象的方式为：
+
+.. code-block:: python
+
+    import paddle.fluid as fluid
+
+    image = fluid.layers.data(name='image', shape=[-1, 3, 224, 224], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[-1, 1], dtype='int64')
+    place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
+    feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
+
+其中，:code:`feed_list` 参数为变量列表，这些变量由 :code:`fluid.layers.data()` 创建，
+:code:`place` 参数表示应将Python端传入的numpy array等数据转换为GPU端或是CPU端的 :code:`LoDTensor` 。
+创建 :code:`DataFeeder` 对象后，用户可调用其 :code:`feed(iterable)` 方法将用户传入的
+:code:`iterable` 数据转换为 :code:`LoDTensor`。
+
+:code:`iterable` 应为Python List或Tuple类型对象，且 :code:`iterable` 的每个元素均为长度为N的
+Python List或Tuple类型对象，其中N为创建 :code:`DataFeeder` 对象时传入的 :code:`feed_list` 变量个数。
+
+:code:`iterable` 的具体格式为：
+
+.. code-block:: python
+
+    iterable = [
+        (image_1, label_1),
+        (image_2, label_2),
+        ...
+        (image_n, label_n)
+    ]
+
+其中，:code:`image_i` 与 :code:`label_i` 均为numpy array类型数据。若传入数据的维度为[1]，如 :code:`label_i`,
+则可传入Python int、float等类型数据。 :code:`image_i` 与 :code:`label_i` 的数据类型和维度不必
+与 :code:`fluid.layers.data()` 创建时指定的 :code:`dtype` 和 :code:`shape` 完全一致，:code:`DataFeeder` 内部
+会完成数据类型和维度的转换。若 :code:`feed_list` 中的变量的 :code:`lod_level` 不为零，则Fluid会将经过维度转换后的
+:code:`iterable` 中每行数据的第0维作为返回结果的 :code:`LoD`。
+
+具体使用方法请参见 :ref:`api_fluid_DataFeeder` 。
\ No newline at end of file
--- a/doc/fluid/api_cn/api_guides/low_level/layers/data_in_out.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/data_in_out.rst
+..  _api_guide_data_in_out:
+
+数据输入输出
+###############
+
+
+数据输入
+-------------
+
+Fluid支持两种数据输入方式，包括：
+
+1. Python Reader: 纯Python的Reader。用户在Python端定义 :code:`fluid.layers.data` 层构建网络，并通过
+   :code:`executor.run(feed=...)` 的方式读入数据。数据读取和模型训练/预测的过程是同步进行的。
+
+2. PyReader: 高效灵活的C++ Reader接口。PyReader内部维护容量为 :code:`capacity` 的队列（队列容量由
+   :code:`fluid.layers.py_reader` 接口中的 :code:`capacity` 参数设置），Python端调用队列的 :code:`push`
+   方法送入训练/预测数据，C++端的训练/预测程序调用队列的 :code:`pop` 方法取出Python端送入的数据。PyReader可与
+   :code:`double_buffer` 配合使用，实现数据读取和训练/预测的异步执行。
+
+具体使用方法请参考 :ref:`api_fluid_layers_py_reader`。
+
+
+数据输出
+------------
+
+Fluid支持在训练/预测阶段获取当前batch的数据。
+
+用户可通过 :code:`executor.run(fetch_list=[...], return_numpy=...)` 的方式
+fetch期望的输出变量，并通过 :code:`return_numpy` 参数设置是否将输出数据转为numpy array。
+若 :code:`return_numpy` 为 :code:`False` ，则返回 :code:`LoDTensor` 类型数据。
+
+具体使用方式请参考相关API文档 :ref:`api_fluid_executor_Executor` 和
+:ref:`api_fluid_ParallelExecutor`。
\ No newline at end of file
--- a/doc/fluid/api_cn/api_guides/low_level/layers/detection.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/detection.rst
+..  _api_guide_detection:
+
+
+图像检测
+#########
+
+PaddlePaddle Fluid在图像检测任务中实现了多个特有的操作。以下分模型介绍各个api：
+
+通用操作
+-------------
+
+图像检测中的一些通用操作，是对检测框的一系列操作，其中包括：
+
+* 对检测框的编码，解码（box_coder）：实现两种框之间编码和解码的转换。例如训练阶段对先验框和真实框进行编码得到训练目标值。API Reference 请参考 :ref:`api_fluid_layers_box_coder`
+
+* 比较两个检测框并进行匹配：
+
+  * iou_similarity：计算两组框的IOU值。API Reference 请参考 :ref:`api_fluid_layers_iou_similarity`
+
+  * bipartite_match：通过贪心二分匹配算法得到每一列中距离最大的一行。API Reference 请参考 :ref:`api_fluid_layers_bipartite_match`
+
+* 根据检测框和标签得到分类和回归目标值（target_assign）：通过匹配索引和非匹配索引得到目标值和对应权重。API Reference 请参考 :ref:`api_fluid_layers_target_assign`
+
+
+Faster RCNN
+-------------
+
+`Faster RCNN <https://arxiv.org/abs/1506.01497>`_ 是典型的两阶段目标检测器，相较于传统提取区域的方法，Faster RCNN中RPN网络通过共享卷积层参数大幅提高提取区域的效率，并提出高质量的候选区域。RPN网络需要对输入anchor和真实值进行比较生成初选候选框，并对初选候选框分配分类和回归值，需要如下四个特有api：
+
+* rpn_target_assign：通过anchor和真实框为anchor分配RPN网络的分类和回归目标值。API Reference 请参考 :ref:`api_fluid_layers_rpn_target_assign`
+
+* anchor_generator：为每个位置生成一系列anchor。API Reference 请参考 :ref:`api_fluid_layers_anchor_generator`
+
+* generate_proposal_labels: 通过generate_proposals得到的候选框和真实框得到RCNN部分的分类和回归的目标值。API Reference 请参考 :ref:`api_fluid_layers_generate_proposal_labels`
+
+* generate_proposals: 对RPN网络输出box解码并筛选得到新的候选框。API Reference 请参考 :ref:`api_fluid_layers_generate_proposals`
+
+
+SSD
+----------------
+
+`SSD <https://arxiv.org/abs/1512.02325>`_ 全称Single Shot MultiBox Detector，是目标检测领域较新且效果较好的检测算法之一，具有检测速度快且检测精度高的特点。与两阶段的检测方法不同，单阶段目标检测并不进行区域推荐，而是直接从特征图回归出目标的边界框和分类概率。SSD网络对六个尺度特征图计算损失，进行预测，需要如下五种特有api：
+
+* Prior Box：根据不同参数为每个输入位置生成一系列候选框。API Reference 请参考 :ref:`api_fluid_layers_prior_box`
+
+* multi_box_head ：得到不同prior box的位置和置信度。API Reference 请参考 :ref:`api_fluid_layers_multi_box_head`
+
+* detection_output：对prior box解码，通过多分类NMS得到检测结果。API Reference 请参考 :ref:`api_fluid_layers_detection_output`
+
+* ssd_loss：通过位置偏移预测值，置信度，检测框位置和真实框位置和标签计算损失。API Reference 请参考 :ref:`api_fluid_layers_ssd_loss`
+
+* detection map: 利用mAP评估SSD网络模型。API Reference 请参考 :ref:`api_fluid_layers_detection_map`
+
+OCR
+---------
+
+场景文字识别是在图像背景复杂、分辨率低下、字体多样、分布随意等情况下，将图像信息转化为文字序列的过程，可认为是一种特别的翻译过程：将图像输入翻译为自然语言输出。OCR任务中需要对检测框进行不规则变换，其中需要如下两个api：
+
+* roi_perspective_transform：对输入roi做透视变换。API Reference 请参考 :ref:`api_fluid_layers_roi_perspective_transform`
+
+* polygon_box_transform：对不规则检测框进行坐标变换。API Reference 请参考 :ref:`api_fluid_layers_polygon_box_transform`
+
+
--- a/doc/fluid/api_cn/api_guides/low_level/layers/index.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/index.rst
+=============
+神经网络层
+=============
+
+..  toctree::
+    :maxdepth: 1
+
+    conv.rst
+    pooling.rst
+    detection.rst
+    sequence.rst
+    math.rst
+    activations.rst
+    loss_function.rst
+    data_in_out.rst
+    control_flow.rst
+    sparse_update.rst
+    data_feeder.rst
+    learning_rate_scheduler.rst
+    tensor.rst
+
--- a/doc/fluid/api_cn/api_guides/low_level/layers/learning_rate_scheduler.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/learning_rate_scheduler.rst
+.. _api_guide_learning_rate_scheduler:
+
+############
+学习率调度器
+############
+
+当我们使用诸如梯度下降法等方式来训练模型时，一般会兼顾训练速度和损失(loss)来选择相对合适的学习率。但若在训练过程中一直使用一个学习率，训练集的损失下降到一定程度后便不再继续下降，而是在一定范围内震荡。其震荡原理如下图所示，即当损失函数收敛到局部极小值附近时，会由于学习率过大导致更新步幅过大，每步参数更新会反复越过极小值而出现震荡。
+
+.. image:: ../../../../images/learning_rate_scheduler.png
+    :scale: 80 %
+    :align: center
+
+
+学习率调度器定义了常用的学习率衰减策略来动态生成学习率，学习率衰减函数以epoch或step为参数，返回一个随训练逐渐减小的学习率，从而兼顾降低训练时间和在局部极小值能更好寻优两个方面。
+
+下面介绍学习率调度器中相关的API：
+
+======
+
+* :code:`noam_decay`: 诺姆衰减，相关算法请参考 `《Attention Is All You Need》 <https://arxiv.org/pdf/1706.03762.pdf>`_ 。
+  相关API Reference请参考 :ref:`api_fluid_layers_noam_decay`
+
+* :code:`exponential_decay`: 指数衰减，即每次将当前学习率乘以给定的衰减率得到下一个学习率。
+  相关API Reference请参考 :ref:`api_fluid_layers_exponential_decay`
+
+* :code:`natural_exp_decay`: 自然指数衰减，即每次将当前学习率乘以给定的衰减率的自然指数得到下一个学习率。
+  相关API Reference请参考 :ref:`api_fluid_layers_natural_exp_decay`
+
+* :code:`inverse_time_decay`: 逆时间衰减，即得到的学习率与当前衰减次数成反比。
+  相关API Reference请参考 :ref:`api_fluid_layers_inverse_time_decay`
+
+* :code:`polynomial_decay`: 多项式衰减，即得到的学习率为初始学习率和给定最终学习率之间由多项式计算权重定比分点的插值。
+  相关API Reference请参考 :ref:`api_fluid_layers_polynomial_decay`
+
+* :code:`piecewise_decay`: 分段衰减，即由给定step数分段呈阶梯状衰减，每段内学习率相同。
+  相关API Reference请参考 :ref:`api_fluid_layers_piecewise_decay`
+
+* :code:`append_LARS`: 通过Layer-wise Adaptive Rate Scaling算法获得学习率，相关算法请参考 `《Train Feedfoward Neural Network with Layer-wise Adaptive Rate via Approximating Back-matching Propagation》 <https://arxiv.org/abs/1802.09750>`_ 。
+  相关API Reference请参考 :ref:`api_fluid_layers_append_LARS`
+
--- a/doc/fluid/api_cn/api_guides/low_level/layers/loss_function.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/loss_function.rst
+..  _api_guide_loss_function:
+
+########
+损失函数
+########
+
+损失函数定义了拟合结果和真实结果之间的差异，作为优化的目标直接关系模型训练的好坏，很多研究工作的内容也集中在损失函数的设计优化上。
+Paddle Fluid 中提供了面向多种任务的多种类型的损失函数，以下列出了一些 Paddle Fluid 中包含的较为常用的损失函数。
+
+回归
+====
+
+平方误差损失（squared error loss）使用预测值和真实值之间误差的平方作为样本损失，是回归问题中最为基本的损失函数。
+API Reference 请参考 :ref:`api_fluid_layers_square_error_cost`。
+
+平滑 L1 损失（smooth_l1 loss）是一种分段的损失函数，较平方误差损失其对异常点相对不敏感，因而更为鲁棒。
+API Reference 请参考 :ref:`api_fluid_layers_smooth_l1`。
+
+
+分类
+====
+
+`交叉熵（cross entropy） <https://en.wikipedia.org/wiki/Cross_entropy>`_ 是分类问题中使用最为广泛的损失函数，Paddle Fluid 中提供了接受归一化概率值和非归一化分值输入的两种交叉熵损失函数的接口，并支持 soft label 和 hard label 两种样本类别标签。
+API Reference 请参考 :ref:`api_fluid_layers_cross_entropy` 和 :ref:`api_fluid_layers_softmax_with_cross_entropy`。
+
+多标签分类
+----------
+对于多标签分类问题，如一篇文章同属于政治、科技等多个类别的情况，需要将各类别作为独立的二分类问题计算损失，Paddle Fluid 中为此提供了 sigmoid_cross_entropy_with_logits 损失函数，
+API Reference 请参考 :ref:`api_fluid_layers_sigmoid_cross_entropy_with_logits`。
+
+大规模分类
+----------
+对于大规模分类问题，通常需要特殊的方法及相应的损失函数以加速训练，常用的方法有 `噪声对比估计（Noise-contrastive estimation，NCE） <http://proceedings.mlr.press/v9/gutmann10a/gutmann10a.pdf>`_ 和 `层级 sigmoid <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_ 。
+
+* 噪声对比估计通过将多分类问题转化为学习分类器来判别数据来自真实分布还是噪声分布的二分类问题，基于二分类来进行极大似然估计，避免在全类别空间计算归一化因子从而降低了计算复杂度。
+* 层级 sigmoid 通过二叉树进行层级的二分类来实现多分类，每个样本的损失对应了编码路径上各节点二分类交叉熵的和，避免了归一化因子的计算从而降低了计算复杂度。
+
+这两种方法对应的损失函数在 Paddle Fluid 中均有提供，API Reference 请参考 :ref:`api_fluid_layers_nce` 和 :ref:`api_fluid_layers_hsigmoid`。
+
+序列分类
+--------
+序列分类可以分为以下三种：
+
+* 序列分类（Sequence Classification）问题，整个序列对应一个预测标签，如文本分类。这种即是普通的分类问题，可以使用 cross entropy 作为损失函数。
+* 序列片段分类（Segment Classification）问题，序列中的各个片段对应有自己的类别标签，如命名实体识别。对于这种序列标注问题，`（线性链）条件随机场（Conditional Random Field，CRF） <http://www.cs.columbia.edu/~mcollins/fb.pdf>`_ 是一种常用的模型方法，其使用句子级别的似然概率，序列中不同位置的标签不再是条件独立，能够有效解决标记偏置问题。Paddle Fluid 中提供了 CRF 对应损失函数的支持，API Reference 请参考 :ref:`api_fluid_layers_linear_chain_crf`。
+* 时序分类（Temporal Classification）问题，需要对未分割的序列进行标注，如语音识别。对于这种时序分类问题，`CTC（Connectionist Temporal Classification） <http://people.idsia.ch/~santiago/papers/icml2006.pdf>`_ 损失函数不需要对齐输入数据及标签，可以进行端到端的训练，Paddle Fluid 提供了 warpctc 的接口来计算相应的损失，API Reference 请参考 :ref:`api_fluid_layers_warpctc`。
+
+排序
+====
+
+`排序问题 <https://en.wikipedia.org/wiki/Learning_to_rank>`_ 可以使用 Pointwise、Pairwise 和 Listwise 的学习方法，不同的方法需要使用不同的损失函数：
+
+* Pointwise 的方法通过近似为回归问题解决排序问题，可以使用回归问题的损失函数。
+* Pairwise 的方法需要特殊设计的损失函数，其通过近似为分类问题解决排序问题，使用两篇文档与 query 的相关性得分以偏序作为二分类标签来计算损失。Paddle Fluid 中提供了两种常用的 Pairwise 方法的损失函数，API Reference 请参考 :ref:`api_fluid_layers_rank_loss` 和 :ref:`api_fluid_layers_margin_rank_loss`。
+
+更多
+====
+
+对于一些较为复杂的损失函数，可以尝试使用其他损失函数组合实现；Paddle Fluid 中提供的用于图像分割任务的 :ref:`api_fluid_layers_dice_loss` 即是使用其他 OP 组合（计算各像素位置似然概率的均值）而成；多目标损失函数也可看作这样的情况，如 Faster RCNN 就使用 cross entropy 和 smooth_l1 loss 的加权和作为损失函数。
+
+**注意**，在定义损失函数之后为能够使用 :ref:`api_guide_optimizer` 进行优化，通常需要使用 :ref:`api_fluid_layers_mean` 或其他操作将损失函数返回的高维 Tensor 转换为 Scalar 值。
\ No newline at end of file
--- a/doc/fluid/api_cn/api_guides/low_level/layers/math.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/math.rst
--- a/doc/fluid/api_cn/api_guides/low_level/layers/pooling.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/pooling.rst
--- a/doc/fluid/api_cn/api_guides/low_level/layers/sequence.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/sequence.rst
--- a/doc/fluid/api_cn/api_guides/low_level/layers/sparse_update.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/sparse_update.rst
--- a/doc/fluid/api_cn/api_guides/low_level/layers/tensor.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/layers/tensor.rst
--- a/doc/fluid/api_cn/api_guides/low_level/memory_optimize.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/memory_optimize.rst
--- a/doc/fluid/api_cn/api_guides/low_level/metrics.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/metrics.rst
--- a/doc/fluid/api_cn/api_guides/low_level/model_save_reader.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/model_save_reader.rst
--- a/doc/fluid/api_cn/api_guides/low_level/nets.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/nets.rst
--- a/doc/fluid/api_cn/api_guides/low_level/optimizer.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/optimizer.rst
--- a/doc/fluid/api_cn/api_guides/low_level/parallel_executor.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/parallel_executor.rst
--- a/doc/fluid/api_cn/api_guides/low_level/parameter.rst
+++ b/doc/fluid/api_cn/api_guides/low_level/parameter.rst
--- a/doc/fluid/api_cn/average_cn.rst
+++ b/doc/fluid/api_cn/average_cn.rst
--- a/doc/fluid/api_cn/backward_cn.rst
+++ b/doc/fluid/api_cn/backward_cn.rst
--- a/doc/fluid/api_cn/clip_cn.rst
+++ b/doc/fluid/api_cn/clip_cn.rst
--- a/doc/fluid/api_cn/data_feeder_cn.rst
+++ b/doc/fluid/api_cn/data_feeder_cn.rst
--- a/doc/fluid/api_cn/executor_cn.rst
+++ b/doc/fluid/api_cn/executor_cn.rst
--- a/doc/fluid/api_cn/fluid_cn.rst
+++ b/doc/fluid/api_cn/fluid_cn.rst
--- a/doc/fluid/api_cn/gen_index.py
+++ b/doc/fluid/api_cn/gen_index.py
--- a/doc/fluid/api_cn/index_cn.rst
+++ b/doc/fluid/api_cn/index_cn.rst
--- a/doc/fluid/api_cn/initializer_cn.rst
+++ b/doc/fluid/api_cn/initializer_cn.rst
--- a/doc/fluid/api_cn/io_cn.rst
+++ b/doc/fluid/api_cn/io_cn.rst
--- a/doc/fluid/api_cn/layers_cn.rst
+++ b/doc/fluid/api_cn/layers_cn.rst
--- a/doc/fluid/api_cn/metrics_cn.rst
+++ b/doc/fluid/api_cn/metrics_cn.rst
--- a/doc/fluid/api_cn/net_cn.rst
+++ b/doc/fluid/api_cn/net_cn.rst
--- a/doc/fluid/api_cn/nets_cn.rst
+++ b/doc/fluid/api_cn/nets_cn.rst
--- a/doc/fluid/api_cn/optimizer_cn.rst
+++ b/doc/fluid/api_cn/optimizer_cn.rst
--- a/doc/fluid/api_cn/param_attr_cn.rst
+++ b/doc/fluid/api_cn/param_attr_cn.rst
--- a/doc/fluid/api_cn/profiler_cn.rst
+++ b/doc/fluid/api_cn/profiler_cn.rst
--- a/doc/fluid/api_cn/recordio_writer_cn.rst
+++ b/doc/fluid/api_cn/recordio_writer_cn.rst
--- a/doc/fluid/api_cn/regularizer_cn.rst
+++ b/doc/fluid/api_cn/regularizer_cn.rst
--- a/doc/fluid/api_cn/transpiler_cn.rst
+++ b/doc/fluid/api_cn/transpiler_cn.rst
--- a/doc/fluid/beginners_guide/basics/index.rst
+++ b/doc/fluid/beginners_guide/basics/index.rst
--- a/doc/fluid/beginners_guide/index.rst
+++ b/doc/fluid/beginners_guide/index.rst
--- a/doc/fluid/beginners_guide/install/compile/compile_CentOS.md
+++ b/doc/fluid/beginners_guide/install/compile/compile_CentOS.md
--- a/doc/fluid/beginners_guide/install/compile/compile_MacOS.md
+++ b/doc/fluid/beginners_guide/install/compile/compile_MacOS.md
--- a/doc/fluid/beginners_guide/install/compile/compile_Ubuntu.md
+++ b/doc/fluid/beginners_guide/install/compile/compile_Ubuntu.md
--- a/doc/fluid/beginners_guide/install/compile/compile_Windows.md
+++ b/doc/fluid/beginners_guide/install/compile/compile_Windows.md
--- a/doc/fluid/beginners_guide/install/compile/fromsource.rst
+++ b/doc/fluid/beginners_guide/install/compile/fromsource.rst
--- a/doc/fluid/beginners_guide/install/Start.rst
+++ b/doc/fluid/beginners_guide/install/Start.rst
--- a/doc/fluid/beginners_guide/install/install_CentOS.md
+++ b/doc/fluid/beginners_guide/install/install_CentOS.md
--- a/doc/fluid/beginners_guide/install/install_MacOS.md
+++ b/doc/fluid/beginners_guide/install/install_MacOS.md
--- a/doc/fluid/beginners_guide/install/install_Ubuntu.md
+++ b/doc/fluid/beginners_guide/install/install_Ubuntu.md
--- a/doc/fluid/beginners_guide/install/install_Windows.md
+++ b/doc/fluid/beginners_guide/install/install_Windows.md
--- a/doc/fluid/book/fit_a_line/README.md
+++ b/doc/fluid/book/fit_a_line/README.md
--- a/doc/fluid/book/fit_a_line/image
+++ b/doc/fluid/book/fit_a_line/image
--- a/doc/fluid/book/image_classification/README.md
+++ b/doc/fluid/book/image_classification/README.md
--- a/doc/fluid/book/image_classification/image
+++ b/doc/fluid/book/image_classification/image
--- a/doc/fluid/book/index_en.rst
+++ b/doc/fluid/book/index_en.rst
--- a/doc/fluid/book/label_semantic_roles/README.md
+++ b/doc/fluid/book/label_semantic_roles/README.md
--- a/doc/fluid/book/label_semantic_roles/image
+++ b/doc/fluid/book/label_semantic_roles/image
--- a/doc/fluid/book/machine_translation/README.md
+++ b/doc/fluid/book/machine_translation/README.md
--- a/doc/fluid/book/machine_translation/image
+++ b/doc/fluid/book/machine_translation/image
--- a/doc/fluid/book/recognize_digits/README.md
+++ b/doc/fluid/book/recognize_digits/README.md
--- a/doc/fluid/book/recognize_digits/image
+++ b/doc/fluid/book/recognize_digits/image
--- a/doc/fluid/book/recommender_system/README.md
+++ b/doc/fluid/book/recommender_system/README.md
--- a/doc/fluid/book/recommender_system/image
+++ b/doc/fluid/book/recommender_system/image
--- a/doc/fluid/book/understand_sentiment/README.md
+++ b/doc/fluid/book/understand_sentiment/README.md
--- a/doc/fluid/book/understand_sentiment/image
+++ b/doc/fluid/book/understand_sentiment/image
--- a/doc/fluid/book/word2vec/README.md
+++ b/doc/fluid/book/word2vec/README.md
--- a/doc/fluid/book/word2vec/images
+++ b/doc/fluid/book/word2vec/images
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
--- a/doc/fluid/index_cn.rst
+++ b/doc/fluid/index_cn.rst
--- a/doc/fluid/index_en.rst
+++ b/doc/fluid/index_en.rst
--- a/doc/fluid/user_guides/howto/basic_concept/index_cn.rst
+++ b/doc/fluid/user_guides/howto/basic_concept/index_cn.rst
--- a/doc/fluid/user_guides/howto/prepare_data/lod_tensor.md
+++ b/doc/fluid/user_guides/howto/prepare_data/lod_tensor.md
--- a/doc/fluid/user_guides/howto/evaluation_and_debugging/index.rst
+++ b/doc/fluid/user_guides/howto/evaluation_and_debugging/index.rst
--- a/doc/fluid/user_guides/howto/prepare_data/index.rst
+++ b/doc/fluid/user_guides/howto/prepare_data/index.rst
--- a/doc/fluid/user_guides/howto/training/cluster_howto.rst
+++ b/doc/fluid/user_guides/howto/training/cluster_howto.rst
--- a/doc/fluid/user_guides/howto/training/index.rst
+++ b/doc/fluid/user_guides/howto/training/index.rst
--- a/doc/fluid/user_guides/howto/training/multi_node.rst
+++ b/doc/fluid/user_guides/howto/training/multi_node.rst
--- a/doc/fluid/user_guides/howto/training/save_load_variables.rst
+++ b/doc/fluid/user_guides/howto/training/save_load_variables.rst
--- a/doc/fluid/user_guides/index.rst
+++ b/doc/fluid/user_guides/index.rst
--- a/doc/fluid/user_guides/models/index_en.rst
+++ b/doc/fluid/user_guides/models/index_en.rst
--- a/Anakin @ 65178d41
+++ b/Anakin @ 65178d41
--- a/Paddle @ 08f927de
+++ b/Paddle @ 08f927de
--- a/book @ ef293450
+++ b/book @ ef293450
--- a/paddle-mobile @ 2c088e20
+++ b/paddle-mobile @ 2c088e20