Merge branch 'develop' into fix/1.1_dev

36399fbd · Jiabin Yang · GitHub · fec10796 · ac9ce18a · 36399fbd
43 changed file
--- a/.gitignore
+++ b/.gitignore
+.vscode/
--- a/.gitmodules
+++ b/.gitmodules
-[submodule "external/Paddle"]
-	path = external/Paddle
-	url = https://github.com/PaddlePaddle/Paddle
 [submodule "external/book"]
 	path = external/book
 	url = https://github.com/PaddlePaddle/book
@@ -10,6 +7,9 @@
 [submodule "external/paddle-mobile"]
 	path = external/paddle-mobile
 	url = https://github.com/PaddlePaddle/paddle-mobile
+[submodule "external/Paddle"]
+	path = external/Paddle
+	url = https://github.com/PaddlePaddle/Paddle
 [submodule "external/models"]
 	path = external/models
 	url = https://github.com/PaddlePaddle/models
--- a/doc/fluid/advanced_usage/deploy/anakin_gpu_benchmark.md
+++ b/doc/fluid/advanced_usage/deploy/anakin_gpu_benchmark.md
@@ -30,21 +30,19 @@
 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 8.8690 | 8.2815 |
+| 1 | 8.53945 | 8.18737 |
-| 2 | 15.5344 | 13.9116 |
+| 2 | 14.2269 | 13.8976 |
-| 4 | 26.6000 | 21.8747 |
+| 4 | 24.2803 | 21.7976 |
-| 8 | 49.8279 | 40.4076 |
+| 8 | 45.6003 | 40.319 |
-| 32 | 188.6270 | 163.7660 |
 - GPU Memory Used (`MB`)
 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 963 | 997 |
+| 1 | 1053.88 | 762.73 |
-| 2 | 965 | 1039 |
+| 2 | 1055.71 | 762.41 |
-| 4 | 991 | 1115 |
+| 4 | 1003.22 | 832.75 |
-| 8 | 1067 | 1269 |
+| 8 | 1108.77 | 926.9 |
-| 32 | 1715 | 2193 |
 ### <span id = '2'>Yolo </span>
@@ -53,21 +51,19 @@
 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 16.4596| 15.2124 |
+| 1 | 8.41606| 7.07977 |
-| 2 | 26.6347| 25.0442 |
+| 2 | 16.6588| 15.2216 |
-| 4 | 43.3695| 43.5017 |
+| 4 | 31.9955| 30.5102 |
-| 8 | 80.9139 | 80.9880 |
+| 8 | 66.1107 | 64.3658 |
-| 32 | 293.8080| 310.8810 |
 - GPU Memory Used (`MB`)
 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 1569 | 1775 |
+| 1 | 1054.71  | 299.8 |
-| 2 | 1649 | 1815 |
+| 2 | 951.51  | 347.47 |
-| 4 | 1709 | 1887 |
+| 4 | 846.9  | 438.47 |
-| 8 | 1731 | 2031 |
+| 8 | 1042.31  | 515.15 |
-| 32 | 2253 | 2907 |
 ### <span id = '3'> Resnet50 </span>
@@ -75,21 +71,19 @@
 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 4.2459   |  4.1061 |
+| 1 | 4.10063  |  3.33845 |
-| 2 |  6.2627  |  6.5159 |
+| 2 |  6.10941 |  5.54814 |
-| 4 | 10.1277  | 11.3327 |
+| 4 | 9.90233  | 10.2763 |
-| 8 | 17.8209  | 20.6680 |
+| 8 | 17.3287  |   20.0783 |
-| 32 | 65.8582 | 77.8858 |
 - GPU Memory Used (`MB`)
 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 531  | 503 |
+| 1 | 1059.15 | 299.86 |
-| 2 | 543  | 517 |
+| 2 | 1077.8  | 340.78 |
-| 4 | 583 | 541 |
+| 4 | 903.04  | 395 |
-| 8 | 611 | 589 |
+| 8 | 832.53  | 508.86 |
-| 32 |  809 | 879 |
 ### <span id = '4'> Resnet101 </span>
@@ -97,21 +91,19 @@
 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 7.5562 | 7.0837 |
+| 1 | 7.29828 | 5.672 |
-| 2 | 11.6023 | 11.4079 |
+| 2 | 11.2037 | 9.42352 |
-| 4 | 18.3650 | 20.0493 |
+| 4 | 17.9306 | 18.0936 |
-| 8 | 32.7632 | 36.0648 |
+| 8 | 31.4804 | 35.7439 |
-| 32 | 123.2550 | 135.4880 |
 - GPU Memory Used (`MB`)
 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 701  | 683 |
+| 1 | 1161.94 | 429.22 |
-| 2 | 713  | 697 |
+| 2 | 1190.92 | 531.92 |
-| 4 | 793 | 721 |
+| 4 | 994.11  | 549.7 |
-| 8 | 819 | 769 |
+| 8 | 945.47  | 653.06 |
-| 32 | 1043 | 1059 |
 ###  <span id = '5'> MobileNet V1 </span>
@@ -119,21 +111,19 @@
 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 45.5156  |  1.3947 |
+| 1 | 1.52692  |  1.39282 |
-| 2 |  46.5585  |  2.5483 |
+| 2 |  1.98091  |  2.05788 |
-| 4 | 48.4242  | 4.3404 |
+| 4 | 3.2705  | 4.03476 |
-| 8 |  52.7957 |  8.1513 |
+| 8 |  5.15652 |  7.06651 |
-| 32 | 83.2519 | 31.3178 |
 - GPU Memory Used (`MB`)
 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 329  | 283 |
+| 1 | 1144.35   | 99.6 |
-| 2 | 345  | 289 |
+| 2 | 1160.03    | 199.75 |
-| 4 | 371 | 299 |
+| 4 | 1098  | 184.33 |
-| 8 | 393 | 319 |
+| 8 | 990.71  | 232.11 |
-| 32 |  531 | 433 |
 ###  <span id = '6'> MobileNet V2</span>
@@ -141,21 +131,20 @@
 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 65.6861 | 2.9842 |
+| 1 | 1.95961 | 1.78249 |
-| 2 | 66.6814 | 4.7472 |
+| 2 | 2.8709 | 3.01144 |
-| 4 | 69.7114 | 7.4163 |
+| 4 | 4.46131 | 5.43946 |
-| 8 | 76.1092 | 12.8779 |
+| 8 | 7.161 | 10.2081 |
-| 32 | 124.9810 | 47.2142 |
 - GPU Memory Used (`MB`)
 | BatchSize | TensorRT | Anakin |
 | --- | --- | --- |
-| 1 | 341 | 293 |
+| 1 | 1154.69 | 195.25 |
-| 2 | 353 | 301 |
+| 2 | 1187.25 | 227.6 |
-| 4 | 385 | 319 |
+| 4 | 1053 | 241.75 |
-| 8 | 421 | 351 |
+| 8 | 1062.48 | 352.18 |
-| 32 | 637 | 551 |
 ## How to run those Benchmark models

--- a/doc/fluid/advanced_usage/deploy/anakin_tutorial.md
+++ b/doc/fluid/advanced_usage/deploy/anakin_tutorial.md
@@ -114,64 +114,67 @@ Anakin中数据类型与基本数据类型的对应如下:
  理论上，Anakin支持申明1维以上的tensor，但是对于Anakin中的Op来说，只支持NW、NHW、NCHW、NCHW_C4这四种LayOut，其中NCHW是默认的LayOuteType，NCHW_C4是专门针对于int8这种数据类型的。
-  **例子：**
+  例子
-下面的代码将展示如何使用tensor， 我们建议先看看这些示例。
+    下面的代码将展示如何使用tensor， 我们建议先看看这些示例。
-要想获得更多关于tensor的信息， 请参考 *soure_path/core/tensor.h*
+    要想获得更多关于tensor的信息， 请参考 *source_path/core/tensor.h*
-1. 使用shape对象初始化tensor
+    > 1. 使用shape对象初始化tensor
-    ```cpp
+    ```c++
-    //create a null tensor. A null tensor holds for nothing.
+      //create a null tensor. A null tensor holds for nothing.
-    //tensor's buffer  is resident at CPU and its datatype is AK_FLOAT.
+      //tensor's buffer  is resident at CPU and its datatype is AK_FLOAT.
-    //tensor's Layout is NCHW(default)
+      //tensor's Layout is NCHW(default)
-    Tensor<X86, AK_FLOAT> mytensor;
+      Tensor<X86, AK_FLOAT> mytensor;
-    //1. using shape object to create a tensor.
+      //1. using shape object to create a tensor.
-    Shape shape1(NUM); //1-D shape. NUM is the number of dimention.
+      Shape shape1(NUM); //1-D shape. NUM is the number of dimension.
-    Tensor<X86, AK_FLOAT, W> mytensor1(shape1); //1-D tensor.
+      Tensor<X86, AK_FLOAT, W> mytensor1(shape1); //1-D tensor.
-    // A 4-D shape
+      // A 4-D shape
-    Shape shape2(N, C, H, W); // batch x channel x height x width
+      Shape shape2(N, C, H, W); // batch x channel x height x width
    ```
-    `注意：Shape的维度必须和tensor的`[LayoutType](#layout)`相同，比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW，否则会出错。如下列代码所示`
+    >`注意：Shape的维度必须和tensor的`[LayoutType](#layout)`相同，比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW，否则会出错。如下列代码所示`
    ```c++
-    // A 4-D tensor.
+       // A 4-D tensor.
-    Tensor<X86, AK_FLOAT> mytensor2(shape2);  //right
+       Tensor<X86, AK_FLOAT> mytensor2(shape2);  //right
+       //A 4-D tensor which is resident at GPU and its datatype is AK_INT8
+       Tensor<NV, AK_INT8> mytensor3(shape2);   //right
-    //A 4-D tensor which is resident at GPU and its datatype is AK_INT8
+       Tensor<X86, AK_FLOAT, NHW> mytensor4(shape2); //wrong!! shape's dimension must be equal to tensor's Layout.
-    Tensor<NV, AK_INT8> mytensor3(shape2);   //right
+       Tensor<NV, AK_FLOAT, NCHW_C4> mytensor5(shape2); //wrong!!!!
-    Tensor<X86, AK_FLOAT, NHW> mytensor4(shape2); //wrong!! shape's dimetion must be equal to tensor's Layout.
-    Tensor<NV, AK_FLOAT, NCHW_C4> mytensor5(shape2); //wrong!!!!
    ```
-2. 使用现有的数据和shape初始化tensor
+    > 2. 使用现有的数据和shape初始化tensor
    ```c++
-    /**
-    *  A construtor of Tensor.
-    *  data_ptr is a pointer to any data type of data
-    *  TargetType is type of a platform [Anakin TargetType]
-    *  id : device id
-    *  shape: a Anakin shape
-    */
-    Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape);
-    //using existing data feed to a tensor
+       /**
-    Tensor<X86, AK_FLOAT> mytensor(data_ptr, TargetType, device_id, shape); //shape must has dimention (N, C, H, W).
+       *  A constructor of Tensor.
+       *  data_ptr is a pointer to any data type of data
+       *  TargetType is type of a platform [Anakin TargetType]
+       *  id : device id
+       *  shape: a Anakin shape
+       */
+       Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape);
+       //using existing data feed to a tensor
+       Tensor<X86, AK_FLOAT> mytensor(data_ptr, TargetType, device_id, shape); //shape must have dimension (N, C, H, W).
    ```
-3. 使用tensor初始化tensor
+    > 3. 使用tensor初始化tensor
    ```c++
-    Tensor<NV, AK_FLOAT> tensor(exist_tensor);
+       Tensor<NV, AK_FLOAT> tensor(exist_tensor);
    ```
->提示： 你可以用` typedef Tensor<X86, AK_FLOAT> Tensor4d_X86 `方便定义tensor
+    > 提示： 你可以用` typedef Tensor<X86, AK_FLOAT> Tensor4d_X86 `方便定义tensor
 #### 填充tensor数据区

--- a/doc/fluid/advanced_usage/deploy/index_anakin.rst
+++ b/doc/fluid/advanced_usage/deploy/index_anakin.rst
@@ -13,6 +13,7 @@ Anakin 预测引擎
   anakin_tutorial.md
   anakin_run_on_arm.md
   anakin_example.md
+   int8_design_anakin.md
   anakin_gpu_benchmark.md
   anakin_arm_benchmark.md

--- a/doc/fluid/advanced_usage/pics/int8_design.png
+++ b/doc/fluid/advanced_usage/pics/int8_design.png
--- a/doc/fluid/api/api_guides/index.rst
+++ b/doc/fluid/api/api_guides/index.rst
@@ -12,5 +12,4 @@ API使用指南
    low_level/metrics.rst
    low_level/model_save_reader.rst
    low_level/inference.rst
+    low_level/distributed/index.rst
--- a/doc/fluid/api/api_guides/low_level/cluster/cluster_train_data_cn.rst
+++ b/doc/fluid/api/api_guides/low_level/cluster/cluster_train_data_cn.rst
+..  _api_guide_cluster_train_data:
+####################
+分布式训练reader准备
+####################
+一个数据并行的分布式训练任务通常会含有多个训练进程，每个训练进程处理整个数据集中的一部分，根据当前进程的唯一序号(trainer_id)以及训练进程总数(trainers)可以决定当前训练进程应该读取哪一部分数据。
+实现 cluster_reader 来读取分布式训练数据集
+----------------------------------------
+比较通用的方法，可以实现一个 cluster_reader, 根据训练进程数量以及进程序号决定读取哪些 example:
+    .. code-block:: python
+        def cluster_reader(reader, trainers, trainer_id):
+            def reader_creator():
+                for idx, data in enumerate(reader()):
+                    if idx % trainers == trainer_id:
+                        yield data
+            return reader_creator
+        trainers = int(os.getenv("PADDLE_TRAINERS", "1"))
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        train_reader = cluster_reader(paddle.dataset.mnist.train(), trainers, trainer_id)
+上述代码中，`trainers` 和 `trainer_id` 分别是训练进程总数和当前训练进程的序号，可以通过环境变量或者参数的方式传递给 Python 程序。
+预先切分训练文件
+-----------------
+由于使用 `cluster_reader` 依然会读取全量数据，对于训练进程比较多的任务，会造成IO资源的浪费、影响训练性能。另一种方法是可以将训练数据切分成多个小文件，每个进程处理其中的一部分文件,
+例如在 Linux 系统中可以使用 `split <http://man7.org/linux/man-pages/man1/split.1.html>`_ 命令将训练数据切分成多个小文件：
+  .. code-block:: bash
+    $ split -d -a 4 -l 100 housing.data cluster/housing.data.
+    $ find ./cluster
+    cluster/
+    cluster/housing.data.0002
+    cluster/housing.data.0003
+    cluster/housing.data.0004
+    cluster/housing.data.0000
+    cluster/housing.data.0001
+    cluster/housing.data.0005
+数据切分好以后, 可以实现一个 file_dispatcher 函数，根据训练进程数量以及序号决定需要读取哪些文件：
+    .. code-block:: python
+        def file_dispatcher(files_pattern, trainers, trainer_id):
+            file_list = glob.glob(files_pattern)
+            ret_list = []
+            for idx, f in enumerate(file_list):
+                if (idx + trainers) % trainers == trainer_id:
+                    ret_list.append(f)
+            return ret_list
+        trainers = int(os.getenv("PADDLE_TRAINERS", "1"))
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        files_pattern = "cluster/housing.data.*"
+        my_files = file_dispatcher(files_pattern, trainers, trainer_id)
+在上述例子中，`files_pattern` 是训练文件的 `glob 表达式 <https://docs.python.org/2.7/library/glob.html>`_，一般可以用通配符来表示。
--- a/doc/fluid/api/api_guides/low_level/distributed/async_training.rst
+++ b/doc/fluid/api/api_guides/low_level/distributed/async_training.rst
+.. _api_guide_async_training:
+############
+分布式异步训练
+############
+Fluid支持数据并行的分布式异步训练，API使用 :code:`DistributeTranspiler` 将单机网络配置转换成可以多机执行的
+:code:`pserver` 端程序和 :code:`trainer` 端程序。用户在不同的节点执行相同的一段代码，根据环境变量或启动参数，
+可以执行对应的 :code:`pserver` 或 :code:`trainer` 角色。Fluid异步训练只支持pserver模式，异步训练和 `同步训练 <../distributed/sync_training.html>`_ 的主要差异在于：异步训练每个trainer的梯度是单独更新到参数上的，
+而同步训练是所有trainer的梯度合并之后统一更新到参数上，因此，同步训练和异步训练的超参数需要分别调节。
+pserver模式分布式异步训练
+======================
+API详细使用方法参考 :ref:`api_fluid_DistributeTranspiler` ，简单示例用法：
+.. code-block:: python
+    config = fluid.DistributeTranspilerConfig()
+    # 配置策略config
+    config.slice_var_up = False
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id, 
+                program=main_program,
+                pservers="192.168.0.1:6174,192.168.0.2:6174",
+                trainers=1,
+                sync_mode=False)
+以上参数说明请参考 `同步训练 <../distributed/sync_training.html>`_ 。
+需要注意的是：进行异步训练时，请修改 :code:`sync_mode` 的值
+- :code:`sync_mode` ： 是否是同步训练模式，默认为True，不传此参数也默认是同步训练模式，设置为False则为异步训练
--- a/doc/fluid/api/api_guides/low_level/distributed/cpu_train_best_practice.rst
+++ b/doc/fluid/api/api_guides/low_level/distributed/cpu_train_best_practice.rst
+.. _api_guide_cpu_training_best_practice:
+##################
+分布式CPU训练最佳实践
+##################
+提高CPU分布式训练的训练速度，主要要从两个方面来考虑：
+1）提高训练速度，主要是提高CPU的使用率；2）提高通信速度，主要是减少通信传输的数据量。
+提高CPU的使用率
+=============
+提高CPU使用率主要依赖 :code:`ParallelExecutor`，可以充分利用多个CPU的计算能力来加速计算。
+API详细使用方法参考 :ref:`api_fluid_ParallelExecutor` ，简单实例用法：
+.. code-block:: python
+    # 配置执行策略，主要是设置线程数
+    exec_strategy = fluid.ExecutionStrategy()
+    exec_strategy.num_threads = 8
+    # 配置构图策略，对于CPU训练而言，应该使用Reduce模式进行训练
+    build_strategy = fluid.BuildStrategy()
+    if int(os.getenv("CPU_NUM")) > 1:
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+    pe = fluid.ParallelExecutor(
+        use_cuda=False,
+        loss_name=avg_cost.name,
+        main_program=main_program,
+        build_strategy=build_strategy,
+        exec_strategy=exec_strategy)
+以上参数中：
+- :code:`num_threads` ： 模型训练使用的线程数，最好和训练所在机器的物理CPU核数接近
+- :code:`reduce_strategy` ： 对于CPU训练而言，应该选择 fluid.BuildStrategy.ReduceStrategy.Reduce
+通用环境变量配置：
+- :code:`CPU_NUM` ：模型副本replica的个数，最好和num_threads一致
+提高通信速度
+==========
+要减少通信数据量，提高通信速度，主要是使用稀疏更新 ，目前支持 `稀疏更新 <../layers/sparse_update.html>`_  的主要是  :ref:`api_fluid_layers_embedding` 。
+.. code-block:: python
+    data = fluid.layers.data(name='ids', shape=[1], dtype='int64')
+    fc = fluid.layers.embedding(input=data, size=[dict_size, 16], is_sparse=True)
+以上参数中：
+- :code:`is_sparse` ： 配置embedding使用稀疏更新，如果embedding的dict_size很大，而每次数据data很少，建议使用sparse更新方式。
--- a/doc/fluid/api/api_guides/low_level/distributed/index.rst
+++ b/doc/fluid/api/api_guides/low_level/distributed/index.rst
+=============
+分布式训练
+=============
+..  toctree::
+    :maxdepth: 1
+    async_training.rst
+    cpu_train_best_practice.rst
+    large_scale_sparse_feature_training.rst
--- a/doc/fluid/api/api_guides/low_level/distributed/large_scale_sparse_feature_training.rst
+++ b/doc/fluid/api/api_guides/low_level/distributed/large_scale_sparse_feature_training.rst
+.. _api_guide_large_scale_sparse_feature_training:
+###################
+大规模稀疏特征模型训练
+###################
+模型配置和训练
+=============
+embedding被广泛应用在各种网络结构中，尤其是文本处理相关的模型。在某些场景，例如推荐系统或者搜索引擎中，
+embedding的feature id可能会非常多，当feature id达到一定数量时，embedding参数会变得很大，
+会带来两个问题：
+1）单机内存由于无法存放如此巨大的embedding参数，导致无法训练；
+2）普通的训练模式每一轮迭代都需要同步完整的参数，参数太大会让通信变得非常慢，进而影响训练速度。
+Fluid支持千亿量级超大规模稀疏特征embedding的训练，embedding参数只会保存在parameter server上，通过
+参数prefetch和梯度稀疏更新的方法，大大减少通信量，提高通信速度。
+该功能只对分布式训练有效，单机无法使用。
+需要配合 `稀疏更新 <../layers/sparse_update.html>`_ 一起使用。
+使用方法：在配置embedding的时候，加上参数 :code:`is_distributed=True` 以及 :code:`is_sparse=True` 即可。
+参数 :code:`dict_size` 定义数据中总的id的数量，id可以是int64范围内的任意值，只要总id个数小于等于dict_size就可以支持。
+所以配置之前需要预估一下数据中总的feature id的数量。
+.. code-block:: python
+  emb = fluid.layers.embedding(
+      input=input,
+      size=[dict_size, embedding_width],
+      is_sparse=True,
+      is_distributed=True)
+模型存储和预测
+=============
+当特征数量达到千亿的时候，参数量很大，单机已经无法存下，所以模型的存储和加载都和普通模式不同：
+1）普通模式下，参数是在trainer端保存和加载的；
+2）分布式模式下，参数的保存和加载，都是在pserver端进行，每个pserver只保存和加载该pserver自身对应部分的参数
--- a/doc/fluid/api/api_guides/low_level/distributed/sync_training.rst
+++ b/doc/fluid/api/api_guides/low_level/distributed/sync_training.rst
+.. _api_guide_sync_training:
+############
+分布式同步训练
+############
+Fluid支持数据并行的分布式同步训练，API使用 :code:`DistributeTranspiler` 将单机网络配置转换成可以多机执行的
+:code:`pserver` 端程序和 :code:`trainer` 端程序。用户在不同的节点执行相同的一段代码，根据环境变量或启动参数，
+可以执行对应的 :code:`pserver` 或 :code:`trainer` 角色。Fluid分布式同步训练同时支持pserver模式和NCCL2模式，
+在API使用上有差别，需要注意。
+pserver模式分布式训练
+===================
+API详细使用方法参考 :ref:`api_fluid_DistributeTranspiler` ，简单实例用法：
+.. code-block:: python
+    config = fluid.DistributeTranspilerConfig()
+    # 配置策略config
+    config.slice_var_up = False
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id, 
+                program=main_program,
+                pservers="192.168.0.1:6174,192.168.0.2:6174",
+                trainers=1,
+                sync_mode=True)
+以上参数中：
+- :code:`trainer_id` ： trainer节点的id，从0到n-1，n为当前训练任务中trainer节点的个数
+- :code:`program` ： 被转换的 :code:`program` 默认使用 :code:`fluid.default_main_program()`
+- :code:`pservers` ： 当前训练任务中pserver节点的IP端口列表
+- :code:`trainers` ： int类型，当前训练任务中trainer节点的个数。注意：
+    * pserver模式下，trainer节点个数可以和pserver节点个数不一致，比如使用20个pserver和50个trainer。在实际训练任务中，您可以通过调整pserver节点和trainer节点个数找到最佳性能
+    * NCCL2模式中，此项参数是字符串，指定trainer节点的IP端口列表
+- :code:`sync_mode` ： 是否是同步训练模式，默认为True，不传此参数也默认是同步训练模式
+其中，支持的config包括：
+- :code:`slice_var_up` ： 配置是否切分一个参数到多个pserver上进行优化，默认开启。此选项适用于模型参数个数少，但需要使用大量节点的场景，有利于提升pserver端计算并行度
+- :code:`split_method` ： 配置transpiler分配参数（或参数的切片）到多个pserver的方式，默认为"RoundRobin"，也可以使用"HashName"
+- :code:`min_block_size` ： 如果配置了参数切分，指定最小Tensor的切分大小，防止RPC请求包过小，默认为8192，一般情况不需要调整此项参数
+- :code:`enable_dc_asgd` ： 是否开启 :code:`DC-ASGD` ，此选项在异步训练中生效，启用异步训练补偿算法
+- :code:`mode` : 可以选择"pserver"或"nccl2"，指定使用pserver模式或NCCL2模式分布式训练
+- :code:`print_log` ： 是否开启transpiler debug日志，此项为开发调试使用
+通用环境变量配置：
+- :code:`FLAGS_rpc_send_thread_num` ：int，指定RPC通信发送时线程的个数
+- :code:`FLAGS_rpc_get_thread_num` ： int，指定RPC通信接收时线程的个数
+- :code:`FLAGS_rpc_prefetch_thread_num` ： int，分布式lookup table执行RPC通信时，prefetch线程的个数
+- :code:`FLAGS_rpc_deadline` ： int，RPC通信最长等待时间，单位为毫秒，默认180000
+NCCL2模式分布式训练
+=================
+基于NCCL2 (Collective Communication) 的多机同步训练模式，仅支持在GPU集群下进行。
+此部分详细API说明可以参考 :ref:`api_fluid_DistributeTranspiler` 。
+注意：NCCL2模式下，集群不需要启动pserver，只需要启动多个trainer节点即可。
+使用以下代码，将当前 :code:`Program` 转化成适用于NCCL2分布式计算的Fluid :code:`Program` ：
+.. code-block:: python
+    config = fluid.DistributeTranspilerConfig()
+    config.mode = "nccl2"
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id, 
+                program=main_program,
+                startup_program=startup_program,
+                trainers="192.168.0.1:6174,192.168.0.2:6174",
+                current_endpoint="192.168.0.1:6174")
+其中：
+- :code:`trainer_id` : trainer节点的id，从0到n-1，n为当前训练任务中trainer节点的个数
+- :code:`program` 和 :code:`startup_program` : 分别为Fluid 模型的主配置program和初始化startup_program
+- :code:`trainers` : 字符串类型，指定当前任务所有trainer的IP和端口号，仅用于NCCL2初始化（pserver模式中，此参数为int，指定trainer节点的个数）
+- :code:`current_endpoint` : 当前任务的当前节点的IP和端口号
--- a/doc/fluid/api/api_guides/low_level/layers/index.rst
+++ b/doc/fluid/api/api_guides/low_level/layers/index.rst
@@ -14,4 +14,5 @@
    loss_function.rst
    data_in_out.rst
    control_flow.rst
+    sparse_update.rst
--- a/doc/fluid/api/api_guides/low_level/layers/sparse_update.rst
+++ b/doc/fluid/api/api_guides/low_level/layers/sparse_update.rst
+.. _api_guide_sparse_update:
+#####
+稀疏更新
+#####
+Fluid的 :ref:`api_fluid_layers_embedding`  层在单机训练和分布式训练时，均可以支持“稀疏更新”，即梯度以sparse tensor 结构存储，只保存梯度不为0的行。
+在分布式训练中，对于较大的embedding层，开启稀疏更新有助于减少通信数据量，提升训练速度。
+在paddle内部，我们用lookup_table来实现embedding。下边这张图说明了embedding在正向和反向计算的过程：
+如图所示：一个Tensor中有两行不为0，正向计算的过程中，我们使用ids存储不为0的行，并使用对应的两行数据来进行计算；反向更新的过程也只更新这两行。
+.. image:: ../../../../images/lookup_table_training.png
+   :scale: 50 %
+embedding使用例子:
+---------------------
+API详细使用方法参考 :ref:`api_fluid_layers_embedding` ，以下是一个简单的例子：
+.. code-block:: python
+   DICT_SIZE = 10000 * 10
+   EMBED_SIZE = 64
+   IS_SPARSE = False
+   def word_emb(word, dict_size=DICT_SIZE, embed_size=EMBED_SIZE):
+       embed = fluid.layers.embedding(
+           input=word,
+           size=[dict_size, embed_size],
+           dtype='float32',
+           param_attr=fluid.ParamAttr(
+               initializer=fluid.initializer.Normal(scale=1/math.sqrt(dict_size))),
+           is_sparse=IS_SPARSE,
+           is_distributed=False)
+       return embed
+以上参数中：
+- :code:`is_sparse` ： 反向计算的时候梯度是否为sparse tensor。如果不设置，梯度是一个 `LodTensor <https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/user_guides/howto/prepare_data/lod_tensor.md>`_  。默认为False。
+- :code:`is_distributed` ： 标志是否是用在分布式的场景下。一般大规模稀疏更新（embedding的第0维维度很大，比如几百万以上）才需要设置。具体可以参考大规模稀疏的API guide  :ref:`api_guide_large_scale_sparse_feature_training`  。默认为False。
+- API汇总:
+ - :ref:`api_fluid_layers_embedding`
--- a/doc/fluid/api/api_guides/low_level/memory_optimize.rst
+++ b/doc/fluid/api/api_guides/low_level/memory_optimize.rst
+.. _api_guide_memory_optimize:
+#####
+显存优化
+#####
+显存优化是通过分析、复用 :code:`Program` 中 :code:`Variable` 使用的显存，从而降低 :code:`Program` 执行时显存消耗的方法。用户可以通过Python脚本调用 :code:`memory_optimize` 接口进行显存优化，显存优化的执行策略如下：
+- 首先根据 :code:`Program` 中 :code:`Operator` 之间的关系对 :code:`Variable` 的最后存活时间进行分析，得到每个 :code:`Variable` 的最后存活时间;
+- 其次根据每个 :code:`Variable` 的最后存活时间，我们将到达存活时间、不再存活的 :code:`Variable` 所占用的显存提供给后来的 :code:`Variable` 使用。
+.. code-block:: python
+    z = fluid.layers.sum([x, y])
+    m = fluid.layers.matmul(y, z)
+在这个示例中，:code:`x` 的存活时间到 :code:`fluid.layers.sum` 操作为止，所以它的显存可以被 :code:`m` 复用。
+针对特定部分禁用显存优化
+===========
+:code:`memory_optimize` 支持针对特定部分禁用显存优化，用户可以通过传入 :code:`Variable` 名字的集合来指定哪些 :code:`Variable` 所使用的显存不会被复用;
+与此同时，:code:`memory_optimize` 能够针对网络的反向部分禁用显存优化，用户可以通过传入 :code:`skip_grads` 参数来开启这个功能。
+.. code-block:: python
+    fluid.memory_optimize(fluid.default_main_program(),
+        skip_opt_set=set(["fc"]), skip_grads=True)
+在这个示例中，:code:`fluid.memory_optimize` 接口对默认的 :code:`Program` 进行了 :code:`Variable` 最后存活时间的分析，并跳过了名字为 :code:`fc` 的 :code:`Variable` 以及网络反向部分的所有 :code:`Variable` 。
+这部分 :code:`Variable` 的显存都不会被别的 :code:`Variable` 再次使用。
+指定显存优化等级
+===========
+:code:`memory_optimize` 支持打印显存复用的信息以方便用户进行调试，用户可以通过指定 :code:`print_log=True` 来开启显存复用的调试信息;
+:code:`memory_optimize` 支持两种显存优化的等级，:code:`0` 或者 :code:`1` :
+- 优化等级为 :code:`0` 时： :code:`memory_optimize` 在分析完 :code:`Variable` 的最后生存时间后，会判断 :code:`Variable` 的 :code:`shape` ，只有 :code:`shape` 相同的 :code:`Variable` 才会进行显存复用；
+- 优化等级为 :code:`1` 时： :code:`memory_optimize` 会尽可能地进行显存复用，在分析完 :code:`Variable` 的最后生存时间后，即使是 :code:`shape` 不同的 :code:`Variable` 也会进行最大程度的显存复用。
+.. code-block:: python
+    fluid.memory_optimize(fluid.default_main_program(),
+        level=0, print_log=True)
+在这个示例中，:code:`fluid.memory_optimize` 接口对默认的 :code:`Program` 进行了 :code:`Variable` 最后存活时间的分析。
+只有 :code:`shape` 完全相同的 :code:`Variable` 才会进行显存复用，并且在分析结束后，会打印出所有显存复用相关的调试信息。
--- a/doc/fluid/api/api_guides/low_level/parameter.rst
+++ b/doc/fluid/api/api_guides/low_level/parameter.rst
+..  _api_guide_parameter:
+#########
+模型参数
+#########
+模型参数为模型中的weight和bias统称，在fluid中对应fluid.Parameter类，继承自fluid.Variable，是一种可持久化的variable。模型的训练就是不断学习更新模型参数的过程。模型参数相关的属性可以通过 :ref:`api_fluid_param_attr_ParamAttr` 来配置，可配置内容有：
+- 初始化方式
+- 正则化
+- 梯度剪切
+- 模型平均
+初始化方式
+=================
+fluid通过设置 :code:`ParamAttr` 的 :code:`initializer` 属性为单个parameter设置初始化方式。
+示例如下：
+  .. code-block:: python
+      param_attrs = fluid.ParamAttr(name="fc_weight",
+                                initializer=fluid.initializer.ConstantInitializer(1.0))
+      y_predict = fluid.layers.fc(input=x, size=10, param_attr=param_attrs)
+以下为fluid支持的初始化方式：
+1. BilinearInitializer
+-----------------------
+线性初始化方法。用该方法初始化的反卷积操作可当做线性插值操作使用。
+可用别名：Bilinear
+API请参考：:ref:`api_fluid_initializer_BilinearInitializer`
+2. ConstantInitializer
+----------------------
+常数初始化方式，将parameter初始化为指定的数值。
+可用别名：Constant
+API请参考：:ref:`api_fluid_initializer_ConstantInitializer`
+3. MSRAInitializer
+------------------
+该初始化方法参考论文: https://arxiv.org/abs/1502.01852
+可用别名：MSRA
+API请参考：:ref:`api_fluid_initializer_MSRAInitializer`
+4. NormalInitializer
+---------------------
+随机高斯分布初始化方法。
+可用别名：Normal
+API请参考：:ref:`api_fluid_initializer_NormalInitializer`
+5. TruncatedNormalInitializer
+-----------------------------
+随机截断高斯分布初始化方法。
+可用别名：TruncatedNormal
+API请参考：:ref:`api_fluid_initializer_TruncatedNormalInitializer`
+6. UniformInitializer
+--------------------
+随机均匀分布初始化方式。
+可用别名：Uniform
+API请参考：:ref:`api_fluid_initializer_UniformInitializer`
+7. XavierInitializer
+--------------------
+该初始化方式参考论文: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf
+可用别名：Xavier
+API请参考：:ref:`api_fluid_initializer_XavierInitializer`
+正则化方式
+=============
+fluid通过设置 :code:`ParamAttr` 的 :code:`regularizer` 属性为单个parameter设置正则化。
+  .. code-block:: python
+      param_attrs = fluid.ParamAttr(name="fc_weight",
+                                regularizer=fluid.regularizer.L1DecayRegularizer(0.1))
+      y_predict = fluid.layers.fc(input=x, size=10, param_attr=param_attrs)
+以下为fluid支持的正则化方式：
+- :ref:`api_fluid_regularizer_L1DecayRegularizer` (别名：L1Decay)
+- :ref:`api_fluid_regularizer_L2DecayRegularizer` (别名：L2Decay)
+Clipping
+==========
+fluid通过设置 :code:`ParamAttr` 的 :code:`gradient_clip` 属性为单个parameter设置clipping方式。
+  .. code-block:: python
+      param_attrs = fluid.ParamAttr(name="fc_weight",
+                                gradient_clip=fluid.clip.GradientClipByValue(max=1.0, min=-1.0))
+      y_predict = fluid.layers.fc(input=x, size=10, param_attr=param_attrs)
+以下为fluid支持的clipping方式：
+1. ErrorClipByValue
+-------------------
+用来将一个tensor的值clipping到指定范围。
+API请参考：:ref:`api_fluid_clip_ErrorClipByValue`
+2. GradientClipByGlobalNorm
+---------------------------
+用来将多个Tensor的global-norm限制在 :code:`clip_norm` 以内。
+API请参考：:ref:`api_fluid_clip_GradientClipByGlobalNorm`
+3. GradientClipByNorm
+---------------------
+将Tensor的l2-norm限制在 :code:`max_norm` 以内。如果Tensor的l2-norm超过了 :code:`max_norm` ，
+会计算出一个 :code:`scale` ，并将该Tensor的所有值乘上计算出来的 :code:`scale` 。
+API请参考：:ref:`api_fluid_clip_GradientClipByNorm`
+4. GradientClipByValue
+----------------------
+将parameter对应的gradient的值限制在[min, max]范围内。
+API请参考：:ref:`api_fluid_clip_GradientClipByValue`
+模型平均
+========
+fluid通过 :code:`ParamAttr` 的 :code:`do_model_average` 属性设置单个parameter是否进行平均优化。
+示例如下：
+  .. code-block:: python
+      param_attrs = fluid.ParamAttr(name="fc_weight",
+                                do_model_average=True)
+      y_predict = fluid.layers.fc(input=x, size=10, param_attr=param_attrs)
+在miniBatch训练过程中，每个batch过后，都会更新一次parameters，模型平均做的就是平均最近k次更新产生的parameters。
+平均后的parameters只是被用来进行测试和预测，其并不参与实际的训练过程。
+具体API请参考：:ref:`api_fluid_optimizer_ModelAverage`
--- a/doc/fluid/api/fluid.rst
+++ b/doc/fluid/api/fluid.rst
@@ -13,6 +13,15 @@ _switch_scope
 ..  autofunction:: paddle.fluid._switch_scope
    :noindex:
+.. _api_fluid_AsyncExecutor:
+AsyncExecutor
+-------------
+..  autoclass:: paddle.fluid.AsyncExecutor
+    :members:
+    :noindex:
 .. _api_fluid_BuildStrategy:
 BuildStrategy
@@ -65,6 +74,15 @@ CUDAPlace
    :members:
    :noindex:
+.. _api_fluid_DataFeedDesc:
+DataFeedDesc
+------------
+..  autoclass:: paddle.fluid.DataFeedDesc
+    :members:
+    :noindex:
 .. _api_fluid_DataFeeder:
 DataFeeder

--- a/doc/fluid/api/index_cn.rst
+++ b/doc/fluid/api/index_cn.rst
 =============
-API 说明文档
+API Reference
 =============
 ..  toctree::
    :maxdepth: 1
    api_guides/index.rst
    fluid.rst
    average.rst
    backward.rst

--- a/doc/fluid/api/index_en.rst
+++ b/doc/fluid/api/index_en.rst
@@ -5,8 +5,6 @@ API Reference
 ..  toctree::
    :maxdepth: 1
-    api_guides/index.rst
    fluid.rst
    average.rst
    backward.rst

--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -147,6 +147,14 @@ batch
 ..  autofunction:: paddle.fluid.layers.batch
    :noindex:
+.. _api_fluid_layers_create_py_reader_by_data:
+create_py_reader_by_data
+------------------------
+..  autofunction:: paddle.fluid.layers.create_py_reader_by_data
+    :noindex:
 .. _api_fluid_layers_data:
 data
@@ -223,6 +231,30 @@ shuffle
 nn
 ==
+.. _api_fluid_layers_add_position_encoding:
+add_position_encoding
+---------------------
+..  autofunction:: paddle.fluid.layers.add_position_encoding
+    :noindex:
+.. _api_fluid_layers_affine_channel:
+affine_channel
+--------------
+..  autofunction:: paddle.fluid.layers.affine_channel
+    :noindex:
+.. _api_fluid_layers_affine_grid:
+affine_grid
+-----------
+..  autofunction:: paddle.fluid.layers.affine_grid
+    :noindex:
 .. _api_fluid_layers_autoincreased_step_counter:
 autoincreased_step_counter
@@ -255,6 +287,14 @@ beam_search_decode
 ..  autofunction:: paddle.fluid.layers.beam_search_decode
    :noindex:
+.. _api_fluid_layers_bilinear_tensor_product:
+bilinear_tensor_product
+-----------------------
+..  autofunction:: paddle.fluid.layers.bilinear_tensor_product
+    :noindex:
 .. _api_fluid_layers_brelu:
 brelu
@@ -527,6 +567,22 @@ gaussian_random_batch_size_like
 ..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
    :noindex:
+.. _api_fluid_layers_grid_sampler:
+grid_sampler
+------------
+..  autofunction:: paddle.fluid.layers.grid_sampler
+    :noindex:
+.. _api_fluid_layers_group_norm:
+group_norm
+----------
+..  autofunction:: paddle.fluid.layers.group_norm
+    :noindex:
 .. _api_fluid_layers_gru_unit:
 gru_unit
@@ -543,6 +599,14 @@ hard_sigmoid
 ..  autofunction:: paddle.fluid.layers.hard_sigmoid
    :noindex:
+.. _api_fluid_layers_hash:
+hash
+----
+..  autofunction:: paddle.fluid.layers.hash
+    :noindex:
 .. _api_fluid_layers_hsigmoid:
 hsigmoid
@@ -631,6 +695,14 @@ log
 ..  autofunction:: paddle.fluid.layers.log
    :noindex:
+.. _api_fluid_layers_log_loss:
+log_loss
+--------
+..  autofunction:: paddle.fluid.layers.log_loss
+    :noindex:
 .. _api_fluid_layers_logical_and:
 logical_and
@@ -679,6 +751,14 @@ lstm_unit
 ..  autofunction:: paddle.fluid.layers.lstm_unit
    :noindex:
+.. _api_fluid_layers_margin_rank_loss:
+margin_rank_loss
+----------------
+..  autofunction:: paddle.fluid.layers.margin_rank_loss
+    :noindex:
 .. _api_fluid_layers_matmul:
 matmul
@@ -887,6 +967,22 @@ resize_bilinear
 ..  autofunction:: paddle.fluid.layers.resize_bilinear
    :noindex:
+.. _api_fluid_layers_resize_nearest:
+resize_nearest
+--------------
+..  autofunction:: paddle.fluid.layers.resize_nearest
+    :noindex:
+.. _api_fluid_layers_roi_align:
+roi_align
+---------
+..  autofunction:: paddle.fluid.layers.roi_align
+    :noindex:
 .. _api_fluid_layers_roi_pool:
 roi_pool
@@ -927,6 +1023,14 @@ scatter
 ..  autofunction:: paddle.fluid.layers.scatter
    :noindex:
+.. _api_fluid_layers_selu:
+selu
+----
+..  autofunction:: paddle.fluid.layers.selu
+    :noindex:
 .. _api_fluid_layers_sequence_concat:
 sequence_concat
@@ -1015,6 +1119,14 @@ sequence_reshape
 ..  autofunction:: paddle.fluid.layers.sequence_reshape
    :noindex:
+.. _api_fluid_layers_sequence_reverse:
+sequence_reverse
+----------------
+..  autofunction:: paddle.fluid.layers.sequence_reverse
+    :noindex:
 .. _api_fluid_layers_sequence_scatter:
 sequence_scatter
@@ -1023,6 +1135,14 @@ sequence_scatter
 ..  autofunction:: paddle.fluid.layers.sequence_scatter
    :noindex:
+.. _api_fluid_layers_sequence_slice:
+sequence_slice
+--------------
+..  autofunction:: paddle.fluid.layers.sequence_slice
+    :noindex:
 .. _api_fluid_layers_sequence_softmax:
 sequence_softmax
@@ -1031,6 +1151,14 @@ sequence_softmax
 ..  autofunction:: paddle.fluid.layers.sequence_softmax
    :noindex:
+.. _api_fluid_layers_sequence_unpad:
+sequence_unpad
+--------------
+..  autofunction:: paddle.fluid.layers.sequence_unpad
+    :noindex:
 .. _api_fluid_layers_shape:
 shape
@@ -1047,6 +1175,14 @@ sigmoid_cross_entropy_with_logits
 ..  autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
    :noindex:
+.. _api_fluid_layers_similarity_focus:
+similarity_focus
+----------------
+..  autofunction:: paddle.fluid.layers.similarity_focus
+    :noindex:
 .. _api_fluid_layers_slice:
 slice
@@ -1087,6 +1223,14 @@ softmax_with_cross_entropy
 ..  autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
    :noindex:
+.. _api_fluid_layers_space_to_depth:
+space_to_depth
+--------------
+..  autofunction:: paddle.fluid.layers.space_to_depth
+    :noindex:
 .. _api_fluid_layers_split:
 split
@@ -1453,6 +1597,30 @@ fill_constant_batch_size_like
 ..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
    :noindex:
+.. _api_fluid_layers_has_inf:
+has_inf
+-------
+..  autofunction:: paddle.fluid.layers.has_inf
+    :noindex:
+.. _api_fluid_layers_has_nan:
+has_nan
+-------
+..  autofunction:: paddle.fluid.layers.has_nan
+    :noindex:
+.. _api_fluid_layers_isfinite:
+isfinite
+--------
+..  autofunction:: paddle.fluid.layers.isfinite
+    :noindex:
 .. _api_fluid_layers_ones:
 ones
@@ -1477,6 +1645,14 @@ sums
 ..  autofunction:: paddle.fluid.layers.sums
    :noindex:
+.. _api_fluid_layers_tensor_array_to_tensor:
+tensor_array_to_tensor
+----------------------
+..  autofunction:: paddle.fluid.layers.tensor_array_to_tensor
+    :noindex:
 .. _api_fluid_layers_zeros:
 zeros
@@ -1571,6 +1747,14 @@ box_coder
 ..  autofunction:: paddle.fluid.layers.box_coder
    :noindex:
+.. _api_fluid_layers_density_prior_box:
+density_prior_box
+-----------------
+..  autofunction:: paddle.fluid.layers.density_prior_box
+    :noindex:
 .. _api_fluid_layers_detection_map:
 detection_map

--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -104,6 +104,24 @@ FtrlOptimizer
    :members:
    :noindex:
+.. _api_fluid_optimizer_LarsMomentum:
+LarsMomentum
+------------
+..  autoclass:: paddle.fluid.optimizer.LarsMomentum
+    :members:
+    :noindex:
+.. _api_fluid_optimizer_LarsMomentumOptimizer:
+LarsMomentumOptimizer
+---------------------
+..  autoclass:: paddle.fluid.optimizer.LarsMomentumOptimizer
+    :members:
+    :noindex:
 .. _api_fluid_optimizer_ModelAverage:
 ModelAverage
@@ -136,15 +154,6 @@ MomentumOptimizer
 RMSPropOptimizer
 ----------------
-..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
-    :members:
-    :noindex:
-.. _api_fluid_optimizer_RMSPropOptimizer:
-RMSPropOptimizer
----------------
 ..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
    :members:
    :noindex:

--- a/doc/fluid/beginners_guide/index.rst
+++ b/doc/fluid/beginners_guide/index.rst
@@ -2,19 +2,26 @@
 新手入门
 ########
+PaddlePaddle (PArallel Distributed Deep LEarning)是一个易用、高效、灵活、可扩展的深度学习框架
+您可参考我们的 `Github <https://github.com/PaddlePaddle/Paddle>`_ 了解详情，也可阅读 `版本说明 <../release_note.html>`_ 了解新版本的特性
 =========
  概览
 =========
-请您首先阅读以下文档，了解安装方法：
+当您第一次来到PaddlePaddle，请您首先阅读以下文档，了解安装方法：
-    - `安装说明 <../beginners_guide/install/Start.html>`_：我们支持在Ubunt/CentOS/Windows/MacOS环境上的安装
+    - `安装说明 <../beginners_guide/install/Start.html>`_：我们支持在Ubuntu/CentOS/Windows/MacOS环境上的安装
 如果您初次接触深度学习，在学习PaddlePaddle之前建议您先阅读以下资料：
    - `学习资料 <../beginners_guide/basics/learning_materials.html>`_：推荐机器学习、深度学习和编程语言三个方面的书籍与视频公开课
-如果您已经具备一定的深度学习基础，第一次使用 Fluid 时，可以跟随下列简单的模型案例供您快速上手：
+如果您已经具备一定的深度学习基础，第一次使用PaddlePaddle时，可以跟随下列简单的模型案例快速上手：
+    - `Fluid编程指南 <../beginners_guide/programming_guide/programming_guide.html>`_：介绍 Fluid 的基本概念和使用方法
    - `Fluid编程指南 <../beginners_guide/programming_guide/programming_guide.html>`_：介绍 Fluid 的基本概念和使用方法
@@ -29,7 +36,7 @@
 ..  toctree::
    :maxdepth: 2
    install/Start.rst
    quick_start/index.rst
    basics/index.rst

--- a/doc/fluid/beginners_guide/install/compile/compile_Ubuntu.md
+++ b/doc/fluid/beginners_guide/install/compile/compile_Ubuntu.md
@@ -19,7 +19,7 @@
 在Ubuntu的系统下我们提供2种编译方式：
 * Docker源码编译
-* 直接本机源码编译
+* 直接本机源码编译（不支持ubuntu18.04下GPU版本）      
 我们更加推荐**使用Docker进行编译**，因为我们在把工具和配置都安装在一个 Docker image 里。这样如果遇到问题，其他人可以复现问题以便帮助。另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。

--- a/doc/fluid/beginners_guide/install/compile/compile_Windows.md
+++ b/doc/fluid/beginners_guide/install/compile/compile_Windows.md
+***
+# **Windows下从源码编译**
+本说明将介绍如何在*64位台式机或笔记本电脑*以及Windows 10系统下编译PaddlePaddle，我们支持的Windows系统需满足以下要求：
+* Windows 10 家庭版/专业版/企业版
+* Visual Studio 2015 Update3
+## 确定要编译的版本
+* **仅支持CPU的PaddlePaddle**。
+<!--* 支持GPU的PaddlePaddle，为了使得PaddlePaddle程序运行得更加迅速，我们通常使用GPU对PaddlePaddle程序进行加速，但安装GPU版本的PaddlePaddle需要先拥有满足以下条件的NVIDIA® GPU（具体安装流程和配置请务必参见NVIDIA官方文档：[For CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/)，[For cuDNN](https://docs.nvidia.com/deeplearning/sdk/cudnn-install/)）
+	* *Cuda 工具包9.0配合cuDNN v7*
+	* *Cuda 工具包8.0配合cuDNN v7*
+	* *GPU运算能力超过1.0的硬件设备*-->
+## 选择如何编译
+我们在Windows的系统下提供1种编译方式：
+* 直接本机源码编译
+由于在本机上的情况更加复杂，因此我们只支持特定的系统。
+<a name="ct_source"></a>
+### ***本机编译***
+**请严格按照以下指令顺序执行**
+1. 检查您的计算机和操作系统是否符合我们支持的编译标准
+    * Windows 10 家庭版/专业版/企业版
+    * Visual Studio 2015 Update3
+2. 安装必要的工具 cmake，git 以及 python ：
+    > cmake 需要3.0 及以上版本, 可以在官网进行下载，并添加到环境变量中。 [下载地址](https://cmake.org/download/)
+    > git可以在官网进行下载，并添加到环境变量中。 [下载地址](https://gitforwindows.org/)
+    > python 需要2.7 及以上版本, 同时确保 `numpy, protobuf, wheel` 等模块得到安装 [下载地址](https://www.python.org/download/releases/2.7/)
+        * 安装 numpy 包可以通过命令 `pip install numpy` 或 `pip3 install numpy`
+        * 安装 protobuf 包可以通过命令 `pip install protobuf` 或 `pip3 install protobuf`
+        * 安装 wheel 包可以通过命令 `pip install wheel` 或 `pip3 install wheel`
+3. 将PaddlePaddle的源码clone到当前目录下的Paddle文件夹中，并进入Paddle目录：
+	- `git clone https://github.com/PaddlePaddle/Paddle.git`
+	- `cd Paddle`
+4. 切换到较稳定release分支下进行编译(支持1.2.x及以上版本)：
+	- `git checkout release/x.x.x`
+5. 创建名为build的目录并进入：
+	- `mkdir build`
+	- `cd build`
+6. 执行cmake：
+	>具体编译选项含义请参见[编译选项表](../Tables.html/#Compile)<!--TODO：Link 安装选项表到这里-->
+	*  对于需要编译**CPU版本PaddlePaddle**的用户：
+		For Python2: `cmake .. -G "Visual Studio 14 2015 Win64" -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIRS}
+			 -DPYTHON_LIBRARY=${PYTHON_LIBRARY}
+			 -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release`
+		For Python3: `cmake .. -G "Visual Studio 14 2015 Win64" -DPY_VERSION=3.5 -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIRS}
+			 -DPYTHON_LIBRARY=${PYTHON_LIBRARY}
+			 -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DWITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release`
+		> 如果遇到`Could NOT find PROTOBUF (missing:  PROTOBUF_LIBRARY PROTOBUF_INCLUDE_DIR)`可以重新执行一次cmake指令
+7. 部分第三方依赖包（openblas，snappystream）目前需要用户自己提供预编译版本，也可以到 `https://github.com/wopeizl/Paddle_deps` 下载预编译好的文件， 将整个 `third_party` 文件夹放到 `build` 目录下.
+8. 使用Blend for Visual Studio 2015 打开 `paddle.sln` 文件，选择平台为 `x64`，配置为 `Release`，开始编译
+9. 编译成功后进入 `\paddle\build\python\dist` 目录下找到生成的 `.whl` 包：
+	`cd \paddle\build\python\dist`
+10. 在当前机器或目标机器安装编译好的 `.whl` 包：
+	`pip install （whl包的名字）` 或 `pip3 install （whl包的名字）`
+恭喜您，现在您已经完成在本机编译PaddlePaddle的过程了。
+## ***验证安装***
+安装完成后您可以使用：`python` 进入Python解释器，然后使用 `import paddle.fluid`, 如没有提示错误，则表明安装成功。
+## ***如何卸载***
+请使用以下命令卸载PaddlePaddle：
+* ***CPU版本的PaddlePaddle***: `pip uninstall paddlepaddle` 或 `pip3 uninstall paddlepaddle`
--- a/doc/fluid/beginners_guide/install/compile/fromsource.rst
+++ b/doc/fluid/beginners_guide/install/compile/fromsource.rst
@@ -11,3 +11,4 @@
 	compile_Ubuntu.md
 	compile_CentOS.md
 	compile_MacOS.md
+	compile_Windows.md
--- a/doc/fluid/beginners_guide/install/install_Ubuntu.md
+++ b/doc/fluid/beginners_guide/install/install_Ubuntu.md
@@ -32,7 +32,7 @@
 **使用pip安装**（最便捷的安装方式），我们为您提供pip安装方法，但它更依赖您的本机环境，可能会出现和您本机环境相关的一些问题。
-**使用Docker进行安装**（最保险的安装方式），因为我们在把工具和配置都安装在一个 Docker image 里，这样如果遇到问题，其他人可以复现问题以便帮助。另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。                 
+**使用Docker进行安装**（最保险的安装方式），因为我们在把工具和配置都安装在一个 Docker image 里，这样如果遇到问题，其他人可以复现问题以便帮助。另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
@@ -46,14 +46,15 @@
 您可以直接粘贴以下命令到命令行来安装PaddlePaddle(适用于ubuntu16.04及以上安装CPU-ONLY的版本)，如果出现问题，您可以参照后面的解释对命令作出适应您系统的更改：
 Python2.7：
 	apt update && apt install -y python-dev python-pip && pip install paddlepaddle
-Python3.5（该指令适用于本机未安装python2的用户，否则，请卸载python2之后再使用本指令）：
+Python3.5（该指令适用于本机未安装python2的用户，否则，请卸载python2之后再使用本指令）：        
 	apt-get update && apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && apt-get install -y curl python3.5 python3.5-dev wget vim git && curl https://bootstrap.pypa.io/get-pip.py -o - | python3.5 && easy_install pip && pip3 install paddlepaddle
 首先，我们使用以下指令来**检测本机的环境**是否适合安装PaddlePaddle：
 `uname -m && cat /etc/*release`
@@ -63,25 +64,25 @@ Python3.5（该指令适用于本机未安装python2的用户，否则，请卸
 其次，您的电脑需要满足以下任一要求：
-*	Python2.7.x (dev)，Pip >= 9.0.1 
+*	Python2.7.x (dev)，Pip >= 9.0.1
-*	Python3.5.x (dev)，Pip3 >= 9.0.1    
+*	Python3.5.x (dev)，Pip3 >= 9.0.1
 	> 您的Ubuntu上可能已经安装pip，请使用pip -V或pip3 -V来确认。我们建议使用pip 9.0.1或更高版本来安装
 	更新apt的源：   `apt update`
-	使用以下命令安装或升级Python和pip到需要的版本：      
+	使用以下命令安装或升级Python和pip到需要的版本：
-	- For python2： `sudo apt install python-dev python-pip`  
+	- For python2： `sudo apt install python-dev python-pip`
 	- For python3：`sudo apt install python3.5-dev` and `curl https://bootstrap.pypa.io/get-pip.py -o - | python3.5 && easy_install pip`
 	> 即使您的环境中已经有Python2或Python3也需要安装Python-dev或Python3.5-dev。
 现在，让我们来安装PaddlePaddle：
 1. 使用pip install来安装PaddlePaddle
-	* 对于需要**CPU版本PaddlePaddle**的用户：`pip install paddlepaddle` 或 `pip3 install paddlepaddle`
+	* 对于需要**CPU版本PaddlePaddle**的用户：`pip install paddlepaddle` 或 `pip3 install paddlepaddle`        
 	* 对于需要**GPU版本PaddlePaddle**的用户：`pip install paddlepaddle-gpu` 或 `pip3 install paddlepaddle-gpu`
@@ -90,18 +91,20 @@ Python3.5（该指令适用于本机未安装python2的用户，否则，请卸
 			i. `wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb`     
 			ii.  `dpkg -i nvidia-machine-learning-repo-ubuntu1604_1.0.0-1_amd64.deb`       	 	          
 			iii. `sudo apt-get install -y libnccl2=2.2.13-1+cuda9.0 libnccl-dev=2.2.13-1+cuda9.0` 
 	> 2. 如果您不规定pypi包版本号，我们默认为您提供支持Cuda 9/cuDNN v7的PaddlePaddle版本。
-	对于出现`Cannot uninstall 'six'.`问题的用户，可是由于您的系统中已有的Python安装问题造成的，请使用`pip install paddlepaddle --ignore-installed six`（CPU）或`pip 	install paddlepaddle --ignore-installed six`（GPU）解决。      
+	对于出现`Cannot uninstall 'six'.`问题的用户，可能是由于您的系统中已有的Python安装问题造成的，请使用`pip install paddlepaddle --ignore-installed six`（CPU）或`pip install paddlepaddle-gpu --ignore-installed six`（GPU）解决。
-	* 对于有**其他要求**的用户：`pip install paddlepaddle==[版本号]` 或 `pip3 install paddlepaddle==[版本号]`       
+	* 对于有**其他要求**的用户：`pip install paddlepaddle==[版本号]` 或 `pip3 install paddlepaddle==[版本号]`
 	> `版本号`参见[安装包列表](./Tables.html/#whls)或者您如果需要获取并安装**最新的PaddlePaddle开发分支**，可以从我们的[CI系统](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) 中下载最新的whl安装包和c-api开发包并安装。如需登录，请点击“Log in as guest”。
 现在您已经完成使用`pip install` 来安装的PaddlePaddle的过程。
@@ -127,19 +130,19 @@ Python3.5（该指令适用于本机未安装python2的用户，否则，请卸
 	* 对于需要**CPU版本的PaddlePaddle**的用户请使用以下指令拉取我们为您预安装好*PaddlePaddle For CPU*的镜像：
 		`docker pull hub.baidubce.com/paddlepaddle/paddle:1.1`
 	* 对于需要**GPU版本的PaddlePaddle**的用户请使用以下指令拉取我们为您预安装好*PaddlePaddle For GPU*的镜像：
 		`docker pull hub.baidubce.com/paddlepaddle/paddle:1.1-gpu-cuda9.0-cudnn7`
 	* 您也可以通过以下指令拉取任意的我们提供的Docker镜像：
 		`docker pull hub.baidubce.com/paddlepaddle/paddle:[tag]`
 		> （请把[tag]替换为[镜像表](./Tables.html/#dockers)中的内容）
 2. 使用以下指令用已经拉取的镜像构建并进入Docker容器：
 	`docker run --name [Name of container] -it -v $PWD:/paddle <imagename> /bin/bash`
@@ -149,13 +152,13 @@ Python3.5（该指令适用于本机未安装python2的用户，否则，请卸
 3. （可选：当您需要第二次进入Docker容器中）使用如下命令使用PaddlePaddle：
 	`docker start [Name of container]`
 	> 启动之前创建的容器。
 	`docker attach [Name of container]`
 	> 进入启动的容器。
 至此您已经成功使用Docker安装PaddlePaddle，您只需要进入Docker容器后运行PaddlePaddle即可，更多Docker使用请参见[Docker官方文档](https://docs.docker.com)。
 > 注：PaddlePaddle Docker镜像为了减小体积，默认没有安装`vim`，您可以在容器中执行 `apt-get install -y vim` 安装后，在容器中编辑代码。

--- a/doc/fluid/beginners_guide/install/install_Windows.md
+++ b/doc/fluid/beginners_guide/install/install_Windows.md
@@ -12,25 +12,33 @@
 * Windows下我们目前仅提供支持CPU的PaddlePaddle。
 ## 选择如何安装
-在Windows系统下请使用我们为您提供的[一键安装包](http://paddle-windows.bj.bcebos.com/1.1/PaddlePaddle-windows-1.1.zip)进行安装
-> 我们提供的一键安装包将基于Docker为您进行便捷的安装流程
+### ***使用pip安装***
-我们之所以使用**基于Docker的安装方式**，是因为我们在把工具和配置都安装在一个 Docker image 里，这样如果遇到问题，其他人可以复现问题以便帮助。另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。        
+我们暂不提供快速安装的命令，请您按照以下步骤进行安装
+* 首先，**检查您的计算机和操作系统**是否满足以下要求：
+		For python2: 使用Python官方下载的python2.7.15
+		For python3: 使用Python官方下载的python3.5.x
+*  Python2.7.x，pip >= 9.0.1
+*  Python3.5.x，pip3 >= 9.0.1
+下面将说明如何安装PaddlePaddle：
+* 使用pip install来安装PaddlePaddle：
+    * paddlepaddle 的依赖包 `recordio` 有可能用 `pip` 的默认源无法安装，可以使用 `easy_install recordio` 来安装
+    * 对于需要**CPU版本PaddlePaddle**的用户：`pip install paddlepaddle` 或 `pip3 install paddlepaddle`
+现在您已经完成通过`pip install` 来安装的PaddlePaddle的过程。
-<br/><br/>
 ## ***验证安装***
 安装完成后您可以使用：`python` 或 `python3` 进入python解释器，然后使用`import paddle.fluid` 验证是否安装成功。
-<br/><br/>
 ## ***如何卸载***
 请使用以下命令卸载PaddlePaddle（使用docker安装PaddlePaddle的用户请进入包含PaddlePaddle的容器中使用以下命令）：

--- a/doc/fluid/book/index_en.rst
+++ b/doc/fluid/book/index_en.rst
+Book
+======
+..  toctree::
+  :maxdepth: 1
+  ../beginners_guide/basics/index.rst
+  ../beginners_guide/quick_start/index.rst
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -11,6 +11,5 @@ Development
  new_op_kernel.md
  use_eigen_en.md
  name_convention.md
-  support_new_device.md
  releasing_process_en.md
  op_markdown_format.md
--- a/doc/fluid/index_cn.rst
+++ b/doc/fluid/index_cn.rst
@@ -15,3 +15,4 @@
    user_guides/index.rst
    advanced_usage/index.rst
    api/index_cn.rst
+    release_note.rst
--- a/doc/fluid/index_en.rst
+++ b/doc/fluid/index_en.rst
@@ -9,5 +9,7 @@
  design/index_en.rst
  howto/index_en.rst
  dev/index_en.rst
-  faq/index_en.rst
  api/index_en.rst
+  book/index_en.rst
+  user_guides/models/index_en.rst
+  advanced_usage/deploy/index_mobile.rst
--- a/doc/fluid/release_note.rst
+++ b/doc/fluid/release_note.rst
+==============
+版本说明
+==============
+PaddlePaddle v1.1
+#####################
+PaddlePaddle v1.1 在基础框架、模型建设、分布式训练、预测引擎各个方向上完成多项更新。OP进行了全面完善和优化，模型库新增了自然语言处理、视觉和推荐等领域的大量经典模型，分布式训练能力显著提升，支持千亿规模稀疏参数大规模多机异步训练，预测库易用性和效率提升，移动端预测支持更多模型和更多硬件。详情如下：
+基础框架
+=========
+* 安装
+	* Mac OS X 10.11及以上pip安装支持。
+	* Mac OS X 10.12及以上从源码编译安装支持。
+* 编程语言
+	* Python3的支持（python3.5版本）。
+* IO
+	* 新增PyReader，支持用户基于python自定义数据读取和预处理的高性能数据输入。在ResNet50模型上，单机情况下：单卡数据读取速度提升4%、4卡数据读取速度提升38%、8卡数据读取速度提升60%。
+	* 实现一个多进程PyReader decorator，配合PyReader可以实现数据读取性能线性提升。
+* OP优化
+	* 优化了 :code:`split operator` ，显著提升性能。
+	* 扩展 :code:`multiclass_nms operator` ，支持多边形的预测框。
+	* 通过 :code:`generate_proposals operator` 的CUDA实现，显著提升性能。
+	* 通过 :code:`affine_channel operator` 融合batch_norm operator，显著提升性能。
+	* 优化 :code:`depthwise_conv operator` 的forward和backward，显著提升性能。
+	* 优化 :code:`reduce_mean operator` 。
+	* 优化 :code:`sum operator` ，该operator在输入是 :code:`Tensor` 的情况下，减少一次zero memory耗时。
+	* 优化 :code:`top_k operator` ，显著提升性能。
+	* 优化 :code:`sequence_pool operator` ，显著提升性能。
+	* 优化 :code:`elementwise_add operator` ，显著提升性能。
+	*  :code:`while operator` 性能优化，相关的模型性能整体提升 30%+。
+	*  :code:`sequence_slice operator` 的实现，对于一个sequence，可以从指定位置开始，slice出指定长度的subsequence。
+	*  :code:`sequence_unpad operator` 的实现，支持padding Tensor转LoDTensor。
+	* 支持截断正态分布初始化方法(truncated normal initializer)。
+	* 二维 :code:`padding operator` 的实现，支持每个维度的首尾padding不同的大小。
+	* 更多 operator支持： :code:`sequence_reverse operator` ， :code:`sequence_enumerate operator` , :code:`sequence_scatter operator` , :code:`roi_align operator` ， :code:`affine_channel operator` , :code:`anchor_generator operator` , :code:`generate_proposal_labels operator` , :code:`generate_proposals operator` , :code:`rpn_target_assign operator` 、 :code:`roi透视变换operator` ,  :code:`seq_pool operator` 、 :code:`seq_expand operator` 、 :code:`seq_concat operator` 、 :code:`seq_softmax operator` 、 :code:`lod_to_array operator` 、 :code:`array_to_lod operator` 。
+* 显存优化
+	* 显存优化策略eager deletion支持control flow (e.g. if-else, while)中子block的优化。显著降低包含control flow的模型的显存开销。
+模型建设
+=========
+* 自然语言处理方向增加开源语义匹配DAM模型和阅读理解BiDAF模型，机器翻译Transformer模型性能优化后训练速度提升超过30%，模型效果和训练速度均达到业界领先水平。
+* 计算机视觉方向增加开源OCR识别Seq2Seq-Attention模型，目标检测Faster-RCNN模型，图像语义分割DeepLab v3+模型，视频分类TSN模型，图像生成CycleGAN/ConditionalGAN/DCGAN模型，以及Deep Metric Learning模型，模型效果和训练速度均达到业界领先水平。
+* 个性化推荐任务系列模型支持：新闻自动标签模型TagSpace，序列语义召回模型GRU4Rec、SequenceSemanticRetrieval，点击率预估模型DeepCTR，多视角兴趣匹配模型multiview-simnet。
+	* TagSpace : TagSpace: Semantic Embeddings from Hashtags。
+	* SequenceSemanticRetrieval  : Multi-Rate Deep Learning for Temporal Recommendation。
+	* multiview-simnet  : A Multi-View Deep Learning Approach for Cross Domain User Modeling in Recommendation Systems。
+	* GRU4Rec  : Session-based Recommendations with Recurrent Neural Networks。
+	* DeepCTR  : DeepFM: A Factorization-Machine based Neural Network for CTR Prediction。
+* 公开的Quora数据集上，实现并复现了四个公开匹配算法，具有较好的通用性，可应用于NLP、搜索、推荐等场景。
+	* cdssmNet：Learning semantic representations using convolutional neural networks for web search 。
+	* decAttNet：Neural paraphrase identification of questions with noisy pretraining 。
+	* inferSentNet：Supervised learning of universal sentence representations from natural language inference data 。
+	* SSENet：Shortcut-stacked sentence encoders for multi-domain inference。
+分布式训练
+==========
+* GPU多机多卡同步训练支持参数同步频率可配置化，在V100上支持的batch size提升为v1.0版本的8倍，通过合理的通信间隔配置，使GPU卡数较少的情况下超大Batch同步训练成为可能，并在优化算法方面保证了收敛效果不变。
+* 支持千亿规模稀疏参数服务器，用于大规模多机异步训练，适用于推荐、广告等领域的点击率预估模型。
+预测引擎
+========
+* 服务器预测
+	* 预测库Windows支持。
+	* PaddlePredictor C++ 接口稳定版发布，已经实际支持一部分业务上线，并持续向前兼容。
+	* 预发布整合了 TensorRT 子图加速方案。运行时自动切割计算子图调用TensorRT加速。目前Paddle TensorRT 依旧在持续开发中，已支持的模型有 AlexNet, MobileNet, ResNet50, VGG19, ResNet, MobileNet-SSD等。
+	* 基于图优化的 CPU 加速 feature，实现了 LSTM，GRU 等一系列 operator 的 fuse，理论上可以大幅提升序列相关模型的性能。
+	* 增加了部署时 AVX 和 NOAVX 自动切换的feature，可以针对重点模型实现AVX, AVX2, AVX512自动切换。
+	* 提升预测库易用性：只需要 include一个头文件和一个库。
+	* ICNet 预测性能大幅提升。
+* 移动端预测
+	* 树莓派上MobileNet、GoogleNet、ResNet 34等多个模型支持。
+	* Mali GPU和Adreno GPU上MobileNet v1模型支持。
+	* ZU5、ZU9等FPGA开发板上ResNet 34和ResNet 50模型支持。
--- a/doc/fluid/user_guides/howto/inference/build_and_install_lib_cn.rst
+++ b/doc/fluid/user_guides/howto/inference/build_and_install_lib_cn.rst
@@ -22,18 +22,18 @@ cuda9.0_cudnn7_avx_mkl   `fluid_inference.tgz <https://guest:@paddleci.ngrok.io/
 ----------
 用户也可以从 PaddlePaddle 核心代码编译C++预测库，只需在编译时配置下面这些编译选项：
-=================   =========
+============================  =========
-选项                 值   
+选项                           值   
-=================   =========
+============================  =========
-CMAKE_BUILD_TYPE    Release
+CMAKE_BUILD_TYPE              Release
 FLUID_INFERENCE_INSTALL_DIR   安装路径    
-WITH_FLUID_ONLY     ON（推荐）
+WITH_FLUID_ONLY               ON（推荐）
-WITH_SWIG_PY        OFF（推荐
+WITH_SWIG_PY                  OFF（推荐）
-WITH_PYTHON         OFF（推荐）
+WITH_PYTHON                   OFF（推荐）
-WITH_GPU            ON/OFF
+ON_INFER                      ON（推荐）
-WITH_MKL            ON/OFF
+WITH_GPU                      ON/OFF
-ON_INFER            ON（预测优化）
+WITH_MKL                      ON/OFF
-=================   =========
+============================  =========
 建议按照推荐值设置，以避免链接不必要的库。其它可选编译选项按需进行设定。
@@ -67,7 +67,12 @@ ON_INFER            ON（预测优化）
     ├── CMakeCache.txt
     ├── paddle
     │   ├── include
-     │   │   └── paddle_inference_api.h
+     │   │   ├── paddle_anakin_config.h
+     │   │   ├── paddle_analysis_config.h
+     │   │   ├── paddle_api.h
+     │   │   ├── paddle_inference_api.h
+     │   │   ├── paddle_inference_pass.h
+     │   │   └── paddle_pass_builder.h
     │   └── lib
     │       ├── libpaddle_fluid.a
     │       └── libpaddle_fluid.so
@@ -80,10 +85,12 @@ ON_INFER            ON（预测优化）
     │   └── install
     │       ├── gflags
     │       ├── glog
+     │       ├── mkldnn
     │       ├── mklml
     │       ├── protobuf
     │       ├── snappy
     │       ├── snappystream
+     │       ├── xxhash
     │       └── zlib
     └── version.txt
@@ -91,9 +98,9 @@ version.txt 中记录了该预测库的版本信息，包括Git Commit ID、使
  .. code-block:: text
-     GIT COMMIT ID: 23da8defc8314b0c711130c1d9536e2cf2fb8414
+     GIT COMMIT ID: cc9028b90ef50a825a722c55e5fda4b7cd26b0d6
     WITH_MKL: ON
-     WITH_MKLDNN: OFF
+     WITH_MKLDNN: ON
     WITH_GPU: ON
     CUDA version: 8.0
     CUDNN version: v5
--- a/doc/fluid/user_guides/howto/inference/windows_cpp_inference.md
+++ b/doc/fluid/user_guides/howto/inference/windows_cpp_inference.md
@@ -8,79 +8,50 @@ Windows环境模型预测使用说明
 测试环境硬件配置：
-| CPU   |      I7-8700K      |
+| CPU      |      I7-8700K      |
-|----------|:-------------:|
+|:---------|:-------------------|
-| 内存 |  16G |
+| 内存 | 16G               |
-| 硬盘 |  1T hdd + 256G ssd |
+| 硬盘 | 1T hdd + 256G ssd |
-| 显卡 |  GTX1080 8G |
+| 显卡 | GTX1080 8G        |
-测试环境操作系统使用win10 Version 18.03 版本。下载地址：
+测试环境操作系统使用 win10 家庭版本。
 ### 环境配置步骤
-**一定要严格按照安装步骤顺序，否则会安装失败！**
+**请您严格按照以下步骤进行安装，否则可能会导致安装失败！**
-**安装vs2015**
+**安装Visual Studio 2015 update3**
-安装vs2015，安装选项中选择安装内容时勾选自定义，把关于c，c++，vc++的功能都安装上。下载地址：
+安装Visual Studio 2015，安装选项中选择安装内容时勾选自定义，选择安装全部关于c，c++，vc++的功能。
-**安装CUDA8**
-需要去NVIDIA官网[https://www.geforce.cn/drivers](https://www.geforce.cn/drivers)
-下载显卡对应的驱动。推荐391版本
-<p align="center">
- <img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image1.png" >
-</p>
-安装时需要勾选自定义，勾选安装全部。
-验证安装需要进入cmd中，输入nvcc -V查看。
-<p align="center">
-<img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image2.png">
-</p>
-如果有显卡安装驱动，也可以选择直接安装CUDA8.0，[https://developer.nvidia.com/cuda-80-ga2-download-archive](https://developer.nvidia.com/cuda-80-ga2-download-archive)
-**安装CUDNN**
-安装CUDNN只需要将文件中CUDNN
-7下的文件复制到对应的CUDA安装目录下。文件名，cudnn-8.0-windows10-x64-v7.zip。这里提供了cudnn
-7
-64位的版本。需要其他版本可在[https://developer.nvidia.com/cudnn](https://developer.nvidia.com/cudnn)
-下载。
 预测demo使用
 ------------
-解压Paddle，Release，fluid\_install\_dir压缩包。
+解压Paddle，Release，fluid_install_dir压缩包。
-进入Paddle/paddle/fluid/inference/api/demo\_ci目录，新建build目录并进入，然后使用cmake生成vs2015的solution文件。
+进入Paddle/paddle/fluid/inference/api/demo_ci目录，新建build目录并进入，然后使用cmake生成vs2015的solution文件。
 指令为：
-```cmake
-cmake .. -G \"Visual Studio 14 2015 Win64\" -DWITH\_GPU=ON
+`cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=simple_on_word2vec -DPADDLE_LIB=path_to_the_paddle\paddle_fluid.lib`
-DWITH\_MKL=OFF -DWITH\_STATIC\_LIB=ON -DCMAKE\_BUILD\_TYPE=Release
-DDEMO\_NAME=simple\_on\_word2vec
-DPADDLE\_LIB=D:\\to\_the\_paddle\_fluid.lib
-DCUDA\_LIB=D:\\CUDA\\v8.0\\lib\\x64
-```
 注：
-DDEMO\_NAME 是要编译的文件
+-DDEMO_NAME 是要编译的文件
-DPADDLE\_LIB 是fluid\_install\_dir路径，例如
+-DPADDLE_LIB 是fluid_install_dir路径，例如
-DPADDLE\_LIB=D:\\fluid\_install\_dir
+-DPADDLE_LIB=D:\fluid_install_dir
-DCUDA\_LIB 是CUDA安装目录对应的文件夹
-Cmake可以在官网进行下载，并添加到环境变量中。[[https://cmake.org/download/]{.underline}](https://cmake.org/download/)
+Cmake可以在[官网进行下载](https://cmake.org/download/)，并添加到环境变量中。
-执行完毕后，build目录如图所示，打开 箭头指向的solution文件：
+执行完毕后，build 目录如图所示，打开箭头指向的 solution 文件：
 <p align="center">
 <img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image3.png">
 </p>
-修改编译属性为/MT：
+修改编译属性为 `/MT` ：
 <p align="center">
 <img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image4.png">
@@ -90,7 +61,7 @@ Cmake可以在官网进行下载，并添加到环境变量中。[[https://cmake
 <img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image5.png">
 </p>
-编译生成选项改成Release。
+编译生成选项改成 `Release` 。
 <p align="center">
 <img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image6.png">
@@ -110,17 +81,13 @@ Cmake可以在官网进行下载，并添加到环境变量中。[[https://cmake
  1.  开启GLOG
-  	set GLOG\_v=3
+  	`set GLOG_v=100`
  2.  进行预测
-  	simple\_on\_word2vec.exe \--dirname=.\\word2vec.inference.model
+  	`simple_on_word2vec.exe --dirname=.\word2vec.inference.model`
 <p align="center">
 <img src="https://raw.githubusercontent.com/PaddlePaddle/FluidDoc/develop/doc/fluid/user_guides/howto/inference/image/image9.png">
 </p>
-**FAQ：**
-路径中尽量不要包含空格，例如发现CUDA\_LIB路径是Program
-Files(x86)可能会出错。可以将CUDA拷贝到一个新位置（这里直接拷贝就行）
--- a/doc/fluid/user_guides/howto/training/save_load_variables.rst
+++ b/doc/fluid/user_guides/howto/training/save_load_variables.rst
 .. _user_guide_save_load_vars:
 ##################
-保存与载入模型变量
+模型/变量的保存、载入与增量训练
 ##################
 模型变量分类
@@ -37,8 +37,6 @@
 那么我们应该将各种长期变量都保存下来，甚至还需要记录一下当前的epoch和step的id。
 因为一些模型变量虽然不是参数，但对于模型的训练依然必不可少。
-因此，根据需求的不同，我们提供了两套API来分别进行模型的参数和checkpoint的保存。
 保存模型用于对新样本的预测
 ==========================
@@ -61,44 +59,11 @@
 筛选出其中所有的模型参数，并将这些模型参数保存到指定的 :code:`param_path` 之中。
-保存checkpoint用于将来恢复训练
-==============================
-在训练过程中，我们可能希望在一些节点上将当前的训练状态保存下来，
-以便在将来需要的时候恢复训练环境继续进行训练。这一般被称作“checkpoint”。
-想要保存checkpoint，可以使用 :code:`fluid.io.save_checkpiont()` 接口。
-例如：
-.. code-block:: python
-    import paddle.fluid as fluid
-    exe = fluid.Executor(fluid.CPUPlace())
-    path = "./checkpoints"
-    prog = fluid.default_main_program()
-    trainer_args = {"epoch_id": 200,
-                    "step_id": 20} # just an example
-    fluid.io.save_checkpoint(executor=exe,
-                                checkpoint_dir=path,
-                                trainer_id=0,
-                                trainer_args=trainer_args,
-                                main_program=prog,
-                                max_num_checkpoints=3)
-上面的例子中，通过调用 :code:`fluid.io.save_checkpoint` 函数，PaddlePaddle Fluid会对默认
-:code:`fluid.Program` 也就是 :code:`prog` 中的所有模型变量进行扫描，
-根据一系列内置的规则自动筛选出其中所有需要保存的变量，并将他们保存到指定的 :code:`path` 目录下。
-:code:`fluid.io.save_checkpoint` 的各个参数中， :code:`trainer_id` 在单机情况下设置为0即可； :code:`trainer_args`
-为一个Python dict，用于给定当前的epoch_id和step_id；
-:code:`max_num_checkpoints` 用于表示的最大checkpoint数量，
-如果目录中已经存在的checkpoint数量超过这个值，那最早的checkpoint将被删除。
 如何载入模型变量
 ################
-与模型变量的保存相对应，我们提供了两套API来分别载入模型的参数和载入模型的checkpoint。
+与模型变量的保存相对应，我们提供了两套API来分别载入模型的参数和载入模型的长期变量。
 载入模型用于对新样本的预测
 ==========================
@@ -132,11 +97,29 @@
 之前。如果在之后运行，可能会覆盖已加载的模型参数导致错误。
-载入checkpoint用于恢复训练
+预测所用的模型与参数的保存
+##########################
+预测引擎提供了存储预测模型 :code:`fluid.io.save_inference_model` 和加载预测模型 :code:`fluid.io.load_inference_model` 两个接口。
+- :code:`fluid.io.save_inference_model`：请参考  :ref:`api_guide_inference`。
+-  :code:`fluid.io.load_inference_model`：请参考  :ref:`api_guide_inference`。
+增量训练
+############
+增量训练指一个学习系统能不断地从新样本中学习新的知识，并能保存大部分以前已经学习到的知识。因此增量学习涉及到两点：在上一次训练结束的时候保存需要持久化的参数， 在下一次训练开始的时候加载上一次保存的持久化参数。 因此增量训练涉及到如下几个API:
+:code:`fluid.io.save_persistables`、:code:`fluid.io.load_persistables` 。
+单机增量训练
 ==========================
+单机的增量训练的一般步骤如下：
+1. 在训练的最后调用 :code:`fluid.io.save_persistables` 保存持久性参数到指定的位置。
+2. 在训练的startup_program通过执行器 :code:`Executor` 执行成功之后调用 :code:`fluid.io.load_persistables` 加载之前保存的持久性参数。
+3. 通过执行器 :code:`Executor` 或者 :code:`ParallelExecutor` 继续训练。
-对于通过 :code:`fluid.io.save_checkpoint` 保存的模型，可以使用 :code:`fluid.io.load_checkpoint`
-来进行载入。
 例如：
@@ -145,101 +128,100 @@
    import paddle.fluid as fluid
    exe = fluid.Executor(fluid.CPUPlace())
-    path = "./checkpoints"
+    path = "./models"
    prog = fluid.default_main_program()
-    fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path,
+    fluid.io.save_persistables(exe, path, prog)
-                             serial=9, main_program=prog)
-上面的例子中，通过调用 :code:`fluid.io.save_checkpoint` 函数，PaddlePaddle Fluid会对
-:code:`prog` 中的所有模型变量进行扫描，根据内置规则自动筛选出需要加载的变量，
-并尝试从 :code:`path` 之中加载它们。
-参数 :code:`serial` 用来标记具体要加载的checkpoint的版本号。在保存checkpoint的时候，
+上面的例子中，通过调用 :code:`fluid.io.save_persistables` 函数，PaddlePaddle Fluid会从默认 :code:`fluid.Program` 也就是 :code:`prog` 的所有模型变量中找出长期变量，并将他们保存到指定的 :code:`path` 目录下。
-一个checkpoint会被保存在一个子目录中，并在目录名上体现出自己的版本号。
-一般越大的版本号表示这个checkpoint越新。
-这里的 :code:`prog` 必须和调用 :code:`fluid.io.save_checkpoint` 时所用的 :code:`prog`
-完全一致，否则会导致变量加载错误或者未加载。另外，与 :code:`fluid.io.save_params` 类似，
-运行 :code:`fluid.default_startup_program()` 也必须在 :code:`fluid.io.load_checkpoint`
-之前进行。
-多机checkpoint保存
+.. code-block:: python
-##################
-Checkpoint功能使用指南
-======================
-* 背景
-单机/多机在训练过程中会由于软件/硬件的问题出现异常，导致训练中断，进而导致训练无结果或结果不可用，浪费大量时间和机器性能。
-* 目的
+    import paddle.fluid as fluid
-Checkpoint功能能够在训练中途对训练数据中间数据进行保存，出现异常恢复训练的时候能够加载中途保存的数据继续训练， 实现单机/多机的容错训练的功能。
-* 说明
+    exe = fluid.Executor(fluid.CPUPlace())
+    path = "./models"
+    startup_prog = fluid.default_startup_program()
+    exe.run(startup_prog)
+    fluid.io.load_persistables(exe, path, startup_prog)
+    main_prog = fluid.default_main_program()
+    exe.run(main_prog)
+上面的例子中，通过调用 :code:`fluid.io.load_persistables` 函数，PaddlePaddle Fluid会从默认
+:code:`fluid.Program` 也就是 :code:`startup_prog` 的所有模型变量中找出长期变量，从指定的 :code:`path` 目录中将它们一一加载， 然后再继续进行训练。
-  * 目前已实现的参数保存：
-  1. 基于Trainer 0 实现训练过程中的参数保存
-  2. 基于PServer 实现了`Distribute Lookup Table`相关参数保存
+多机增量（不带分布式大规模稀疏矩阵）训练
+========================================
+多机增量训练和单机增量训练有若干不同点：
-  * Fluid Checkpoint 保存数据目录结构：
+1. 在训练的最后调用 :code:`fluid.io.save_persistables` 保存持久性参数时，不必所有的trainer都调用这个方法，一般由0号trainer来保存。
+2. 多机增量训练的参数加载在PServer端，trainer端不用加载参数。在PServer全部启动后，trainer会从PServer端同步参数。
-.. code-block:: python
+多机增量（不启用分布式大规模稀疏矩阵）训练的一般步骤为：
-    checkpoint_dir (用户定义的checkpoint目录)
+1. 0号trainer在训练的最后调用 :code:`fluid.io.save_persistables` 保存持久性参数到指定的 :code:`path` 下。
-    ├── checkpoint_0 (第一次保存)
+2. 通过HDFS等方式将0号trainer保存下来的所有的参数共享给所有的PServer(每个PServer都需要有完整的参数)。
-    │   ├── __lockup_table__ (Distribute Lookup Table 目录)
+3. PServer在训练的startup_program通过执行器（:code:`Executor`）执行成功之后调用 :code:`fluid.io.load_persistables` 加载0号trainer保存的持久性参数。
-    │   │   ├── table_pserver_0 (Pserver 0 号保存的lookup table 数据)
+4. PServer通过执行器 :code:`Executor` 继续启动PServer_program.
-    │   │   └── table_pserver_1
+5. 所有的训练节点trainer通过执行器 :code:`Executor` 或者 :code:`ParallelExecutor` 正常训练。
-    │   ├── __model__ (model 目录)
-    │   │   └── var.w_1
-    │   └── trainer_0 (trainer 自有数据保存)
-    │       ├── epoch_id
-    │       └── step_id
-    └── checkpoint_1 (第二次保存)
-* 使用方法
-  * 声明Fluid.CheckpointConfig
+对于训练过程中待保存参数的trainer， 例如：
-  用户对checkpoint功能的配置，主要是配置对象 :code:`Fluid` 中的 :code:`CheckpointConfig` .
+.. code-block:: python
-  :code:`CheckpointConfig` 包括4个参数：
+    import paddle.fluid as fluid
-  =====================   =====  ==========================
+    exe = fluid.Executor(fluid.CPUPlace())
-          参数             类型            说明
+    path = "./models"
-  =====================   =====  ==========================
+    trainer_id = 0
-    checkpoint_dir         int    checkpoint存储目录
+    if trainer_id == 0:
+        prog = fluid.default_main_program()
+        fluid.io.save_persistables(exe, path, prog)
-    max_num_checkpoints    int    最大保存的checkpoint副本数
-    epoch_interval         int    每隔epoch_interval轮epoch
+.. code-block:: bash
+    hadoop fs -mkdir /remote/$path
+    hadoop fs -put $path /remote/$path
-    step_interval          int      每隔step_interval轮step
+上面的例子中，0号trainer通过调用 :code:`fluid.io.save_persistables` 函数，PaddlePaddle Fluid会从默认
-  =====================   =====  ==========================
+:code:`fluid.Program` 也就是 :code:`prog` 的所有模型变量中找出长期变量，并将他们保存到指定的 :code:`path` 目录下。然后通过调用第三方的文件系统（如HDFS）将存储的模型上传到所有PServer都可访问的位置。
-  * 在Fluid.Trainer对象的声明中加入Fluid.CheckpointConfig的声明
+对于训练过程中待载入参数的PServer， 例如：
-  Trainer的__init__方法的参数中包含了对 :code:`CheckpointConfig` ， 需要传入在声明Trainer前声明的 :code:`CheckpointConfig` 对象。
-  如：
-  .. code-block:: python
+.. code-block:: bash
+    hadoop fs -get /remote/$path $path
-      config = CheckpointConfig(
-          checkpoint_dir = "/tmp/ckpt", max_num_checkpoints = 2,
-          epoch_interval = 2, step_interval = 10)
-      trainer = Trainer(..., checkpoint_config=config)
-定义和声明完成后， 训练在运行过程中就会在指定的step和epoch处进行保存，出现异常时，就会自动从最新的checkpoint目录进行参数恢复啦！
+.. code-block:: python
-* 相关API
+    import paddle.fluid as fluid
-  `Trainer API 说明 <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/trainer.py>`_
+    exe = fluid.Executor(fluid.CPUPlace())
+    path = "./models"
+    pserver_endpoints = "127.0.0.1:1001,127.0.0.1:1002"
+    trainers = 4
+    trainer_id = 0
+    training_role = "PSERVER"
+    config = fluid.DistributeTranspilerConfig()
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers, sync_mode=True)
+    if training_role == "PSERVER":
+        current_endpoint = "127.0.0.1:1001"
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
+        fluid.io.load_persistables(exe, path, pserver_startup)
+        exe.run(pserver_prog)
+    if training_role == "TRAINER":
+        main_program = t.get_trainer_program()
+        exe.run(main_program)
+上面的例子中，每个PServer通过调用HDFS的命令获取到0号trainer保存的参数，通过配置获取到PServer的 :code:`fluid.Program` ，PaddlePaddle Fluid会从此
+:code:`fluid.Program` 也就是 :code:`pserver_startup` 的所有模型变量中找出长期变量，并从指定的 :code:`path` 目录下一一加载。
-* 注意
-1. 保证每个训练的 :code:`checkpoint_dir` 与其他训练独立。
-2. 最大副本数量 :code:`max_num_checkpoints` 需要根据磁盘容量以及模型的大小进行调整， 保证磁盘的可用性。
-3.  :code:`epoch_interval`  和  :code:`step_interval`  不宜过小， 频繁的进行checkpoint会拖慢训练速度。
-4.  **分布式训练** 的过程中：每个Trainer都会在 :code:`checkpoint_dir` 目录中保存当前Trainer的参数（只有Trainer 0会保存模型的参数），需要 **分布式文件系统(HDFS等)** 将同 :code:`checkpoint_dir` 目录的数据进行合并才能得到完整的数据，恢复训练的时候需要用完整的数据进行恢复。
--- a/doc/fluid/user_guides/index.rst
+++ b/doc/fluid/user_guides/index.rst
@@ -19,7 +19,7 @@
    - `训练神经网络 <../user_guides/howto/training/index.html>`_：介绍如何使用 Fluid 进行单机训练、多机训练、以及保存和载入模型变量
    - `模型评估与调试 <../user_guides/howto/evaluation_and_debugging/index.html>`_：介绍在 Fluid 下进行模型评估和调试的方法，包括：
      - `模型评估 <../user_guides/howto/evaluation_and_debugging/evaluation/metrics.html>`_：介绍常用模型评估指标的构造方法
      - `Visual DL 工具 <../user_guides/howto/evaluation_and_debugging/debug/visualdl.html>`_：介绍如何利用 Visual DL 工具可视化训练过程
@@ -28,7 +28,7 @@
 基于 Fluid 复现的多领域经典模型：
-    - `Fluid 模型库 <../user_guides/models/index.html>`_
+    - `Fluid 模型库 <../user_guides/models/index_cn.html>`_
 ==============
@@ -43,5 +43,5 @@
    howto/training/index
    howto/evaluation_and_debugging/index
    howto/inference/index
-    models/index.rst
+    models/index_cn.rst
    design_idea/fluid_design_idea.md
--- a/doc/fluid/user_guides/models/index.rst
+++ b/doc/fluid/user_guides/models/index.rst
-../../../../external/models/fluid/README.cn.rst
\ No newline at end of file
--- a/doc/fluid/user_guides/models/index_cn.rst
+++ b/doc/fluid/user_guides/models/index_cn.rst
+`Fluid 模型库 <https://github.com/PaddlePaddle/models/tree/develop/fluid>`__
+================================================================================
+图像分类
+--------
+图像分类是根据图像的语义信息对不同类别图像进行区分，是计算机视觉中重要的基础问题，是物体检测、图像分割、物体跟踪、行为分析、人脸识别等其他高层视觉任务的基础，在许多领域都有着广泛的应用。如：安防领域的人脸识别和智能视频分析等，交通领域的交通场景识别，互联网领域基于内容的图像检索和相册自动归类，医学领域的图像识别等。
+在深度学习时代，图像分类的准确率大幅度提升，在图像分类任务中，我们向大家介绍了如何在经典的数据集ImageNet上，训练常用的模型，包括AlexNet、VGG、GoogLeNet、ResNet、Inception-v4、MobileNet、DPN(Dual
+Path
+Network)、SE-ResNeXt模型，也开源了\ `训练的模型 <https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleCV/image_classification/README_cn.md#已有模型及其性能>`__\ 方便用户下载使用。同时提供了能够将Caffe模型转换为PaddlePaddle
+Fluid模型配置和参数文件的工具。
+-  `AlexNet <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification/models>`__
+-  `VGG <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification/models>`__
+-  `GoogLeNet <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification/models>`__
+-  `Residual
+   Network <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification/models>`__
+-  `Inception-v4 <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification/models>`__
+-  `MobileNet <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification/models>`__
+-  `Dual Path
+   Network <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification/models>`__
+-  `SE-ResNeXt <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification/models>`__
+-  `Caffe模型转换为Paddle
+   Fluid配置和模型文件工具 <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/caffe2fluid>`__
+目标检测
+--------
+目标检测任务的目标是给定一张图像或是一个视频帧，让计算机找出其中所有目标的位置，并给出每个目标的具体类别。对于人类来说，目标检测是一个非常简单的任务。然而，计算机能够“看到”的是图像被编码之后的数字，很难理解图像或是视频帧中出现了人或是物体这样的高层语义概念，也就更加难以定位目标出现在图像中哪个区域。与此同时，由于目标会出现在图像或是视频帧中的任何位置，目标的形态千变万化，图像或是视频帧的背景千差万别，诸多因素都使得目标检测对计算机来说是一个具有挑战性的问题。
+在目标检测任务中，我们介绍了如何基于\ `PASCAL
+VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`__\ 、\ `MS
+COCO <http://cocodataset.org/#home>`__\ 数据训练通用物体检测模型，当前介绍了SSD算法，SSD全称Single Shot MultiBox Detector，是目标检测领域较新且效果较好的检测算法之一，具有检测速度快且检测精度高的特点。
+开放环境中的检测人脸，尤其是小的、模糊的和部分遮挡的人脸也是一个具有挑战的任务。我们也介绍了如何基于 `WIDER FACE <http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/>`_ 数据训练百度自研的人脸检测PyramidBox模型，该算法于2018年3月份在WIDER FACE的多项评测中均获得 `第一名 <http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/WiderFace_Results.html>`_。
+-  `Single Shot MultiBox
+   Detector <https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleCV/object_detection/README_cn.md>`__
+-  `Face Detector: PyramidBox <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/face_detection/README_cn.md>`_
+图像语义分割
+------------
+图像语义分割顾名思义是将图像像素按照表达的语义含义的不同进行分组/分割，图像语义是指对图像内容的理解，例如，能够描绘出什么物体在哪里做了什么事情等，分割是指对图片中的每个像素点进行标注，标注属于哪一类别。近年来用在无人车驾驶技术中分割街景来避让行人和车辆、医疗影像分析中辅助诊断等。
+在图像语义分割任务中，我们介绍如何基于图像级联网络(Image Cascade
+Network,ICNet)进行语义分割，相比其他分割算法，ICNet兼顾了准确率和速度。
+-  `ICNet <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/icnet>`__
+图像生成
+-----------
+图像生成是指根据输入向量，生成目标图像。这里的输入向量可以是随机的噪声或用户指定的条件向量。具体的应用场景有：手写体生成、人脸合成、风格迁移、图像修复等。当前的图像生成任务主要是借助生成对抗网络（GAN）来实现。
+生成对抗网络（GAN）由两种子网络组成：生成器和识别器。生成器的输入是随机噪声或条件向量，输出是目标图像。识别器是一个分类器，输入是一张图像，输出是该图像是否是真实的图像。在训练过程中，生成器和识别器通过不断的相互博弈提升自己的能力。
+在图像生成任务中，我们介绍了如何使用DCGAN和ConditionalGAN来进行手写数字的生成，另外还介绍了用于风格迁移的CycleGAN.
+- `DCGAN & ConditionalGAN <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/gan/c_gan>`__
+- `CycleGAN <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/gan/cycle_gan>`__
+场景文字识别
+------------
+许多场景图像中包含着丰富的文本信息，对理解图像信息有着重要作用，能够极大地帮助人们认知和理解场景图像的内容。场景文字识别是在图像背景复杂、分辨率低下、字体多样、分布随意等情况下，将图像信息转化为文字序列的过程，可认为是一种特别的翻译过程：将图像输入翻译为自然语言输出。场景图像文字识别技术的发展也促进了一些新型应用的产生，如通过自动识别路牌中的文字帮助街景应用获取更加准确的地址信息等。
+在场景文字识别任务中，我们介绍如何将基于CNN的图像特征提取和基于RNN的序列翻译技术结合，免除人工定义特征，避免字符分割，使用自动学习到的图像特征，完成字符识别。当前，介绍了CRNN-CTC模型和基于注意力机制的序列到序列模型。
+-  `CRNN-CTC模型 <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/ocr_recognition>`__
+-  `Attention模型 <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/ocr_recognition>`__
+度量学习
+--------
+度量学习也称作距离度量学习、相似度学习，通过学习对象之间的距离，度量学习能够用于分析对象之间的关联、比较关系，在实际问题中应用较为广泛，可应用于辅助分类、聚类问题，也广泛用于图像检索、人脸识别等领域。以往，针对不同的任务，需要选择合适的特征并手动构建距离函数，而度量学习可根据不同的任务来自主学习出针对特定任务的度量距离函数。度量学习和深度学习的结合，在人脸识别/验证、行人再识别(human Re-ID)、图像检索等领域均取得较好的性能，在这个任务中我们主要介绍了基于Fluid的深度度量学习模型，包含了三元组、四元组等损失函数。
+- `Metric Learning <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/metric_learning>`__
+视频分类
+--------
+视频分类是视频理解任务的基础，与图像分类不同的是，分类的对象不再是静止的图像，而是一个由多帧图像构成的、包含语音数据、包含运动信息等的视频对象，因此理解视频需要获得更多的上下文信息，不仅要理解每帧图像是什么、包含什么，还需要结合不同帧，知道上下文的关联信息。视频分类方法主要包含基于卷积神经网络、基于循环神经网络、或将这两者结合的方法。该任务中我们介绍基于Fluid的视频分类模型，目前包含Temporal Segment Network(TSN)模型，后续会持续增加更多模型。
+- `TSN <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/video_classification>`__
+语音识别
+--------
+自动语音识别（Automatic Speech Recognition,
+ASR）是将人类声音中的词汇内容转录成计算机可输入的文字的技术。语音识别的相关研究经历了漫长的探索过程，在HMM/GMM模型之后其发展一直较为缓慢，随着深度学习的兴起，其迎来了春天。在多种语言识别任务中，将深度神经网络(DNN)作为声学模型，取得了比GMM更好的性能，使得
+ASR
+成为深度学习应用最为成功的领域之一。而由于识别准确率的不断提高，有越来越多的语音技术产品得以落地，例如语音输入法、以智能音箱为代表的智能家居设备等
+—— 基于语音的交互方式正在深刻地改变人类的生活。
+与 `DeepSpeech <https://github.com/PaddlePaddle/DeepSpeech>`__
+中深度学习模型端到端直接预测字词的分布不同，本实例更接近传统的语音识别流程，以音素为建模单元，关注语音识别中声学模型的训练，利用\ `kaldi <http://www.kaldi-asr.org>`__\ 进行音频数据的特征提取和标签对齐，并集成
+kaldi 的解码器完成解码。
+-  `DeepASR <https://github.com/PaddlePaddle/models/blob/develop/fluid/DeepASR/README_cn.md>`__
+机器翻译
+--------
+机器翻译（Machine
+Translation）将一种自然语言(源语言)转换成另一种自然语言（目标语言），是自然语言处理中非常基础和重要的研究方向。在全球化的浪潮中，机器翻译在促进跨语言文明的交流中所起的重要作用是不言而喻的。其发展经历了统计机器翻译和基于神经网络的神经机器翻译(Neural
+Machine Translation, NMT)等阶段。在 NMT
+成熟后，机器翻译才真正得以大规模应用。而早阶段的 NMT
+主要是基于循环神经网络 RNN
+的，其训练过程中当前时间步依赖于前一个时间步的计算，时间步之间难以并行化以提高训练速度。因此，非
+RNN 结构的 NMT 得以应运而生，例如基于卷积神经网络 CNN
+的结构和基于自注意力机制（Self-Attention）的结构。
+本实例所实现的 Transformer
+就是一个基于自注意力机制的机器翻译模型，其中不再有RNN或CNN结构，而是完全利用
+Attention 学习语言中的上下文依赖。相较于RNN/CNN,
+这种结构在单层内计算复杂度更低、易于并行化、对长程依赖更易建模，最终在多种语言之间取得了最好的翻译效果。
+-  `Transformer <https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleNLP/neural_machine_translation/transformer/README_cn.md>`__
+强化学习
+--------
+强化学习是近年来一个愈发重要的机器学习方向，特别是与深度学习相结合而形成的深度强化学习(Deep
+Reinforcement Learning,
+DRL)，取得了很多令人惊异的成就。人们所熟知的战胜人类顶级围棋职业选手的
+AlphaGo 就是 DRL
+应用的一个典型例子，除游戏领域外，其它的应用还包括机器人、自然语言处理等。
+深度强化学习的开山之作是在Atari视频游戏中的成功应用，
+其可直接接受视频帧这种高维输入并根据图像内容端到端地预测下一步的动作，所用到的模型被称为深度Q网络(Deep
+Q-Network, DQN)。本实例就是利用PaddlePaddle Fluid这个灵活的框架，实现了
+DQN 及其变体，并测试了它们在 Atari 游戏中的表现。
+-  `DeepQNetwork <https://github.com/PaddlePaddle/models/blob/develop/fluid/DeepQNetwork/README_cn.md>`__
+中文词法分析
+------------
+中文分词(Word Segmentation)是将连续的自然语言文本，切分出具有语义合理性和完整性的词汇序列的过程。因为在汉语中，词是承担语义的最基本单位，切词是文本分类、情感分析、信息检索等众多自然语言处理任务的基础。 词性标注（Part-of-speech Tagging）是为自然语言文本中的每一个词汇赋予一个词性的过程，这里的词性包括名词、动词、形容词、副词等等。 命名实体识别（Named Entity Recognition，NER）又称作“专名识别”，是指识别自然语言文本中具有特定意义的实体，主要包括人名、地名、机构名、专有名词等。 我们将这三个任务统一成一个联合任务，称为词法分析任务，基于深度神经网络，利用海量标注语料进行训练，提供了一个端到端的解决方案。
+我们把这个联合的中文词法分析解决方案命名为LAC。LAC既可以认为是Lexical Analysis of Chinese的首字母缩写，也可以认为是LAC Analyzes Chinese的递归缩写。
+- `LAC <https://github.com/baidu/lac/blob/master/README.md>`__
+情感倾向分析
+------------
+情感倾向分析针对带有主观描述的中文文本，可自动判断该文本的情感极性类别并给出相应的置信度。情感类型分为积极、消极、 中性。情感倾向分析能够帮助企业理解用户消费习惯、分析热点话题和危机舆情监控，为企业提供有力的决策支持。本次我们开放 AI开放平台中情感倾向分析采用的\ `模型 <http://ai.baidu.com/tech/nlp/sentiment_classify>`__\， 提供给用户使用。
+- `Senta <https://github.com/baidu/Senta/blob/master/README.md>`__
+语义匹配
+--------
+在自然语言处理很多场景中，需要度量两个文本在语义上的相似度，这类任务通常被称为语义匹配。例如在搜索中根据查询与候选文档的相似度对搜索结果进行排序，文本去重中文本与文本相似度的计算，自动问答中候选答案与问题的匹配等。
+本例所开放的DAM (Deep Attention Matching Network)为百度自然语言处理部发表于ACL-2018的工作，用于检索式聊天机器人多轮对话中应答的选择。DAM受Transformer的启发，其网络结构完全基于注意力(attention)机制，利用栈式的self-attention结构分别学习不同粒度下应答和语境的语义表示，然后利用cross-attention获取应答与语境之间的相关性，在两个大规模多轮对话数据集上的表现均好于其它模型。
+- `Deep Attention Matching Network <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleNLP/deep_attention_matching_net>`__
+AnyQ
+----
+`AnyQ <https://github.com/baidu/AnyQ>`__\ (ANswer Your Questions)
+开源项目主要包含面向FAQ集合的问答系统框架、文本语义匹配工具SimNet。
+问答系统框架采用了配置化、插件化的设计，各功能均通过插件形式加入，当前共开放了20+种插件。开发者可以使用AnyQ系统快速构建和定制适用于特定业务场景的FAQ问答系统，并加速迭代和升级。
+SimNet是百度自然语言处理部于2013年自主研发的语义匹配框架，该框架在百度各产品上广泛应用，主要包括BOW、CNN、RNN、MM-DNN等核心网络结构形式，同时基于该框架也集成了学术界主流的语义匹配模型，如MatchPyramid、MV-LSTM、K-NRM等模型。使用SimNet构建出的模型可以便捷的加入AnyQ系统中，增强AnyQ系统的语义匹配能力。
+-  `SimNet in PaddlePaddle
+   Fluid <https://github.com/baidu/AnyQ/blob/master/tools/simnet/train/paddle/README.md>`__
+机器阅读理解
+------------
+机器阅读理解(MRC)是自然语言处理(NLP)中的核心任务之一，最终目标是让机器像人类一样阅读文本，提炼文本信息并回答相关问题。深度学习近年来在NLP中得到广泛使用，也使得机器阅读理解能力在近年有了大幅提高，但是目前研究的机器阅读理解都采用人工构造的数据集，以及回答一些相对简单的问题，和人类处理的数据还有明显差距，因此亟需大规模真实训练数据推动MRC的进一步发展。
+百度阅读理解数据集是由百度自然语言处理部开源的一个真实世界数据集，所有的问题、原文都来源于实际数据(百度搜索引擎数据和百度知道问答社区)，答案是由人类回答的。每个问题都对应多个答案，数据集包含200k问题、1000k原文和420k答案，是目前最大的中文MRC数据集。百度同时开源了对应的阅读理解模型，称为DuReader，采用当前通用的网络分层结构，通过双向attention机制捕捉问题和原文之间的交互关系，生成query-aware的原文表示，最终基于query-aware的原文表示通过point network预测答案范围。
+-  `DuReader in PaddlePaddle Fluid <https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleNLP/machine_reading_comprehension/README.md>`__
+个性化推荐
+----------
+推荐系统在当前的互联网服务中正在发挥越来越大的作用，目前大部分电子商务系统、社交网络，广告推荐，搜索引擎，都不同程度的使用了各种形式的个性化推荐技术，帮助用户快速找到他们想要的信息。
+在工业可用的推荐系统中，推荐策略一般会被划分为多个模块串联执行。以新闻推荐系统为例，存在多个可以使用深度学习技术的环节，例如新闻的自动化标注，个性化新闻召回，个性化匹配与排序等。PaddlePaddle对推荐算法的训练提供了完整的支持，并提供了多种模型配置供用户选择。
+- `TagSpace <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/TagSpace>`_
+- `GRU4Rec <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/gru4rec>`_
+- `SequenceSemanticRetrieval <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/ssr>`_
+- `DeepCTR <https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleRec/ctr/README.cn.md>`_
+- `Multiview-Simnet <https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleRec/multiview_simnet>`_
--- a/doc/fluid/user_guides/models/index_en.rst
+++ b/doc/fluid/user_guides/models/index_en.rst
+############################################################################
+`Models <https://github.com/PaddlePaddle/models/tree/develop/fluid>`_
+############################################################################
+PaddlePaddle provides a rich set of computational units to enable users to adopt a modular approach to solving various learning problems. In this repo, we demonstrate how to use PaddlePaddle to solve common machine learning tasks, providing several different neural network models that anyone can easily learn and use.
+- `fluid models <https://github.com/PaddlePaddle/models/tree/develop/fluid>`_ : use PaddlePaddle's Fluid APIs. We especially recommend that users use Fluid models.
+- `legacy models <https://github.com/PaddlePaddle/models/tree/develop/legacy>`_ : use PaddlePaddle's v2 APIs.
--- a/Anakin @ 65178d41
+++ b/Anakin @ 65178d41
-Subproject commit beec126e4cfe762e4b6b542496069323dca35ee7
+Subproject commit 65178d41c3a61ba846f1e94909e3cb50a8c19c92
--- a/Paddle @ cc9028b9
+++ b/Paddle @ cc9028b9
-Subproject commit cb27a9219d8dfc02be49484ce697495886a3e6fb
+Subproject commit cc9028b90ef50a825a722c55e5fda4b7cd26b0d6
--- a/paddle-mobile @ 2c088e20
+++ b/paddle-mobile @ 2c088e20
-Subproject commit 73e2f989e78e59e6fafbf5d973e36ad17418c64a
+Subproject commit 2c088e20d8083accacaf2057bc35531ac7fba7ce