Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into grpc_benchmark

9a729aec · typhoonzero · 57f666fb · 6b475981 · 9a729aec · 9a729aec
711 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
 paddle/pybind/pybind.h
+python/paddle/version.py
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,10 +16,14 @@ cmake_minimum_required(VERSION 3.0)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")

 include(system)

 project(paddle CXX C Go)
+message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION})
+message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION})

 find_package(Sphinx)
 if(NOT CMAKE_CROSSCOMPILING)
@@ -56,6 +60,7 @@ option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
 option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
+option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)

 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -81,6 +86,8 @@ if(ANDROID OR IOS)
        "Disable RDMA when cross-compiling for Android and iOS" FORCE)
    set(WITH_MKL OFF CACHE STRING
        "Disable MKL when cross-compiling for Android and iOS" FORCE)
+    set(WITH_GOLANG OFF CACHE STRING
+        "Disable golang when cross-compiling for Android and iOS" FORCE)

    # Compile PaddlePaddle mobile inference library
    if (NOT WITH_C_API)

--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@


 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -36,7 +36,7 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
    examples:

      - Optimized math operations through SSE/AVX intrinsics, BLAS libraries
-      (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
+      (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels.
      - Highly optimized recurrent networks which can handle **variable-length**
      sequence without padding.
      - Optimized local and distributed training for models with high dimensional
@@ -61,32 +61,32 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 ## Installation

 It is recommended to check out the
-[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).
+[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html).

 ## Documentation

-We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
-[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation.
+We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
+[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.

- [Deep Learning 101](http://book.paddlepaddle.org/index.html)
+- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)

  You might want to start from this online interactive book that can run in a Jupyter Notebook.

- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html)

  You can run distributed training jobs on MPI clusters.

- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html)

   You can also run distributed training jobs on Kubernetes clusters.

- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html)
+- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html)

   Our new API enables much shorter programs.

- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html)

   We appreciate your contributions!


--- a/RELEASE.cn.md
+++ b/RELEASE.cn.md
+# v0.11.0版本
+
+## PaddlePaddle Fluid
+
+- PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中，不再有*模型*这个概念，应用也不再包含一个用于描述Operator图或者一系列层的符号描述，而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流，例如 if-else-then或者for，而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如：
+
+  https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44
+
+- 在v0.11.0版本中，我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中，我们将提升和优化Executor成为一个调试器，就像GDB。并可能提供一些编译器，这个编译器会读取一个上文所描述的应用然后编译成一个等价的
+源代码，这个源代码可以被nvcc编译成可以使用CUDA的二进制，或者被icc编译成可以充分利用Intel CPU的二进制。
+
+
+## 新特点
+
+* 发布 `PaddlePaddle Fluid`。
+* 增加了用于模型预测的C-API。
+* 用Fluid API实现了一个简单的GAN的例子。
+* 增加了关于性能调优的文档。
+* 为`paddle.v2.dataset`下载数据集提供了重试机制.
+* C++中使用protobuf-lite替换protobuf减少了二进制的大小。
+* 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment).
+* 基于Bazel API利用cmake实现了一个的新的构建系统函数库。
+* 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库.
+* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn):
+  - 完成了 11个 MKL-DNN 层: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。
+  - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogleNet
+  - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。
+* 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign)
+* 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod)
+* 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance)
+* 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq)
+* 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score)
+* 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice)
+* 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv)
+* 增加移动端友好的网页
+
+## 改进
+
+* 使用一个Python`whl`包即可安装.
+* [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。
+* 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。
+* 删除了有一些bug的BarrierStat。
+* 清理和删除了paddle::Parameter中未使用的函数。
+* 删除了ProtoDataProvider。
+* Huber loss同时支持回归和分类。
+* 为sequence pooling 层增加`stride`参数。
+* v2 API自动使用cudnn batch normalization。
+* 可以使用一个固定的参数名共享BN层的参数。
+* 2D convolution operation支持variable-dimension input特性。
+* 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。
+* 优化网页导航。
+
+## 错误修复
+
+* 修复ROI pooling的Bug. cc9a761
+* 修复当label是dense vector是AUC变成0的问题. #5274
+* 修复WarpCTC 层的Bug.
+
+
 # v0.10.0版本

 我们非常高兴发布了PaddlePaddle V0.10.0版，并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。

--- a/RELEASE.md
+++ b/RELEASE.md
+# Release v0.11.0
+
+## PaddlePaddle Fluid
+
+- Release 0.11.0 includes a new feature *PaddlePaddle Fluid*.  Fluid is
+  designed to allow users to program like PyTorch and TensorFlow Eager Execution.
+  In these systems, there is no longer the concept *model* and applications
+  do not include a symbolic description of a graph of operators nor a sequence
+  of layers. Instead, applications look exactly like a usual program that
+  describes a process of training or inference.  The difference between
+  Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's
+  control-flow, `if-then-else` nor `for`.  Instead, Fluid provides its
+  C++ implementations and their Python binding using the `with` statement.  For an example
+
+  https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44
+
+- In 0.11.0, we provides a C++ class `Executor` to run a Fluid program.
+Executor works like an interpreter. In future version, we will improve
+`Executor` into a debugger like GDB, and we might provide some compilers,
+which, for example, takes an application like the above one, and outputs
+an equivalent C++ source program, which can be compiled using
+[`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html)
+to generate binaries that use CUDA, or using
+[`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries
+that make full use of Intel CPUs.
+
+## New Features
+
+* Release `PaddlePaddle Fluid`.
+* Add C-API for model inference
+* Use fluid API to create a simple GAN demo.
+* Add develop guide about performance tunning.
+* Add retry when download `paddle.v2.dataset`.
+* Linking protobuf-lite not protobuf in C++. Reduce the binary size.
+* Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released.
+* A new style cmake functions for Paddle. It is based on Bazel API.
+* Automatically download and compile with Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when build `WITH_MKL=ON`.
+* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn):
+  - Complete 11 MKL-DNN layers: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN.
+  - Complete 3 MKL-DNN networks: VGG-19, ResNet-50, GoogleNet
+  - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML.
+* Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign).
+* Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod).
+* Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance).
+* Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq).
+* Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score).
+* Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice).
+* Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv)
+* Add mobile friendly webpages.
+
+## Improvements
+
+* Build and install using a single `whl` package.
+* [Custom evaluating in V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标).
+* Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices.
+* Remove buggy BarrierStat.
+* Clean and remove unused functions in paddle::Parameter.
+* Remove ProtoDataProvider.
+* Huber loss supports both regression and classification.
+* Add the `stride` parameter  for sequence pooling layers.
+* Enable v2 API use cudnn batch normalization automatically.
+* The BN layer's parameter can be shared by a fixed the parameter name.
+* Support variable-dimension input feature for 2D convolution operation.
+* Refine cmake about CUDA to automatically detect GPU architecture.
+* Improved website navigation.
+
+## Bug Fixes
+
+* Fix bug in ROI pooling. cc9a761
+* Fix AUC is zero when label is dense vector. #5274
+* Fix bug in WarpCTC layer.
+
 # Release v0.10.0

 We are glad to release version 0.10.0.  In this version, we are happy to release the new 

--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -2,27 +2,25 @@

 Machine:

- Server
- 	- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
- Laptop
- 	- DELL XPS15-9560-R1745: i7-7700HQ 8G 256GSSD
- 	- i5 MacBook Pro (Retina, 13-inch, Early 2015)
- Desktop
- 	- i7-6700k
+- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
+- Laptop: TBD

 System: CentOS release 6.3 (Final), Docker 1.12.1.

-PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS)
- MKL-DNN tag v0.11
- MKLML 2018.0.1.20171007
- OpenBLAS v0.2.20
-(TODO: will rerun after 0.11.0)
+PaddlePaddle: (TODO: will rerun after 0.11.0)
+- paddlepaddle/paddle:latest (for MKLML and MKL-DNN)
+  - MKL-DNN tag v0.11
+  - MKLML 2018.0.1.20171007
+- paddlepaddle/paddle:latest-openblas (for OpenBLAS)
+  - OpenBLAS v0.2.20
 	 
 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.

 ## Benchmark Model

 ### Server
+
+#### Training
 Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz

 Input image size - 3 * 224 * 224, Time: images/second
@@ -35,9 +33,7 @@ Input image size - 3 * 224 * 224, Time: images/second
 | MKLML        | 12.12 | 13.70 | 16.18  |
 | MKL-DNN      | 28.46 | 29.83 | 30.44  |

-
-chart on batch size 128
-TBD
+<img src="figs/vgg-cpu-train.png" width="500">

 - ResNet-50

@@ -47,9 +43,7 @@ TBD
 | MKLML        | 32.52 | 31.89 | 33.12  |
 | MKL-DNN      | 81.69 | 82.35 | 84.08  |

-
-chart on batch size 128
-TBD
+<img src="figs/resnet-cpu-train.png" width="500">

 - GoogLeNet

@@ -59,10 +53,35 @@ TBD
 | MKLML        | 128.46| 137.89| 158.63 |
 | MKL-DNN      | 250.46| 264.83| 269.50 |

-chart on batch size 128
-TBD
+<img src="figs/googlenet-cpu-train.png" width="500">
+
+#### Inference
+Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+- VGG-19
+
+| BatchSize | 1     | 2     | 4     | 8     | 16    |
+|-----------|-------|-------|-------|-------|-------|
+| OpenBLAS  | 1.07  | 1.08  | 1.06  | 0.88  | 0.65  |
+| MKLML     | 5.58  | 9.80  | 15.15 | 21.21 | 28.67 |
+| MKL-DNN   | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
+
+- ResNet-50
+
+| BatchSize | 1     | 2      | 4      | 8      | 16     |
+|-----------|-------|--------|--------|--------|--------|
+| OpenBLAS  | 3.35  | 3.19   | 3.09   | 2.55   | 1.96   |
+| MKLML     | 6.33  | 12.02  | 22.88  | 40.53  | 63.09  |
+| MKL-DNN   | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
+
+
+- GoogLeNet
+
+| BatchSize | 1      | 2      | 4      | 8      | 16     |
+|-----------|--------|--------|--------|--------|--------|
+| OpenBLAS  | 12.04  | 11.31  | 10.00  | 9.07   | 4.34   |
+| MKLML     | 22.74  | 41.56  | 81.22  | 133.47 | 210.53 |
+| MKL-DNN   | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
+

 ### Laptop
 TBD
-### Desktop
-TBD
--- a/benchmark/figs/googlenet-cpu-train.png
+++ b/benchmark/figs/googlenet-cpu-train.png
--- a/benchmark/figs/resnet-cpu-train.png
+++ b/benchmark/figs/resnet-cpu-train.png
--- a/benchmark/figs/vgg-cpu-train.png
+++ b/benchmark/figs/vgg-cpu-train.png
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@@ -6,8 +6,18 @@ height = 227
 width = 227
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+gp = get_config_arg('layer_num', int, 1)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer,
+    'num_samples': num_samples
+}
 define_py_data_sources2(
    "train.list", None, module="provider", obj="process", args=args)

@@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)

 # conv2
 net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
 net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
 net = img_pool_layer(input=net, pool_size=3, stride=2)

@@ -40,11 +50,11 @@ net = img_conv_layer(
    input=net, filter_size=3, num_filters=384, stride=1, padding=1)
 # conv4
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)

 # conv5
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
 net = img_pool_layer(input=net, pool_size=3, stride=2)

 net = fc_layer(
@@ -59,6 +69,9 @@ net = fc_layer(
    layer_attr=ExtraAttr(drop_rate=0.5))
 net = fc_layer(input=net, size=1000, act=SoftmaxActivation())

-lab = data_layer('label', num_class)
-loss = cross_entropy(input=net, label=lab)
-outputs(loss)
+if is_infer:
+    outputs(net)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=net, label=lab)
+    outputs(loss)
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
 use_gpu = get_config_arg('use_gpu', bool, True)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
    "train.list" if not is_infer else None,

--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
    else:
        settings.data_size = settings.height * settings.width
    settings.is_infer = kwargs.get('is_infer', False)
+    settings.num_samples = kwargs.get('num_samples', 2560)
    if settings.is_infer:
        settings.slots = [dense_vector(settings.data_size)]
    else:
@@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
 @provider(
    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(2560 if settings.is_infer else 1024):
+    for i in xrange(settings.num_samples):
        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
        if settings.is_infer:
            yield img.astype('float32')

--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg("layer_num", int, 50)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
    "train.list" if not is_infer else None,

--- a/benchmark/paddle/image/run_mkldnn_infer.sh
+++ b/benchmark/paddle/image/run_mkldnn_infer.sh
@@ -4,7 +4,7 @@ function clock_to_seconds() {
  hours=`echo $1 | awk -F ':' '{print $1}'`
  mins=`echo $1 | awk -F ':' '{print $2}'`
  secs=`echo $1 | awk -F ':' '{print $3}'`
-  echo `bc -l <<< "$secs + $mins * 60 + $hours * 3600"`
+  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
 }

 function infer() {
@@ -37,7 +37,7 @@ function infer() {
      --trainer_count=1 \
      --num_passes=1 \
      --save_dir="models/${topology}-${layer_num}" \
-      --config_args="batch_size=128,layer_num=${layer_num}" \
+      --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
      > /dev/null 2>&1
    echo "Done"
  fi
@@ -58,9 +58,9 @@ function infer() {
  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  start_sec=`clock_to_seconds $start`
  end_sec=`clock_to_seconds $end`
-  fps=`bc <<< "scale = 2; 1280 / ($end_sec - $start_sec)"`
+  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
-  echo "FPS: $fps images/sec" >> ${log}
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }

 if [ ! -f "train.list" ]; then
@@ -79,8 +79,9 @@ fi
 # inference benchmark
 for use_mkldnn in True False; do
  for batchsize in 1 2 4 8 16; do
-    infer googlenet v1 $batchsize $use_mkldnn
-    infer resnet 50 $batchsize $use_mkldnn
    infer vgg 19 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer alexnet 2 $batchsize $use_mkldnn
  done
 done
--- a/benchmark/paddle/image/run_mkldnn_train.sh
+++ b/benchmark/paddle/image/run_mkldnn_train.sh
@@ -28,6 +28,10 @@ function train() {
    --test_period=100 \
    --config_args=$args \
    2>&1 | tee ${log} 
+
+  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }

 if [ ! -f "train.list" ]; then
@@ -43,5 +47,6 @@ for use_mkldnn in True False; do
    train vgg 19 $batchsize $use_mkldnn
    train resnet 50 $batchsize $use_mkldnn
    train googlenet v1 $batchsize $use_mkldnn
+    train alexnet 2 $batchsize $use_mkldnn
  done
 done
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
+set -e
+
+function clock_to_seconds() {
+  hours=`echo $1 | awk -F ':' '{print $1}'`
+  mins=`echo $1 | awk -F ':' '{print $2}'`
+  secs=`echo $1 | awk -F ':' '{print $3}'`
+  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
+}
+
+function infer() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  thread=`nproc`
+  if [ $thread -gt $bs ]; then
+    thread=$bs
+  fi
+  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+
+  models_in="models/${topology}-${layer_num}/pass-00000/"
+  if [ ! -d $models_in ]; then
+    echo "./run_mkl_infer.sh to save the model first"
+    exit 0
+  fi
+  log_period=$((32 / bs))
+  paddle train --job=test \
+    --config="${topology}.py" \
+    --use_mkldnn=False \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=$log_period \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
+    --init_model_path=$models_in \
+    2>&1 | tee ${log}
+
+  # calculate the last 5 logs period time of 160(=32*5) samples,
+  # the time before are burning time.
+  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  start_sec=`clock_to_seconds $start`
+  end_sec=`clock_to_seconds $end`
+  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+  echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# inference benchmark
+for batchsize in 1 2 4 8 16; do
+  infer vgg 19 $batchsize
+  infer resnet 50 $batchsize 
+  infer googlenet v1 $batchsize
+  infer alexnet 2 $batchsize
+done
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
+set -e
+
+function train() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  thread=`nproc`
+  # each trainer_count use only 1 core to avoid conflict
+  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+  args="batch_size=${bs},layer_num=${layer_num}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config=$config \
+    --use_mkldnn=False \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=3 \
+    --test_period=30 \
+    --config_args=$args \
+    2>&1 | tee ${log} 
+
+  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# training benchmark
+for batchsize in 64 128 256; do
+  train vgg 19 $batchsize
+  train resnet 50 $batchsize
+  train googlenet v1 $batchsize
+  train alexnet 2 $batchsize
+done
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg('layer_num', int, 19)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
    "train.list" if not is_infer else None,

--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -3,7 +3,7 @@
 # It will search MKLML, atlas, OpenBlas, reference-cblas in order.
 #
 # If any cblas implementation found, the following variable will be set.
-#    CBLAS_PROVIDER  # one of MKLML, ATLAS, OPENBLAS, REFERENCE
+#    CBLAS_PROVIDER  # one of MKLML, OPENBLAS, REFERENCE
 #    CBLAS_INC_DIR   # the include directory for cblas.
 #    CBLAS_LIBS      # a list of libraries should be linked by paddle.
 #                    # Each library should be full path to object file.
@@ -17,7 +17,7 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
  set(CBLAS_INC_DIR ${MKLML_INC_DIR})
  set(CBLAS_LIBRARIES ${MKLML_LIB})

-  add_definitions(-DPADDLE_USE_MKLML)
+  add_definitions(-DPADDLE_WITH_MKLML)
  add_definitions(-DLAPACK_FOUND)

  message(STATUS "Found cblas and lapack in MKLML "
@@ -25,42 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
  return()
 endif()

-## Then find atlas.
-set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
-set(ATLAS_INCLUDE_SEARCH_PATHS
-        ${ATLAS_ROOT}/include
-        /usr/include
-        /usr/include/atlas)
-set(ATLAS_LIB_SEARCH_PATHS
-        ${ATLAS_ROOT}/lib
-        /usr/lib
-        /usr/lib/blas/atlas
-        /usr/lib/atlas
-        /usr/lib/atlas-base   # special for ubuntu 14.04.
-    )
-find_path(ATLAS_INC_DIR NAMES cblas.h
-  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
-  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
-  PATHS ${ATLAS_LIB_SEARCH_PATHS})
-find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3
-  PATHS ${ATLAS_LIB_SEARCH_PATHS})
-
-if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER ATLAS)
-  set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
-  set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB})
-
-  add_definitions(-DPADDLE_USE_ATLAS)
-  add_definitions(-DLAPACK_FOUND)
-
-  message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
-  return()
-endif()
-
 ## Then find openblas.
 set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
 set(OPENBLAS_INCLUDE_SEARCH_PATHS

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -24,6 +24,11 @@ if(WITH_DOUBLE)
    add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)

+if(WITH_ARM_FP16)
+    add_definitions(-DPADDLE_ARM_FP16)
+    add_definitions("-march=armv8.2-a+fp16+simd")
+endif(WITH_ARM_FP16)
+
 if(WITH_TESTING)
    add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)

--- a/cmake/external/cares.cmake
+++ b/cmake/external/cares.cmake
@@ -33,7 +33,7 @@ ExternalProject_Add(
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
    BUILD_IN_SOURCE 1
-    BUILD_COMMAND   make
+    BUILD_COMMAND   make -j8
    INSTALL_COMMAND make install
 )


--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -67,5 +67,5 @@ ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
 ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
 MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
-add_definitions(-DPADDLE_USE_MKLDNN)
+add_definitions(-DPADDLE_WITH_MKLDNN)
 LIST(APPEND external_project_dependencies mkldnn)
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -114,11 +114,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
 FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
-    ADD_LIBRARY(cblas SHARED ${dummyfile})
-ELSE()
-    ADD_LIBRARY(cblas STATIC ${dummyfile})
-ENDIF()
+ADD_LIBRARY(cblas STATIC ${dummyfile})
 TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})

 IF(NOT ${CBLAS_FOUND})

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND)
    IF(WITH_C_API)
        INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
        IF(ANDROID)
-            INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
        ELSE()
-            INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib)
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
        ENDIF()
    ENDIF()


--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
@@ -7,3 +7,4 @@ API
    模型配置 <v2/model_configs.rst>
    数据访问 <v2/data.rst>
    训练与应用 <v2/run_logic.rst>
+    v2/fluid.rst
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -7,3 +7,4 @@ API
    v2/model_configs.rst
    v2/data.rst
    v2/run_logic.rst
+    v2/fluid.rst
--- a/doc/api/v2/config/activation.rst
+++ b/doc/api/v2/config/activation.rst
@@ -99,3 +99,10 @@ STanh
 ..  automodule:: paddle.v2.activation
    :members: STanh
    :noindex:
+    
+SoftSign
+========
+
+..  automodule:: paddle.v2.activation
+    :members: SoftSign
+    :noindex:
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -467,7 +467,7 @@ lambda_cost
    :noindex:

 square_error_cost
--------
+-----------------
 ..  autoclass:: paddle.v2.layer.square_error_cost
    :noindex:

@@ -533,7 +533,7 @@ Miscs
 =====

 dropout
--------------
+--------
 ..  autoclass:: paddle.v2.layer.dropout
    :noindex:


--- a/doc/api/v2/fluid.rst
+++ b/doc/api/v2/fluid.rst
+======================
+Fluid
+======================
+
+..  toctree::
+    :maxdepth: 1
+
+    fluid/layers.rst
+    fluid/data_feeder.rst
+    fluid/executor.rst
+    fluid/initializer.rst
+    fluid/evaluator.rst
+    fluid/nets.rst
+    fluid/optimizer.rst
+    fluid/param_attr.rst
+    fluid/profiler.rst
+    fluid/regularizer.rst
+
--- a/doc/api/v2/fluid/data_feeder.rst
+++ b/doc/api/v2/fluid/data_feeder.rst
+===========
+DataFeeder
+===========
+
+DataFeeder
+-----------
+..  automodule:: paddle.v2.fluid.data_feeder
+    :members: DataFeeder
+    :noindex:
--- a/doc/api/v2/fluid/evaluator.rst
+++ b/doc/api/v2/fluid/evaluator.rst
+===========
+Evaluator
+===========
+
+Evaluator
+-----------
+..  automodule:: paddle.v2.fluid.evaluator
+    :members: Evaluator
+    :noindex:
--- a/doc/api/v2/fluid/executor.rst
+++ b/doc/api/v2/fluid/executor.rst
+===========
+Executor
+===========
+
+Executor
+-----------
+..  automodule:: paddle.v2.fluid.executor
+    :members: Executor
+    :noindex:
--- a/doc/api/v2/fluid/initializer.rst
+++ b/doc/api/v2/fluid/initializer.rst
+===========
+Initializer
+===========
+
+
+
+Initializer
+-----------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: Initializer
+    :noindex:
+
+
+
+ConstantInitializer
+-------------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: ConstantInitializer
+    :noindex:
+
+
+
+UniformInitializer
+------------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: UniformInitializer
+    :noindex:
+
+
+
+NormalInitializer
+-----------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: NormalInitializer
+    :noindex:
+
+
+XavierInitializer
+-----------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: XavierInitializer
+    :noindex:
+
+
+MSRAInitializer
+---------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: MSRAInitializer
+    :noindex:
+
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
+==========
+Layers
+==========
+
+
+fc
+---
+..  autofunction:: paddle.v2.fluid.layers.fc
+    :noindex:
+
+embedding
+---------
+..  autofunction:: paddle.v2.fluid.layers.embedding
+    :noindex:
+
+dynamic_lstm
+------------
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+    :noindex:
+
+data
+----
+..  autofunction:: paddle.v2.fluid.layers.data
+    :noindex:
+
+mean
+----
+..  autofunction:: paddle.v2.fluid.layers.mean
+    :noindex:
+
+mul
+---
+..  autofunction:: paddle.v2.fluid.layers.mul
+    :noindex:
+
+elementwise_add
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_add
+    :noindex:
+
+elementwise_div
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_div
+    :noindex:
+
+
+dropout
+-------
+..  autofunction:: paddle.v2.fluid.layers.dropout
+    :noindex:
+
+
+reshape
+--------
+..  autofunction:: paddle.v2.fluid.layers.reshape
+    :noindex:
+
+
+sigmoid
+---------
+..  autofunction:: paddle.v2.fluid.layers.sigmoid
+    :noindex:
+
+
+scale
+---------
+..  autofunction:: paddle.v2.fluid.layers.scale
+    :noindex:
+
+
+reshape
+---------
+..  autofunction:: paddle.v2.fluid.layers.reshape
+    :noindex:
+
+
+transpose
+---------
+..  autofunction:: paddle.v2.fluid.layers.transpose
+    :noindex:
+
+
+sigmoid_cross_entropy_with_logits
+---------------------------------
+..  autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
+    :noindex:
+
+
+cast
+----
+..  autofunction:: paddle.v2.fluid.layers.cast
+    :noindex:
+
+
+concat
+-------
+..  autofunction:: paddle.v2.fluid.layers.concat
+    :noindex:
+
+
+sums
+----
+..  autofunction:: paddle.v2.fluid.layers.sums
+    :noindex:
+
+
+linear_chain_crf
+----------------
+..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+    :noindex:
+
+
+assign
+-------
+..  autofunction:: paddle.v2.fluid.layers.embedding
+    :noindex:
+
+
+split_lod_tensor
+----------------
+..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
+    :noindex:
+
+
+merge_lod_tensor
+----------------
+..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+    :noindex:
+
+cos_sim
+--------
+..  autofunction:: paddle.v2.fluid.layers.cos_sim
+    :noindex:
+
+
+cross_entropy
+-------------
+..  autofunction:: paddle.v2.fluid.layers.cross_entropy
+    :noindex:
+
+
+
+square_error_cost
+-----------------
+..  autofunction:: paddle.v2.fluid.layers.square_error_cost
+    :noindex:
+
+
+accuracy
+---------
+..  autofunction:: paddle.v2.fluid.layers.accuracy
+    :noindex:
+
+
+sequence_conv
+-------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_conv
+    :noindex:
+
+
+conv2d
+------
+..  autofunction:: paddle.v2.fluid.layers.conv2d
+    :noindex:
+
+
+sequence_pool
+-------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_pool
+    :noindex:
+
+
+pool2d
+------
+..  autofunction:: paddle.v2.fluid.layers.pool2d
+    :noindex:
+
+
+batch_norm
+----------
+..  autofunction:: paddle.v2.fluid.layers.batch_norm
+    :noindex:
+
+
+beam_search_decode
+------------------
+..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
+    :noindex:
+
+
+lod_rank_table
+--------------
+..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
+    :noindex:
+
+
+max_sequence_len
+----------------
+..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
+    :noindex:
+
+
+topk
+-----
+..  autofunction:: paddle.v2.fluid.layers.topk
+    :noindex:
+
+
+lod_tensor_to_array
+-------------------
+..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
+    :noindex:
+
+
+
+array_to_lod_tensor
+-------------------
+..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
+    :noindex:
+
+
+
+
+fill_constant
+-------------
+..  autofunction:: paddle.v2.fluid.layers.fill_constant
+    :noindex:
+
+
+
+fill_constant_batch_size_like
+-----------------------------
+..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+    :noindex:
+
+
+ones
+----
+..  autofunction:: paddle.v2.fluid.layers.ones
+    :noindex:
+
+
+zeros
+-----
+..  autofunction:: paddle.v2.fluid.layers.zeros
+    :noindex:
+
+
+increment
+---------
+..  autofunction:: paddle.v2.fluid.layers.increment
+    :noindex:
+
+
+array_write
+-----------
+..  autofunction:: paddle.v2.fluid.layers.array_write
+    :noindex:
+
+
+
+create_array
+------------
+..  autofunction:: paddle.v2.fluid.layers.create_array
+    :noindex:
+
+
+less_than
+---------
+..  autofunction:: paddle.v2.fluid.layers.less_than
+    :noindex:
+
+
+array_read
+----------
+..  autofunction:: paddle.v2.fluid.layers.array_read
+    :noindex:
+
+
+shrink_memory
+--------------
+..  autofunction:: paddle.v2.fluid.layers.shrink_memory
+    :noindex:
+
+
+array_length
+-------------
+..  autofunction:: paddle.v2.fluid.layers.array_length
+    :noindex:
+
+
+conv2d_transpose
+----------------
+..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
+    :noindex:
+
+
+sequence_expand
+---------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+    :noindex:
+
+
+lstm_unit
+---------
+..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+    :noindex:
+
+
+sequence_softmax
+----------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
+    :noindex:
+
+
+reduce_sum
+----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+    :noindex:
+
+
+reduce_mean
+---------
+..  autofunction:: paddle.v2.fluid.layers.reduce_mean
+    :noindex:
+
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
+===========
+Nets
+===========
+
+simple_img_conv_pool
+--------------------
+..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
+    :noindex:
+
+
+img_conv_group
+---------------
+..  autofunction:: paddle.v2.fluid.nets.img_conv_group
+    :noindex:
+
+
+sequence_conv_pool
+------------------
+..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
+    :noindex:
+
+
--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
+===========
+Optimizer
+===========
+
+Optimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: Optimizer
+    :noindex:
+
+
+SGDOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: SGDOptimizer
+    :noindex:
+
+
+
+MomentumOptimizer
+-----------------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: MomentumOptimizer
+    :noindex:
+
+
+
+AdagradOptimizer
+----------------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdagradOptimizer
+    :noindex:
+
+
+AdamOptimizer
+-------------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdamOptimizer
+    :noindex:
+
+
+AdamaxOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdamaxOptimizer
+    :noindex:
+
+
+DecayedAdagradOptimizer
+-----------------------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: DecayedAdagradOptimizer
+    :noindex:
+
--- a/doc/api/v2/fluid/param_attr.rst
+++ b/doc/api/v2/fluid/param_attr.rst
+===========
+ParamAttr
+===========
+
+
+
+ParamAttr
+-----------
+..  automodule:: paddle.v2.fluid.param_attr
+    :members: ParamAttr
+    :noindex:
--- a/doc/api/v2/fluid/profiler.rst
+++ b/doc/api/v2/fluid/profiler.rst
+===========
+Profiler
+===========
+
+
+
+Profiler
+-----------
+..  autofunction:: paddle.v2.fluid.profiler.cuda_profiler
+    :noindex:
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
+===========
+Regularizer
+===========
+
+WeightDecayRegularizer
+----------------------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: WeightDecayRegularizer
+    :noindex:
+
+
+L2DecayRegularizer
+------------------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: L2DecayRegularizer
+    :noindex:
+
+
+
+L1DecayRegularizer
+-------------------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: L1DecayRegularizer
+
+
--- a/doc/design/evaluator.md
+++ b/doc/design/evaluator.md
 ## Evaluator Design

-### The Problem
+### Problem Statement

-During training or serving, we provide the evaluation function to measure the model performance, e.g., accuracy, precision. In the operator based framework design, the data go through the network pipeline batch by batch. As a result, inside the operator, we only can calculate one minibatch metrics. We need to provide a mechanism to calculate the metrics for each N pass/batch the user wanted.
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants.

 ### Evaluator Design
-Currently, every operation is expressed in the graph. we divide the evaluator process into three steps.
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.

 1. Initialize the metric state and add it into the block.

-2. Calculate the statistic of the metric state in every mini-batch. The single operator is only responsible for calculating necessary statistics for one mini-batch. For example, accuracy operator only calculate a minibatch data if run once.
+2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch data if run once.


 3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.

 ### Implementation
-This design is shown in python API. 
-Each metric operator need to caculate the metric statistic and return the batch aware states, Python side responsible for accumulate the states for each pass. 
+This design is shown in the Python API. 
+Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. 

    
 ```python

--- a/doc/design/executor.md
+++ b/doc/design/executor.md
 # Executor Design Doc

 ## Motivation
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
+[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).

-We use executor to do the runtime evaluation of a `ProgramDesc`.
+The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, executor explicitly executes the stored precompiled code.

 ## Overview

-An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instance, which is persistent throughout different runs.
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.

-### What does executor do?
+## Executor

-It evaluates all the operators in the `block_id`th block of a `ProgramDesc`.
+The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
+It is very similar to how a push stack frame works when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not however, have the stack frame pop process.

-### What does executor NOT do?
+### The interface
+```c++
+  Executor(places);
+```
+A executor does not own any computing resources, a user can only construct an executor using the specified places.

-It does not do runtime optimization, meaning intelligently parse the dependency of each op a choose which one to be run and in which order they should be run.
+### Running an Executor

-It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices.
-
-## Implementation
-
-`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then run all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)
+```
+  void Run(ProgramDesc, Scope, block_id, create_local_scope);
+```
+An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished.
--- a/doc/design/fluid-compiler.graffle
+++ b/doc/design/fluid-compiler.graffle
--- a/doc/design/fluid-compiler.png
+++ b/doc/design/fluid-compiler.png
--- a/doc/design/fluid.md
+++ b/doc/design/fluid.md
+# Design Doc: PaddlePaddle Fluid
+
+## Why Fluid
+
+When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe.  However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
+
+Fluid is the answer.  Fluid is similar to PyTorch and TensorFlow Eager Execution, which describes the "process" of training or inference using the concept of a model.  In fact in PyTorch, TensorFlow Eager Execution and Fluid, there is no  concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in the above mentioned idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the directions of a compiler and a new programming language for deep learning.
+
+## The Evolution of Deep Learning Systems
+
+Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
+
+| Existed since | model as sequence of layers | model as graph of operators | No model |
+|--|--|--|--|
+| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
+| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
+| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+
+From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model.  To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
+
+## Deep Learning Programming Paradigms
+
+With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
+
+```python
+x = layer.data("image")
+l = layer.data("label")
+f = layer.fc(x, W)
+s = layer.softmax(f)
+c = layer.mse(l, s)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    forward({input=x, data=m}, minimize=c)
+    backward(...)
+
+print W # print the trained model parameters.
+```
+
+The above program includes two parts:
+
+1. The first part describes the model, and
+2. The second part describes the training process (or inference process) for the model.
+
+This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt.
+
+This problem of being hard to debug and re-iterate fast on a program is the primary reason that programmers, in general,  prefer PyTorch over the older systems.  Using PyTorch, we would write the above program as following:
+
+```python
+W = tensor(...)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    x = m["image"]
+    l = m["label"]
+    f = layer.fc(x, W)
+    s = layer.softmax(f)
+    c = layer.mse(l, s)
+    backward()
+
+print W # print the trained model parameters.
+```
+
+We can see that the main difference is the moving the model configuration part (the first step) into the training loop.  This change would allow the mistakes in model configuration to be reported where they actually appear in the programming block.  This change also represents the model better, or its forward pass, by keeping the configuration process in the training loop.
+
+## Describe Arbitrary Models for the Future
+
+Describing the process instead of the model also brings Fluid, the flexibility to define different non-standard models that haven't been invented yet.
+
+As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator.  A PyTorch example would look like the following:
+
+```python
+for i in xrange(1000):
+    m = read_minibatch()
+    x = m["sentence"]
+    for t in xrange x.len():
+        h[t] = the_step(x[t])
+```        
+
+With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
+
+```python
+train_loop = layers.While(cond)
+with train_loop.block():
+  m = read_minibatch()
+  x = m["sentence"]
+  rnn = layers.While(...)
+  with rnn.block():
+    h[t] = the_step(input[t])
+```    
+
+An actual Fluid example is described  [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44).
+
+From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
+
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+
+## Turing Completeness
+
+In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine.  For a programming language, if it provides if-then-else and loop, it is Turing complete.  From the above examples, Fluid seems to be Turing complete; however, it is noteworthy to notice that there  is a slight difference between the `if-then-else` of Fluid and that of a programming language. The difference being that the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. This hasn't been researched in depth if this is equivalent to the `if-then-else` in programming languages that makes them Turing-complete.  Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case but this needs to be looked into in-depth.
+
+## The Execution of a Fluid Program
+
+There are two ways to execute a Fluid program.  When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+
+Fluid is moving towards the direction of a compiler, which is explain in more detail later in this article.
+
+## Backward Compatibility of Fluid
+
+Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference.  For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph).  Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators.  The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
+
+For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
+
+## Towards a Deep Learning Language and the Compiler
+
+We can change the `if-then-else` and loop structure a little bit in the above Fluid example programs, to make it into a new programming language, different than Python.
+
+Even if we do not invent a new language, as long as we get the `ProgramDesc` message filled in, we can write a transpiler, which translates each invocation to an operator, into a C++ call to a kernel function of that operator. For example, a transpiler that weaves the CUDA kernels outputs an NVIDIA-friendly C++ program, which can be built using `nvcc`.  Another transpiler could generate MKL-friendly code that should be built using `icc` from Intel.  More interestingly, we can translate a Fluid program into its distributed version of two `ProgramDesc` messages, one for running on the trainer process, and the other one for the parameter server.  For more details of the last example, the [concurrent programming design](concurrent_programming.md) document would be a good pointer.  The following figure explains the proposed two-stage process:
+
+![](fluid-compiler.png)
--- a/doc/design/images/multigpu_allreduce.graffle
+++ b/doc/design/images/multigpu_allreduce.graffle
--- a/doc/design/images/multigpu_allreduce.png
+++ b/doc/design/images/multigpu_allreduce.png
--- a/doc/design/images/multigpu_before_convert.graffle
+++ b/doc/design/images/multigpu_before_convert.graffle
--- a/doc/design/images/multigpu_before_convert.png
+++ b/doc/design/images/multigpu_before_convert.png
--- a/doc/design/kernel_hint_design.md
+++ b/doc/design/kernel_hint_design.md
+## Problem
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
+
+In the current design, we use KernelType to describe one kernel.
+
+```cpp
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+ `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it.
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use.
+
+So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel.
+
+The problem is, how should we define and send the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choice
+1. Do nothing, let the user add the information they want to operator‘s attribute and get them inside `GetExpectedKernelType`, this can work properly. But there is a little problem that users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose CUDNN kernel.
+
+2. Pre-define all the needed option and use a single attr key such as `kernel_hint` for the user, this is not so flexible if the user wants to define some more kind of hint.
+
+### Final choice
+To provide enough flexibility while avoiding confusion definition, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, `use_mkldnn` for a user to choose.
+
+In C++
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+  if (Attr<bool>(kForceCPU)) {
+    return KernelType(CPUPlace, ...)
+  } else {
+    ...
+  }
+}
+```
+
+In Python code
+
+```python
+FORCE_CPU = core.kForceCPU()
+
+def xx_layer(..., force_cpu=false):
+  layer_helper = LayerHelper(...)
+  layer_helper.append_op(
+    type="xx",
+    attr={FORCE_CPU: force_cpu})
+```
--- a/doc/design/mkl/image/engine.png
+++ b/doc/design/mkl/image/engine.png
--- a/doc/design/mkl/image/gradients.png
+++ b/doc/design/mkl/image/gradients.png
--- a/doc/design/mkl/image/layers.png
+++ b/doc/design/mkl/image/layers.png
--- a/doc/design/mkl/image/matrix.png
+++ b/doc/design/mkl/image/matrix.png
--- a/doc/design/mkl/image/overview.png
+++ b/doc/design/mkl/image/overview.png
--- a/doc/design/mkl/mkl_packed.md
+++ b/doc/design/mkl/mkl_packed.md
+# Intel® MKL Packed on PaddlePaddle: Design Doc
+
+
+## Contents
+
+- [Overview](#overview)
+- [Key Points](#key-points) 
+   - [Background](#background)
+   - [Solution](#solution)
+- [Actions](#actions)
+    - [CMake](#cmake)
+	- [Layers](#layers)
+	- [Unit Tests](#unit-tests)
+	- [Python API](#python-api)
+	- [Benchmarking](#benchmarking)
+
+
+## Overview
+我们计划将 Intel® MKL 中引入的 GEMM Packed APIs\[[1](#references)\] 集成到 PaddlePaddle 中，充分发挥英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
+现阶段的优化主要针对 Recurrent Neural Network（以下简称RNN）相关层（包括`RecurrentLayer`, `GatedRecurrentLayer`和`LstmLayer`）， 以及 PaddlePaddle V1 API。
+
+## Key Points
+
+### Background
+目前PaddlePaddle采用了 Intel® MKL库的[cblas_?gemm](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm)函数，这个函数本身会在计算前将原数据转换为更适合英特尔平台的内部格式。
+
+1. 转换耗时 \
+这一数据格式的转换操作（Packing），在问题本身的计算量比较小的时候，显得相对来说较为耗时。例如在DeepSpeech2 \[[2](#references)\] 的Vanilla RNN部分中，矩阵大小是`batch_size * 2048`。
+2. 转换冗余 \
+由于在现有的某些情况下（例如RNN），多次调用 cblas_?gemm 会使用相同的原数据，因此，每次调用时对原数据的重复Packing便成为了冗余。
+
+为了最大程度减少多次调用 cblas_?gemm 在Packing上的耗时，Intel® MKL 引入了以下四个API:
+   * [cblas_?gemm_alloc](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-alloc)
+   * [cblas_?gemm_pack](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-pack)
+   * [cblas_?gemm_compute](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-compute)
+   * [cblas_?gemm_free](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-free)
+
+通过使用这些API，我们可以先完成对原数据的Packing操作，再把已转换为Packed格式的数据传递给那些复用同一数据的gemm_compute函数，从而避免了Packing冗余。
+
+### Solution
+在RNN的情况下，同一次前向、后向（forward/backward）过程中所有时间步（time step）共享同一个权重（weight）。当只做推断（inference）时，各次前向之间也都使用了相同的权重，没有必要在每次前向中每个时间步的计算时对权重进行重复的Packing操作。
+
+我们通过使用新引入的GEMM Packed APIs，在层初始化的时候，先完成对权重的Packing操作，然后在前向，后向时复用已经转换过的权重，并在每次权重更新后，对新的权重进行转换用于下次迭代。
+
+* 优化前，对于序列长度（sequence length）为`T`的网络模型（model）, `N`次迭代执行的转换次数为：
+  - `inference`： `N * T`  
+  - `training`： `2 * N * T`
+* 优化后，对于同样设置的网络模型，其转换次数减少至：
+  - `inference`： `1`    
+  - `training`： `2 * N`
+
+## Actions
+
+添加的相关文件和目录结构如下：
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+└── paddle/
+    ├── ...
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   ├── MKLPackedRecurrentLayer.*
+        |   ├── MKLPackedGatedRecurrentLayer.*
+        |   ├── MKLPackedLstmLayer.*
+        |   └── MKLPackedGemm.h
+        └── tests/
+            ├── ...
+            └── test_MKLPacked.cpp
+```
+
+### CMake
+在对应的`CMakeLists.txt`中根据`WITH_MKL`是否打开，来决定是否开启MKL Packed相关功能。
+
+### Layers
+所有的`MKLPacked*Layer`都继承于PaddlePaddle的基类`Layer`, 并添加头文件 `MKLPackedGemm.h`，该文件对相关GEMM Packed APIs做了封装。
+
+### Unit Tests
+我们会添加`test_MKLPacked.cpp`用于MKL Packed优化后layer的测试。
+对于每一个新加的RNN layer，我们会对比如下2个方面：
+1. 对比优化后layer自身，sequence mode（`rnn_use_batch=false`）与batch mode(`rnn_use_batch=true`)的结果。
+2. 对比优化后layer与相对应的PaddlePaddle原有layer, 在batch mode下的结果。
+
+### Python API
+计划在`paddle/utils.Flags`中添加`use_mkl_packed`的flag，用于选择是否使用相关功能，并且当编译时`WITH_MKL=ON`的情况下，默认设置为`true`。
+
+同时，在`python/paddle/trainer/config_parser.py`中对应的layer处，添加`use_mkl_packed`这个选择，方便用户在Python端选择是否启用这个功能。
+
+具体实现方式比如：
+
+```python
+use_mkl_packed = bool(int(g_command_config_args.get("use_mkl_packed", 0)))
+if use_mkl_packed:
+    self.layer_type = mkl_packed_*
+```
+
+所有相关的`layer_type`会以*mkl_packed_*开头，这些会在`MKLPacked*Layer`注册layer的时候保证，以示区分。 
+
+
+### Benchmarking
+会添加相应的脚本用于测试和对比在使用MKL Packed recurrent layers 前后的网络性能。
+
+## References 
+1. [Introducing the new Packed APIs for GEMM](https://software.intel.com/en-us/articles/introducing-the-new-packed-apis-for-gemm)
+2. [DeepSpeech2 on PaddlePaddle](https://github.com/PaddlePaddle/DeepSpeech#deepspeech2-on-paddlepaddle)
+
--- a/doc/design/mkl/mkldnn.md
+++ b/doc/design/mkl/mkldnn.md
+# Intel® MKL-DNN on PaddlePaddle: Design Doc
+
+我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn)
+(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle，
+充分展现英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
+
+<div align="center">
+<img src="image/overview.png"><br/>
+Figure 1. PaddlePaddle on IA
+</div>
+
+近期目标
+
+- 完成常用Layer的MKL-DNN实现。
+- 完成常见深度神经网络VGG，GoogLeNet 和 ResNet的MKL-DNN实现。
+
+目前的优化，主要针对PaddlePaddle在重构之前的代码框架以及V1的API。
+具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。
+
+## Contents
+
+- [Overview](#overview)
+- [Actions](#actions)
+ 	- [CMake](#cmake)
+ 	- [Matrix](#matrix)
+	- [Layers](#layers)
+	- [Activations](#activations)
+	- [Parameters](#parameters)
+	- [Gradients](#gradients)
+	- [Unit Tests](#unit-tests)
+	- [Python API](#python-api)
+	- [Benchmarking](#benchmarking)
+	- [Others](#others)
+- [Design Concerns](#design-concerns)
+
+## Overview
+
+我们会把MKL-DNN会作为第三方库集成进PaddlePaddle，与其他第三方库一样，会在编译PaddlePaddle的时候下载并编译MKL-DNN。
+
+同时，为了进一步提升PaddlePaddle在基本数学运算的计算速度，我们也将MKLML即(MKL small library\[[1](#references)\])
+作为另一个第三方库集成进PaddlePaddle，它只会包括生成好的动态库和头文件。
+
+MKL，MKLML以及MKL-DNN三者关系如下表：
+
+| Name        |  Open Source     | License     | Descriptions  |
+| :---------- | :--------------- | :---------- | :------------ |
+|   MKL       |     No           | Proprietary | Accelerate math processing routines | 
+|   MKLML     |     No           | Proprietary | Small package of MKL, especially for Machine Learning |
+|   MKL-DNN   |     Yes          | Apache 2.0  | Accelerate primitives processing routines especially for Deep Neural Networks  |
+
+MKLML可以与MKL-DNN共同使用，以此达到最好的性能。
+
+<div align="center">
+<img src="image/engine.png"><br/>
+Figure 2. PaddlePaddle with MKL Engines
+</div>
+
+## Actions
+
+添加的相关文件和目录结构如下：
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+├── cmake/
+│   ├── external/
+│   │   ├── ...
+│   │   ├── mkldnn.cmake
+│   │   └── mklml.cmake
+└── paddle/
+    ├── ...
+    ├── math/
+    │   ├── ...
+    │   └── MKLDNNMatrix.*
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   └── MKLDNN*Layer.*
+        ├── activations/
+        │   ├── ...
+        │   └── MKLDNNActivations.*
+        └── tests/
+            ├── ...
+            ├── MKLDNNTester.*
+            └── test_MKLDNN.cpp
+```
+
+### CMake
+在`CMakeLists.txt`中提供一个与MKL有关的总开关：`WITH_MKL`，它负责决定编译时是否使用MKLML和MKL-DNN
+
+- `WITH_MKLML` 控制是否使用MKLML库。 
+当打开`WITH_MKL`时，会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。
+编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。
+MKLML的库目前都是动态库，主要包括`libiomp5.so`和`libmklml_intel.so`。
+- `WITH_MKLDNN` 控制是否使用MKL-DNN。
+当开启`WITH_MKL`时，会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。
+编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。
+MKL-DNN的库目前只有动态库`libmkldnn.so`。
+
+### Matrix
+目前在PaddlePaddle中数据都是以`NCHW`的格式存储，但是在MKL-DNN中的排列方式不止这一种。
+所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。
+
+<div align="center">
+<img src="image/matrix.png"><br/>
+Figure 3. MKLDNNMatrix
+</div>
+
+### Layers
+所有MKL-DNN的Layers都会继承于`MKLDNNLayer`，该类继承于PaddlePaddle的基类`Layer`。
+在`MKLDNNLayer`中会提供一些必要的接口和函数，并且会写好`forward`和`backward`的基本逻辑，
+子类只需要使用定义好的接口，实现具体的函数功能即可。
+
+<div align="center">
+<img src="image/layers.png"><br/>
+Figure 4. MKLDNNLayer
+</div>
+
+每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix：
+
+- 内部存储（internel memory）：`inVal_`,`inGrad_`,`outVal_`和`outGrad_`，分别代表输入数据，输入梯度，输出数据和输出梯度。
+- 外部存储（external memory）：都是以ext开头，比如`extInVal_`和`extInGrad_`，它们主要是用于，
+当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时，转换内存的工作。
+需要注意的是，PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`，
+所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存，
+如果不需要外部存储用于转换，那么对应的内部存储也会与它们共享内存。
+- 转换函数（resetXXX）： 包括`resetInValue`，`resetInGrad`，`resetOutValue`和`resetOutGrad`，
+表示对输入数据，输入梯度，输出数据和输出梯度的转换。
+这些函数会根据输入参数重新设置内部和外部存储，当然这两者也可以相等，即表示不需要转换。
+
+注意：每个`MKLDNNlayer`的子类只需要使用内部存储就可以了，所有外部的转换工作都会在reset系列函数中都准备好。
+
+### Activations
+在重构前的PaddlePaddle中，激活函数是独立于`Layer`的概念，并且输入输出都是共用一块内存，
+所以添加了对应的`MKLDNNActivation`来实现，方式类似于`MKLDNNLayer`。
+
+### Parameters
+对于有参数的层，我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。
+如果存在数据排列格式不一样的情况时，我们会在网络训练之前把格式转换为MKL-DNN希望的格式，
+在训练结束的时候再保存为PaddlePaddle的格式，但是整个训练过程中不需要任何转换。
+这样既使得最终保存的参数格式与PaddlePaddle一致，又可以避免不必要的转换。
+
+### Gradients
+由于MKL-DNN的操作都是直接覆盖的形式，也就是说输出的结果不会在原来的数据上累加，
+这样带来的好处就是不需要一直清空memory，节省了不必要的操作。
+但是注意的是，当网络出现分支且在`backward`的时候，需要累加不同Layer传过来的梯度。
+所以在`MKLDNNlayer`中实现了一个merge的方法，此时每个小分支的`Input Gradient`
+会先临时保存在`MKLDNNMatrix`中，由分支处的Layer负责求和，并把结果放到当前层的`output_.grad`中。
+所以整体上，在实现每个子类的时候就不需要关心分支的事情了。
+
+<div align="center">
+<img src="image/gradients.png"><br/>
+Figure 5. Merge Gradients
+</div>
+
+### Unit Tests
+我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
+测试分为每个Layer（或Activation）的单元测试和简单网络的整体测试。
+每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果，小于某个比较小的阈值认为通过。
+
+### Python API
+目前只考虑**v1 API**。
+
+计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择，方便用户选择使用MKL-DNN的layers。
+
+具体实现方式比如：
+
+```python
+use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+if use_mkldnn
+    self.layer_type = mkldnn_*
+```
+
+所有MKL-DNN的`layer_type`会以*mkldnn_*开头，这些会在`MKLDNN*Layer`注册layer的时候保证，以示区分。 
+
+同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
+
+### Benchmarking
+会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image)，用于测试和对比在使用MKL-DNN前后的CNN网络性能。
+测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md)
+
+### Others
+1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为4096，具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。
+2. 深入PaddlePaddle，寻找有没有其他可以优化的可能，进一步优化。比如可能会用OpenMP改进SGD的更新性能。
+
+## Design Concerns
+
+为了更好的符合PaddlePaddle的代码风格\[[3](#references)\]，同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]。
+
+我们总结出一些特别需要注意的点：
+
+1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，
+我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MKLDNNLayer`特有的设备ID。
+2. 重写父类Layer的**init**函数，修改`deviceId_`为`-2`，代表这个layer是用于跑在MKL-DNN的环境下。
+3. 创建`MKLDNNBase`，定义一些除了layer和memory相关的类和函数。
+包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`，和未来可能还会用到`FPGAEngine`等。
+4. 如果MKL-DNN layer的后面接有cpu device，那么就会使`output_.value`与`extOutVal_`共享内存，
+同时数据格式就是`NCHW`，这样下一个cpu device就能拿到正确的数据。
+在有普通的CPU layer时， `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。
+
+## References
+1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。
+主要包括了深度学习相关的数学原语与操作，一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。
+2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。
+目前在PaddlePaddle中，仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。
+3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。
+但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
+4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`，所以不存在这个问题)。
+所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
-# Intel® MKL-DNN on PaddlePaddle: Design Doc
-
-我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle，充分展现英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
-
-我们短期内的基本目标是：
-
- 完成常用layer的MKL-DNN实现。
- 完成常见深度神经网络VGG，GoogLeNet 和 ResNet的MKL-DNN实现。
-
-
-## Contents
-
- [Overview](#overview)
- [Actions](#actions)
- 	- [CMake](#cmake)
-	- [Layers](#layers)
-	- [Activations](#activations)
-	- [Weights](#weights)
-	- [Unit Tests](#unit-tests)
-	- [Protobuf Messages](#protobuf-messages)
-	- [Python API](#python-api)
-	- [Demos](#demos)
-	- [Benchmarking](#benchmarking)
-	- [Others](#others)
- [Design Concerns](#design-concerns)
-
-## Overview
-
-我们会把MKL-DNN作为第三方库集成进PaddlePaddle，整体框架图
-<div align="center">
-<img src="image/overview.png" width=350><br/>
-Figure 1. PaddlePaddle on IA.
-</div>
-
-## Actions
-我们把集成方案大致分为了如下几个方面。
-
-### CMake
-我们会在`CMakeLists.txt`中会给用户添加一个`WITH_MKL`的开关，他是负责`WITH_MKLML`和`WITH_MKLDNN`的总开关。
-
-当打开`WITH_MKL`时，会开启MKLML的功能，作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上，同时会开启MKL-DNN功能。
-
-当关闭`WITH_MKL`时，MKLML和MKL-DNN功能会同时关闭。
-
-所以，我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件，它们会在编译PaddlePaddle的时候下载对应的软件包，并放到PaddlePaddle的third party目录中。
-
-### Layers
-所有MKL-DNN相关的C++ layers，都会按照PaddlePaddle的目录结构存放在
-`paddle/gserver/layers`中，并且文件名都会一以*MKLDNN*开头。
-
-所有MKL-DNN的layers都会继承于一个叫做`MKLDNNLayer`的父类，该父类继承于PaddlePaddle的基类`Layer`。
-
-在`MKLDNNLayer`中会提供一些必要的接口和函数，并且会写好`forward`和`backward`的基本逻辑。部分函数定义为纯虚函数，子类只需要实现这些函数即可。
-
-### Activations
-由于在PaddlePaddle中，激活函数是独立于layer概念的，所以会在`paddle/gserver/activations`目录下添加`MKLDNNActivation.h`和`MKLDNNActivation.cpp`文件用于定义和使用MKL-DNN的接口。
-
-### Weights
-由于有些layer是含有参数的，我们会尽量让MKL-DNN的参数与PaddlePaddle中`parameter`共享一块内存。
-同时，由于MKL-DNN在训练时使用的参数layout可能与PaddlePaddle默认的`nchw`不一致，我们会在网络训练的开始和结束时分别转换这个layout，使得最终保存的参数格式与PaddlePaddle一致。
-
-### Unit Tests
-会在`paddle/gserver/test`目录下添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
-测试分为每个layer(或activation)的单元测试和简单网络的整体测试。
-每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果，小于某个比较小的阈值认为通过。
-
-### Protobuf Messages
-根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。
-
-### Python API
-目前只考虑**v1 API**。
-
-计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择，方便用户选择使用MKL-DNN的layers。
-
-具体实现方式比如：
-
-```python
-use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-if use_mkldnn
-    self.layer_type = mkldnn_*
-```
-
-所有MKL-DNN的layer type会以*mkldnn_*开头，以示区分。 
-
-并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py `和`layers.py`里面添加必要的MKL-DNN的接口。
-
-### Demos
-
-会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹，里面放入一些用于MKL-DNN测试的demo脚本。
-
-### Benchmarking
-会添加`benchmark/paddle/image/run_mkldnn.sh`，用于测试使用MKL-DNN之后的性能。
-
-### Others
-1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为64。
-2. 深入PaddlePaddle，寻找有没有其他可以优化的可能，进一步优化。比如可能会用OpenMP改进SGD的更新性能。
-
-## Design Concerns
-
-为了更好的符合PaddlePaddle的代码风格\[[2](#references)\]，同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\]。
-
-我们总结出一些特别需要注意的点：
-
-1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MKLDNNLayer`特有的设备ID。
-2. 重写父类Layer的**init**函数，修改`deviceId_`为`-2`，代表这个layer是用于跑在MKL-DNN的环境下。
-3. 创建`MKLDNNMatrix`，同时继承`CpuMatrix`和`mkldnn::memory`。用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。
-4. 创建`MKLDNNBase`，定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`，和未来可能还会用到`FPGAEngine`等。
-5. 每个`MKLDNNlayer`都会有`inVal_`,`inGrad_`,`outVal_`和`outGrad_`，分别代表input value， input gradient，output value和output gradient。他们会存放MKL-DNN用到的internal memory。同时还会定义以*ext*开头的`MKLDNNMatrix`(表示external的memory)，主要是在格式与PaddlePaddle默认的`nchw`格式不匹配时，用于转换内存的工作。必要的转换函数也会在`MKLDNNLayer`中提前定义好，每个子类只需要调用定义好的reset buffer函数即可。
-6. 每个`MKLDNNlayer`的resetbuffer相关的函数（包括reset input、output的Value和grad），他们会根据输入参数reset internal和external的memory，当然这两者也可以相等，即表示不需要转换。只需要把握一个原则，每个`MKLDNNlayer`的子类，只需要使用internal的memory就可以了，所有external的转换工作在父类的reset函数中都提前准备好了。
-7. 一般来说，external的memory会尽量与PaddlePaddle中的`value`和`grad`共享内存。同时每个`MKLDNNLayer`中的external output value和gradient(也就是`extOutVal_`和`extOutGrad_`)必须分别与`output_.value`和`output_.grad`共享内存，因为PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`。如果不需要external的buffer用于转换，那么internal的buffer也会与他们共享内存。
-8. 如果MKL-DNN layer的后面接有cpu device，那么就会使`output_.value`与`extOutVal_`共享内存，同时数据格式就是`nchw`，这样下一个cpu device就能拿到正确的数据。在有cpu device的时候，external的memory的格式始终是`nchw`或者`nc`。
-9. 由于MKL-DNN的输出操作都是覆盖data的，不是在原来的数据上累加，所以当网络出现分支时，在`backward`时会需要merge不同layer的梯度。`MKLDNNlayer`中会实现merge的方法，此时每个小分支的input gradient会先临时保存在一个`MKLDNNMatrix`中，由分支处的layer负责求和，并把结果放到这个layer的`output_.grad`中。所以整体上，每个子类并不会需要关心分支的事情，也是在父类都实现好了。
-10. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
-
-## References
-
-1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN")
-2. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
-3. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`，所以不存在这个问题)，所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
-
--- a/doc/design/mkldnn/image/overview.png
+++ b/doc/design/mkldnn/image/overview.png
--- a/doc/design/operator_kernel_type.md
+++ b/doc/design/operator_kernel_type.md
+# Design Doc: The Keys of Operator Kernel Type
+## Problem
+An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique Kernel. Before an operator runs, an certain kernel must be chosen by a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  proto::DataType data_type_;
+};
+```
+For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github.
+
+It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys are not enough. We need a more complete representation of `OpKernelType`. 
+
+We often implement a kernel of an operator with some computing library in certain device(place). Please remind that computing library and device are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. 
+
+For example, Eigen library can support Nvidia GPU/AMD GPU/CPU. And MKLDNN library can support Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`.
+
+It's obvious that different DataTypes, like fp64/fp32/int8 will have different kernels. But the data layout of a Tensor will also lead to different implementation. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). Data Layout should also be taken into consideration.
+
+## Solution
+
+There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  platform::Library library_;
+  proto::DataType data_type_;
+  framework::Layout layout_;
+};
+```
+
+Following is the details:
+
+### Place
+
+`Place` is defined as follows:
+
+```cpp
+typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
+```
+
+`Place` is to represent the device memory where data is locating.
+
+
+### Library
+
+One operator kernel is usually implemented based on one library. `Library` is defined as a enum variable:
+
+```cpp
+enum Library { Plain, MKLDNN, CUDNN };
+```
+
+We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on `Eigen` library, we take `Eigen` library as the `Plain` enumerator.
+A library usually has a corresponding `DeviceContext` which contains some handles needed by computation. Fluid now have two default DeviceContexts in CPU and CUDA, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains a Eigen library handle and `CDUADeviceContext` contains a Eigen library handle and cuBLAS handle.
+
+If we want to support new Library, a new enumerator need to be added to `Library` and a new corresponding `LibraryDeviceContext` will be created.
+
+
+### DataType
+
+
+`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
+
+### Layout
+
+Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout.
+
+Different layout leads to different implementation of operator kernel. There are mainly 4 principles we have to follow to support layout in our fluid framework.
+
+- We take layout as a data member of Tensor. Layout is actually a enum variable. If fluid is built with MKLDNN, then, the memory format in MKLDNN will be added into this enum variable too.
+
+- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout of generating data. Of course, we can have some default layout, like NCHW.
+
+- The inference of Layout is at run-time, not compile-time.
+
+- Every operator have to implement different kernels for different layouts. Let's take MKLDNN as an example, if we want to implement a MKLDNN convolution operator, we have to realize all the kernels for different layout, list at [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to do registering kernels for MKLDNN operators.
+
+`Layout` is also defined as a enum variable:
+
+```cpp
+enum Layout {
+  kNCHW,
+  kNHWC,
+#ifdef PADDLE_WITH_MKLDNN
+  knChw8c
+  ...
+#endif
+};
+```
--- a/doc/design/paddle_nccl.md
+++ b/doc/design/paddle_nccl.md
+# Design Doc: NCCL support in Paddle Fluid
+
+## Abstract
+
+This Design Doc refers to the NCCL feature in  paddle.  We propose an approach to support NCCL library both on a single machine and multiple machines. We wrapper the NCCL primitives `Broadcast`, `Allreduce`, `Reduce` as operators to utilize Multi-GPU powers in one script.
+
+
+## Motivation
+
+[NCCL](https://developer.nvidia.com/nccl) is a NVIDIA library support Multi-GPU communicating and optimized for NVIDIA GPUs, it provides routines such as all-gather, all-reduce, broadcast, reduce, reduce-scatter, that can achieve high bandwidth over PCIe and NVLink high-speed interconnect. With NCCL library, we can easily accelerate the training in parallel. 
+
+- Pros
+1. easily plug-in with [NCCL2](https://developer.nvidia.com/nccl) library.
+1. high performance in NVIDIA GPUs.
+1. MPI like primitives, which have low learning cost for users.
+
+- Cons
+1. Only design for NVIDIA GPUs, not a general multi-device solution.
+1. Although NCCL1 is opensourced under BSD license, but NCCL2 is not opensourced anymore.
+
+At the beginning of training, the framework needs to distribute the same parameters to every GPU, and merge the gradients at any time user interests.
+
+As a result, during training, we need the operations of peer to peer copy between different GPUs, aggregating gradients/parameters from GPUs, and broadcasting parameters to GPUs. Every GPU only need to run the operator with correct place information.
+
+Besides, it needs interfaces to synchronize model update with each different GPU Cards. 
+
+## Implementation
+
+As mentioned above, we wrap the NCCL routines as several kinds of operators. Need to note that NCCL need to create Communicator between gpu at the beginning, so there is a NCCLInit operator created.
+
+### Transpiler
+
+To be compatible with [parameter server design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md), the transpiler compiles the user defined operation graph into sub-graphs to be executed on different devices.
+
+1. The user-defined model will be a single device program
+
+2. Broadcast/Reduce operators between GPUs will be inserted into the program, even for the multi-node, may insert the `Send`, `Recv` operator.
+
+   *Broadcast, AllReduce in a single machine. And Broadcast, AllReduce, [Send, Recv](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md#graph-converter) in multiple machines*
+
+   <img src="images/multigpu_before_convert.png" width="300"/>
+
+After compiling, the graph as shows
+
+<img src="images/multigpu_allreduce.png" width="1000"/>
+
+Operators are added to the sub-graphs. Every GPU assigned a role of `rank0`, `rank1` etc. 
+
+- **Broadcast**. Broadcast operator distribute initialized parameter to all the GPUs from the GPU who owns it. e.g. from`rank0` GPU.
+- **AllReduce**. AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce implemented in the Ring-Based  communicating method, avoid of the bottle neck in a single GPU.
+
+Need to notice that AllReduce operator force GPUs synchronized at that point. The whole training process in asynchronous or synchronous mode depends on the AllReduce point in the graph.
+
+As it shown in the picture, when each GPU compute the gradient of `W`, followed with a `AllReduce` operator, accumulate the `dW` to full batch of data, then run the optimize process individually and apply the gradient to its `W`.
+
+- **AllReduce**
+  Need to note that our AllReduce operator is a ring-base AllReduce implementation. If we use the NCCL2 AllReduce primitive, every GPU optimized full batch of data, wasted (n-1) GPU compute resources. In addition, NCCL2 built-in AllReduce will only utilize the communicating resource during synchronization, then update the gradient will be a subsequent phase. In fact, we can amortize the update gradient time cost into the communicating phase. The process is
+1. Every parameter has its root card. That card will responsible for aggregating the gradients from GPUs.
+2. The whole model's parameter will be hashed to different root card, ensure the load balance between GPUs.
+3. Logically neighberhood card will start send parameter to the next one. After one round, the parameter main card will aggregate the full gradients.
+4. Then the root card will optimize the parameter.
+5. This parameter card will send its optimized result to its neighberhood, then the neighberhood will send parameter to its next one.
+6. Finish the sychronization round.
+
+The total time cost will be 2 * (n-1) * per-parameter-send-time, we reach the goal of amortize the upgrade time into communicating phase.
--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/design/refactor/distributed_architecture.md
@@ -53,7 +53,7 @@ The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the
 The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.

 This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
-[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
+[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md)

 ## Distributed Training Architecture


--- a/doc/design/refactor/multi_cpu.md
+++ b/doc/design/refactor/multi_cpu.md
+# Design Doc: Execute the Program with Multi CPU
+
+## Abstract
+
+This Design Doc propose an approach to make the user-defined Op graph
+running with multi-CPU, we will use an auto transpiler to convert the user-defined
+Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph.
+
+## Transpiler
+
+<img src="src/multi-threads/single-thread@3x.png" width="300">
+
+After converted:
+
+<img src="src/multi-threads/multi-threads@3x.png" width="1000">
+
+## Implement
+
+- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph
+  which would be executed with multi-threads.
+- `BlockingCounter` will `Init/Decrement` an atomic counter, and Blocking `Wait`
+  for the atomic counter become `0`:
+  ```cpp
+  BlockingCounter bc(thread_count);
+  for (int i = 0; i < thread_count; ++i) {
+    thread_pool->Start([&bc] {bc.DecrementCount(); })
+  }
+  bc.Wait();
+  ```
+- `ParallelDo` Operator
+  - Initialize a thread pool which is a Singleton.
+  - Use a block id as the input, and create run the specify Block on independent scope
+    with multi-threads.
+  - Initialize a `BlockingCounter` instance and wait until all threads are done.
+- `Split` Operator will split the Input Tensor into a TensorArray.
+- `Merge` merge all the gradients which calculated in different threads
+  with `mean/sum/max/min...` method, and then run the Optimizer Op to optimize `W`.
+
+## TODO
+
+- Improve the optimizer stage with multi-threads, since we could
+  assign the parameters to the different threads and execute
+  optimizer with multi-threads.
--- a/doc/design/refactor/src/multi-threads.graffle
+++ b/doc/design/refactor/src/multi-threads.graffle
--- a/doc/design/refactor/src/multi-threads/multi-threads@3x.png
+++ b/doc/design/refactor/src/multi-threads/multi-threads@3x.png
--- a/doc/design/refactor/src/multi-threads/single-thread@3x.png
+++ b/doc/design/refactor/src/multi-threads/single-thread@3x.png
--- a/doc/design/releasing_process.md
+++ b/doc/design/releasing_process.md
@@ -5,8 +5,9 @@ PaddlePaddle使用git-flow branching model做分支管理，使用[Semantic Vers
 PaddlePaddle每次发新的版本，遵循以下流程:

 1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
-2. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
-3. 对这个版本的提交，做如下几个操作:
+1. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
+1. 对这个版本的提交，做如下几个操作:
+	* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
 	* 编译这个版本的Docker发行镜像，发布到dockerhub。如果失败，修复Docker编译镜像问题，Patch号加一，返回第二步
 	* 编译这个版本的Ubuntu Deb包。如果失败，修复Ubuntu Deb包编译问题，Patch号加一，返回第二步。
 	* 使用Regression Test List作为检查列表，测试Docker镜像/ubuntu安装包的功能正确性
@@ -20,9 +21,9 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 			pip install twine
 			twine upload dist/[package to upload]
 			```
-4. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
-5. 编译master分支的Docker发行镜像，发布到dockerhub。编译ubuntu的deb包，发布到github release页面
-6. 协同完成Release Note的书写
+1. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
+1. 编译master分支的Docker发行镜像，发布到dockerhub。编译ubuntu的deb包，发布到github release页面
+1. 协同完成Release Note的书写


 需要注意的是:
@@ -30,7 +31,7 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 * `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试PaddlePaddle的行为。
 * 在`release/版本号`分支存在的时候，如果有bugfix的行为，需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。

-# PaddlePaddle 分支规范
+## PaddlePaddle 分支规范

 PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。

@@ -47,11 +48,11 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-

 * BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。

-# PaddlePaddle回归测试列表
+## PaddlePaddle回归测试列表

 本列表说明PaddlePaddle发版之前需要测试的功能点。

-## PaddlePaddle Book中所有章节
+### PaddlePaddle Book中所有章节

 PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。


--- a/doc/design/support_new_device.md
+++ b/doc/design/support_new_device.md
+# Design Doc: Supporting new Device/Library
+
+## Background
+
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
+
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example,Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
+
+On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
+
+So, how to support a new Device/Library in Fluid becomes a challenge.
+
+
+## Basic: Integrate A New Device/Library
+
+For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md).
+
+There are mainly three parts that we have to consider while integrating a new device/library:
+
+- Place and DeviceContext: indicates the device id and manages hardware resources
+
+- Memory and Tensor: malloc/free data on certain device
+
+- Math Functor and OpKernel: implement computing unit on certain devices/libraries
+
+### Place and DeviceContext
+
+
+#### Place
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`.
+
+```
+        |   CPUPlace   --> MKLDNNPlace
+Place --|   CUDAPlace  --> CUDNNPlace
+        |   FPGAPlace
+```
+
+And `Place` is defined as follows:
+
+```
+typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
+```
+
+#### DeviceContext
+
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different hardwares, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+
+
+```
+                /->  CPUDeviceContext   --> MKLDeviceContext
+DeviceContext ---->  CUDADeviceContext  --> CUDNNDeviceContext
+                \->  FPGADeviceContext
+```
+
+An example of Nvidia GPU is as follows:
+
+- DeviceContext
+
+
+```
+class DeviceContext {
+  virtual Place GetPlace() const = 0;
+};  
+```
+
+
+- CUDADeviceContext
+
+
+```
+class CUDADeviceContext : public DeviceContext {
+  Place GetPlace() const override { return place_; }
+private:
+  CUDAPlace place_;
+  cudaStream_t stream_; 
+  cublasHandle_t cublas_handle_;
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
+};
+```
+
+- CUDNNDeviceContext
+
+```
+class CUDNNDeviceContext : public CUDADeviceContext {
+  private:
+    cudnnHandle_t cudnn_handle_;
+};
+```
+
+
+### Memory and Tensor
+
+
+#### memory module
+
+Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36):
+
+```
+template <typename Place>
+void* Alloc(Place place, size_t size);
+
+template <typename Place>
+void Free(Place place, void* ptr);
+
+template <typename Place>
+size_t Used(Place place);
+```
+
+To implementing these interfaces, we have to implement MemoryAllocator for different Devices
+
+
+#### Tensor
+
+[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place.
+
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+ private:
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory.
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data(place);
+```
+
+
+
+### Math Functor and OpKernel
+
+Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors.
+
+Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
+
+The interface is defined in header file.
+
+```
+template <typename DeviceContext, typename T>
+class MaxOutFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* output, int groups);
+};
+```
+
+CPU implemention is in .cc file
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CPUDeviceContext, T> {
+  public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};
+```
+
+CUDA implemention is in .cu file
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};                  
+```
+
+
+We get computing handle from a concrete DeviceContext, and make compution on tensors.
+
+The implemention of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
+
+Fluid provides different register interfaces in op_registry.h
+
+
+Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example:
+
+In .cc file:
+
+```
+REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CPU_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
+```
+
+In .cu file:
+
+```
+REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CUDA_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
+```
+
+
+## Advanced topics: How to switch between different Device/Library
+
+Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
+
+
+We will discuss how to implement an efficient OpKernel switch policy. 
+
+- TBD
--- a/doc/design/switch_kernel.md
+++ b/doc/design/switch_kernel.md
+## Background
+Every operator has many kernels because there are multiple data types, places, data layout that Fluid supports. We use the `KernelType` to describe kernel types that operators can hold. 
+
+The `KernelType` is as follows.
+
+```
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+
+The `place_` is a descriptor of the device and the computational library, e.g., `MKLDNNPlace`, `CUDAPlace`.
+
+The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, it will be a major `data_type`. For example, the `cross_entropy` takes `int64` as it label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float`/`double`.
+
+The `layout` is useful for some computational library. One example is that MKLDNN uses many kinds of layout, such as `nChw8c`. Each kind of layout will invoke the different kernel.
+
+## Problem
+
+We register a kernel for every operator and every kernel type ideally. However, it is impracticable for the following situations.
+
+1. Some operators, like CRF, are complicated and inefficient to be implemented on GPU. The CRF operator will only have a CPU kernel.
+2. Some operators will take too many memory. It is better to force them into CPU. However, the rest of operators in this neural network will be performed on GPU, i.e., model parallel problem.
+3. Some layout and place are particular. One example is that MKLDNN uses `nChw8` and there is no other library uses `nChw8c`.
+
+Problems under these situations are similar. We can formalise this problem as follow.
+
+We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, which the $kt_{?} \notin KT$. How to cast the input of this operator from $kt_{?}$ to any of kernel type in $KT$.
+
+## Solution
+
+It is clearly that transforming inputs of an operator toadapt another kernel type is not related to the particular operator. So we should register these transformation methods as global methods.
+
+We can infer a kernel type from the inputs of an operators. We let this kernel type as `actual kernel type`, which means this kernel type is the actually kernel type that operator should be performed.
+
+We can get a kernel type by 1) The configuration of operator description. (Users may want to force use `MKL` for `conv` operator). 2) The place of the current executor. (Executor is running on GPU). This kernel type is what we expect the operator will be performed on. We let this kernel type as `expect kernel type`.
+
+We transform the input data from `actual` to `expect` if the expect kernel type is not as same as actual kernel type.
+
+The algorithm is described as follow
+
+```cpp
+using DataTransformationFN = std::function<void(const Tensor& in, Tensor* out)>;
+using KernelTypePair = std::pair<KernelType, KernelType>;
+
+map<KernelTypePair, DataTransformationFN> g_data_transformation_;
+
+void OpWithKernel::Run() {
+  vec<Tensor> inputs = ...
+  auto actual_kernel_type = GetActualKernelType(inputs);
+  
+  // The expected kernel type is related to actual kernel type.
+  // For the most operators, the expected kernel type is as same as
+  // actual kernel type.
+  //
+  // So we pass `actual_kernel_type` as a parameter of 
+  // GetExpectedKernelType
+  auto expect_kernel_type = GetExpectedKernelType(actual_kernel_type);
+  
+  auto trans = g_data_transformation_[{actual_kernel_type, expect_kernel_type}];
+  
+  kernel.run(trans(inputs));
+}
+```
--- a/doc/faq/build_and_install/index_cn.rst
+++ b/doc/faq/build_and_install/index_cn.rst
@@ -14,7 +14,7 @@

    $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu

 更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_ 。


--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -19,7 +19,7 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
   git clone https://github.com/PaddlePaddle/Paddle.git
   cd Paddle
   # 如果使用Docker编译环境，执行下面的命令编译CPU-Only的二进制
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
   # 如果不使用Docker编译环境，执行下面的命令
   mkdir build
   cd build
@@ -30,7 +30,7 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译

 .. code-block:: bash

-   pip install python/dist/*.whl
+   pip install build/python/dist/*.whl


 .. _run_test:
@@ -45,7 +45,7 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译

 .. code-block:: bash

-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh

 如果不使用Docker，可以执行ctest命令即可：

@@ -70,13 +70,13 @@ PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其
   :header: "依赖", "版本", "说明"
   :widths: 10, 15, 30

-   "CMake", ">=3.5", ""
+   "CMake", ">=3.2", ""
   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
-   "Python", "2.7.x", "依赖libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
   "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "可选"
+   "Go", ">=1.8", "可选"


 .. _build_options:

--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -21,7 +21,7 @@ Then run:
   git clone https://github.com/PaddlePaddle/Paddle.git
   cd Paddle
   # run the following command to build a CPU-Only binaries if you are using docker
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
   # else run these commands
   mkdir build
   cd build
@@ -34,7 +34,7 @@ machine or copy it to the target machine.

 .. code-block:: bash

-   pip install python/dist/*.whl
+   pip install build/python/dist/*.whl


 .. _run_test:
@@ -49,7 +49,7 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.

 .. code-block:: bash

-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh

 If you don't use Docker, just run ctest will start the tests:

@@ -76,13 +76,13 @@ will be downloaded automatically.
   :header: "Dependency", "Version", "Description"
   :widths: 10, 15, 30

-   "CMake", ">=3.5", ""
+   "CMake", ">=3.2", ""
   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
-   "Python", "2.7.x", "Need libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
   "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "Optional"
+   "Go", ">=1.8", "Optional"


 .. _build_options:
@@ -117,7 +117,7 @@ You can add :code:`-D` argument to pass such options, like:
    "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
    "WITH_STYLE_CHECK", "Check code style when building", "ON"
    "WITH_TESTING", "Build unit tests", "ON"
-    "WITH_DOC", "Build documentaions", "OFF"
+    "WITH_DOC", "Build documentations", "OFF"
    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
    "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"

--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -114,7 +114,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note

  .. code-block:: bash

-     nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash

 **注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**

@@ -122,13 +122,13 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note

     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-     docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu

 **关于AVX：**

 AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
 是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
-`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。
+`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。

 以下指令能检查Linux电脑是否支持AVX：


--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -122,7 +122,7 @@ GPU driver installed before move on.

  .. code-block:: bash

-     nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash

 **NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**

@@ -130,14 +130,14 @@ GPU driver installed before move on.

     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-     docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu

 **About AVX:**

 AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations.
 The latest PaddlePaddle Docker image turns AVX on by default, so, if your
 computer doesn't support AVX, you'll probably need to
-`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`.
+`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`.

 The following command will tell you whether your computer supports AVX.


--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -13,7 +13,7 @@ PaddlePaddle提供pip和Docker的安装方式：

   pip_install_cn.rst
   docker_install_cn.rst
-
+   ../../howto/dev/build_cn.md

 编译流程
 ++++++++

--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
@@ -13,6 +13,7 @@ You can choose either pip or Docker to complete your install:

   pip_install_en.rst
   docker_install_en.rst
+   ../../howto/dev/build_en.md


 Build from Source

--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@@ -34,14 +34,14 @@ PaddlePaddle可以使用常用的Python包管理工具
   :align: center

 ..  csv-table:: 各个版本最新的whl包
-    :header: "版本说明", "cp27-cp27mu", "cp27-cp27mu", "C-API"
+    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
    :widths: 1, 3, 3, 3

-    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"

 .. _pip_dependency:

@@ -83,4 +83,4 @@ PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.or

  获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包可以在 `这个 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 链接中找到。

-  如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新； 如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64 ，可以重命名这个whl包为 manylinux1_x86_64 再安装。
\ No newline at end of file
+  如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新； 如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64 ，可以重命名这个whl包为 manylinux1_x86_64 再安装。
--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -37,14 +37,14 @@ If the links below shows up the login form, just click "Log in as guest" to star
   :align: center

 ..  csv-table:: whl package of each version
-    :header: "version", "cp27-cp27mu", "cp27-cp27mu", "C-API"
+    :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
    :widths: 1, 3, 3, 3

-    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"

 .. _pip_dependency:


--- a/doc/getstarted/concepts/src/infer.py
+++ b/doc/getstarted/concepts/src/infer.py
+import paddle.v2 as paddle
+import numpy as np
+
+paddle.init(use_gpu=False)
+x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
+y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+# loading the model which generated by training
+with open('params_pass_90.tar', 'r') as f:
+    parameters = paddle.parameters.Parameters.from_tar(f)
+
+# Input multiple sets of data，Output the infer result in a array.
+i = [[[1, 2]], [[3, 4]], [[5, 6]]]
+print paddle.infer(output_layer=y_predict, parameters=parameters, input=i)
+# Will print:
+# [[ -3.24491572]
+#  [ -6.94668722]
+#  [-10.64845848]]
--- a/doc/getstarted/concepts/src/train.py
+++ b/doc/getstarted/concepts/src/train.py
@@ -26,6 +26,11 @@ def event_handler(event):
        if event.batch_id % 1 == 0:
            print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id,
                                                  event.cost)
+    # product model every 10 pass
+    if isinstance(event, paddle.event.EndPass):
+        if event.pass_id % 10 == 0:
+            with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
+                trainer.save_parameter_to_tar(f)


 # define training dataset reader

--- a/doc/getstarted/concepts/use_concepts_cn.rst
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
@@ -147,4 +147,9 @@ PaddlePaddle支持不同类型的输入数据，主要包括四种类型，和
 ..  literalinclude:: src/train.py
    :linenos:

+使用以上训练好的模型进行预测，取其中一个模型params_pass_90.tar，输入需要预测的向量组，然后打印输出：
+
+..  literalinclude:: src/infer.py
+    :linenos:
+
 有关线性回归的实际应用，可以参考PaddlePaddle book的 `第一章节 <http://book.paddlepaddle.org/index.html>`_。
--- a/doc/howto/dev/build_cn.md
+++ b/doc/howto/dev/build_cn.md
-# 编译PaddlePaddle和运行单元测试
+# 用Docker编译和测试PaddlePaddle

 ## 需要的软硬件


--- a/doc/howto/dev/build_en.md
+++ b/doc/howto/dev/build_en.md
-# Build PaddlePaddle from Source Code and Run Unit Test
+# Build using Docker

 ## What Developers Need


--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -76,18 +76,18 @@ no changes added to commit (use "git add" and/or "git commit -a")

 ## 构建和测试

-编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:dev`来代替。
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:latest-dev`来代替。

 如要build这个开发镜像，在源码目录树的根目录中运行：

 ```bash
-➜  docker build -t paddle:dev .
+➜  docker build -t paddle:latest-dev .
 ```

 随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：

 ```bash
-➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
+➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev
 ```

 这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`，并且输出一个 `./build/paddle.deb`文件之外，还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*（`paddle:prod`）：
@@ -99,7 +99,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 如果要运行所有的单元测试，可以用如下命令：

 ```bash
-➜  docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
 ```

 关于构建和测试的更多信息，请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。

--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
 # 如何写新的Operator

 - [概念简介](#概念简介)
- - [实现C++类](#实现C++类)
-   - [定义ProtoMaker类](#定义ProtoMaker类)
-   - [定义Operator类](#定义Operator类)
-   - [定义OpKernel类](#定义OpKernel类)
-   - [注册Operator](#注册Operator)
+ - [实现C++类](#实现c类)
+   - [定义ProtoMaker类](#定义protomaker类)
+   - [定义Operator类](#定义operator类)
+   - [定义OpKernel类](#定义opkernel类)
+   - [注册Operator](#注册operator)
   - [编译](#编译)
- - [绑定Python](#绑定Python)
+ - [绑定Python](#绑定python)
 - [实现单元测试](#实现单元测试)
-   - [前向Operator单测](#前向Operator单测)
-   - [反向Operator单测](#反向Operator单测)
+   - [前向Operator单测](#前向operator单测)
+   - [反向Operator单测](#反向operator单测)
   - [编译和执行](#编译和执行)
+ - [注意事项](#注意事项)


 ## 概念简介
@@ -30,8 +31,8 @@
 --------------  | :----------------------
 OpProtoMake定义  | `.cc`文件，Backward Op不需要定义OpProtoMake
 Op定义           | `.cc`文件
-Kernel实现       | CPU、GPU共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，GPU 实现在`.cu`文件中。
-注册Op           | Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，GPU实现在`.cu`文件中
+Kernel实现       | CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。
+注册Op           | Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中


 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
@@ -43,7 +44,7 @@ Kernel实现       | CPU、GPU共享Kernel实现在`.h`文件中，否则，CPU
 ## 实现C++类


-### 1. 定义ProtoMaker类
+### 定义ProtoMaker类

 矩阵乘法的公式：$Out = X * Y$, 可见该计算由两个输入，一个输出组成。

@@ -52,7 +53,7 @@ Kernel实现       | CPU、GPU共享Kernel实现在`.h`文件中，否则，CPU
 ```cpp
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
@@ -81,7 +82,7 @@ The equation is: Out = X * Y
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of scale operator.").NotInGradient();
    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
@@ -100,7 +101,7 @@ The equation is: Out = scale*X
 - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。


-### 2. 定义Operator类
+### 定义Operator类

 下面的点实现了MulOp的定义：

@@ -149,11 +150,11 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,

 通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中，和下面将要介绍的注册函数一起放在`.cc`中

-### 3. 定义OpKernel类
+### 定义OpKernel类

 `MulKernel`继承自`framework::OpKernel`，带有下面两个模板参数:

- `typename  Place`: 表示设备类型，不同设备(CPU、GPU)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+- `typename DeviceContext`: 表示设备类型，不同设备(CPU、CUDA)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。

 - `typename T` : 表示数据类型，如`float`, `double`等。

@@ -165,7 +166,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 下面是 `MulKernel` `Compute`的实现：

  ```cpp
-  template <typename Place, typename T>
+  template <typename DeviceContext, typename T>
  class MulKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -173,33 +174,32 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
    auto* Y = context.Input<Tensor>("Y");
    auto* Z = context.Output<Tensor>("Out");
    Z->mutable_data<T>(context.GetPlace());
-    auto* device_context =
-        const_cast<platform::DeviceContext*>(context.device_context_);
-    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+    auto& device_context = context.template device_context<DeviceContext>();
+    math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
  }
  };
  ```

-需要注意：**不同设备(CPU、GPU)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**
+需要注意：**不同设备(CPU、CUDA)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**

-`MulOp`的CPU、GPU实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。

-为了使`OpKernel`的计算过程书写更加简单，并且CPU、GPU的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。
+为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。


 到此，前向Op实现完成。接下来，需要在`.cc`文件中注册该op和kernel。
 反向Op类的定义，反向OpKernel的定义与前向Op类似，这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。

-### 4. 注册Operator
+### 注册Operator

 - 在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。

    ```cpp
    namespace ops = paddle::operators;
    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
-    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
    REGISTER_OP_CPU_KERNEL(mul_grad,
-                  ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
    ```

   在上面的代码中：
@@ -209,20 +209,20 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。


- 在 `.cu`文件中注册GPU Kernel。
-    - 请注意，如果GPU Kernel的实现基于Eigen unsupported模块，那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`，代码示例如下：
+- 在 `.cu`文件中注册CUDA Kernel。
+    - 请注意，如果CUDA Kernel的实现基于Eigen unsupported模块，那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`，代码示例如下：

    ```cpp
    // if use Eigen unsupported module before include head files
-    // #define EIGEN_USE_GPU
+    #define EIGEN_USE_GPU

    namespace ops = paddle::operators;
-    REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
-    REGISTER_OP_GPU_KERNEL(mul_grad,
-                           ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+    REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
+    REGISTER_OP_CUDA_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
    ```

-### 5. 编译
+### 编译

 运行下面命令可以进行编译：

@@ -236,71 +236,57 @@ make mul_op

 ## 实现单元测试

-单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
+单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。

-### 前向Operator单元测试
+### 前向Operator单测

-前向Op单元测试继承自`unittest.TestCase`，并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator，需要：
+Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator，需要：

 1. 在`setUp`函数定义输入、输出，以及相关的属性参数。
 2. 生成随机的输入数据。
 3. 在Python脚本中实现与前向operator相同的计算逻辑，得到输出值，与operator前向计算的输出进行对比。
+4. 反向计算已经自动集成进测试框架，直接调用相应接口即可。


  ```python
  import unittest
  import numpy as np
-  from gradient_checker import GradientChecker, create_op
-  from op_test_util import OpTestMeta
+  from op_test import OpTest

-  class TestMulOp(unittest.TestCase):
-      __metaclass__ = OpTestMeta

+  class TestMulOp(OpTest):
      def setUp(self):
-          self.type = "mul"
+          self.op_type = "mul"
          self.inputs = {
              'X': np.random.random((32, 84)).astype("float32"),
              'Y': np.random.random((84, 100)).astype("float32")
          }
          self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
-  ```
-
-上面的代码首先导入依赖的包，下面是对`setUp`函数中操作的重要变量的详细解释：
-
- `self.type = "mul" ` : 定义类型，与operator注册时注册的类型一致。
- `self.inputs` : 定义输入，类型为`numpy.array`，并初始化。
- `self.outputs` : 定义输出，并在Python脚本中完成与operator同样的计算逻辑，返回Python端的计算结果。

+      def test_check_output(self):
+          self.check_output()

-### 反向Operator单元测试
+      def test_check_grad_normal(self):
+          self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)

-反向Op单元测试继承自`GradientChecker`，而`GradientChecker`继承自`unittest.TestCase`，因此，**反向单元测试函数需要以`test_`开头**。
+      def test_check_grad_ingore_x(self):
+          self.check_grad(
+              ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))

-```python
-class TestMulGradOp(GradientChecker):
-    def setUp(self):
-        self.op = create_op("mul")
-        self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
-            'Y': np.random.random((84, 100)).astype("float32")
-        }
-
-    def test_check_grad_normal(self):
-        # mul op will enlarge the relative error
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+      def test_check_grad_ingore_y(self):
+          self.check_grad(
+              ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+  ```

-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+上面的代码首先导入依赖的包，下面是对`setUp`函数中操作的重要变量的详细解释：

-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-```
+- `self.op_type = "mul" ` : 定义类型，与operator注册时注册的类型一致。
+- `self.inputs` : 定义输入，类型为`numpy.array`，并初始化。
+- `self.outputs` : 定义输出，并在Python脚本中完成与operator同样的计算逻辑，返回Python端的计算结果。

-下面解释代码中一些关键的地方:
+### 反向operator单测

- 调用`create_op("mul")`创建反向Op对应的前向Op。
+而反向测试中：
 - `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
  - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
  - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
@@ -308,7 +294,7 @@ class TestMulGradOp(GradientChecker):
 - `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。


-### 编译和执行单元测试
+### 编译和执行

 `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。

@@ -328,5 +314,5 @@ ctest -R test_mul_op

 - 为每个Op创建单独的`*_op.h`（如有）、`*_op.cc`和`*_op.cu`（如有）。不允许一个文件中包含多个Op，这将会导致编译出错。
 - 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OP(B, ...)`等，这将会导致单元测试出错。
- 如果Op没有实现GPU Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
+- 如果Op没有实现CUDA Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
 - 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
 # How to write a new operator

 - [Background](#background)
- - [Implementing C++ Types](#implementing-c++-types)
-   - [Defining ProtoMaker](#defining-protoMaker)
+ - [Implementing C++ Types](#implementing-c-types)
+   - [Defining ProtoMaker](#defining-protomaker)
   - [Defining Operator](#defining-operator)
   - [Registering Operator](#registering-operator)
   - [Compilation](#compilation)
@@ -28,8 +28,8 @@ An operator can be differentiated by whether in has kernel methods. An operator
 --------------  | :----------------------
 OpProtoMake definition  | `.cc`files, Backward Op does not need an OpProtoMake interface.
 Op definition           | `.cc` files
-Kernel implementation       | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu`files.
-Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation.
+Kernel implementation       | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
+Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.


 New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. **
@@ -41,7 +41,7 @@ Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePadd
 ## Implementing C++ Types


-### 1. Defining Class ProtoMaker
+### Defining ProtoMaker

 Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output.

@@ -50,7 +50,7 @@ First, define `ProtoMaker` to describe the Operator's input, output, and additio
 ```cpp
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
@@ -79,7 +79,7 @@ An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/de
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of scale operator.").NotInGradient();
    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
@@ -98,7 +98,7 @@ There are two changes in this example:
 - `AddAttr<AttrType>("scale", "...").SetDefault(1.0);`  adds `scale`constant as an attribute, and sets the default value to 1.0.


-### 2. Defining Operator
+### Defining Operator

 The following code defines the interface for MulOp:

@@ -147,11 +147,11 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,

 Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later.

-### 3. Defining OpKernel
+### Defining OpKernel

 `MulKernel` inherits `framework::OpKernel`, which includes the following templates:

- `typename  Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+- `typename  DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).

 - `typename T` denotes data type, such as `float` or `double`.

@@ -163,7 +163,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
 `MulKernel`'s implementation of `Compute` is as follows:

  ```cpp
-  template <typename Place, typename T>
+  template <typename DeviceContext, typename T>
  class MulKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -171,16 +171,15 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
    auto* Y = context.Input<Tensor>("Y");
    auto* Z = context.Output<Tensor>("Out");
    Z->mutable_data<T>(context.GetPlace());
-    auto* device_context =
-        const_cast<platform::DeviceContext*>(context.device_context_);
-    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+    auto& device_context = context.template device_context<DeviceContext>();
+    math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
  }
  };
  ```

-Note that **different devices (CPU, GPU)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
+Note that **different devices (CPU, CUDA)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**

-`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).

 To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).

@@ -189,16 +188,16 @@ This concludes the forward implementation of an operator. Next its operation and

 The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**.

-### 4. Registering Operator
+### Registering Operator

 - In `.cc` files, register forward and backward operator classes and the CPU kernel.

    ```cpp
    namespace ops = paddle::operators;
    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
-    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
    REGISTER_OP_CPU_KERNEL(mul_grad,
-                  ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
    ```

   In that code block,
@@ -208,20 +207,20 @@ The definition of its corresponding backward operator, if applicable, is similar
    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.


- Registering GPU Kernel in `.cu` files
-    - Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
+- Registering CUDA Kernel in `.cu` files
+    - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as

    ```cpp
    // if use Eigen unsupported module before include head files
    #define EIGEN_USE_GPU

    namespace ops = paddle::operators;
-    REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
-    REGISTER_OP_GPU_KERNEL(mul_grad,
-                           ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+    REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
+    REGISTER_OP_CUDA_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
    ```

-### 5. Compilation
+### Compilation

 Run the following commands to compile.

@@ -253,62 +252,51 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass

 2. Generating random input data.

-3. Implementing the same computation logic in a Python script:
+3. Implementing the same computation logic in a Python script.
+
+4. Call check gradient function to check the backward operator.

  ```python
  import unittest
  import numpy as np
-  from gradient_checker import GradientChecker, create_op
-  from op_test_util import OpTestMeta
+  from op_test import OpTest

-  class TestMulOp(unittest.TestCase):
-      __metaclass__ = OpTestMeta

+  class TestMulOp(OpTest):
      def setUp(self):
-          self.type = "mul"
+          self.op_type = "mul"
          self.inputs = {
              'X': np.random.random((32, 84)).astype("float32"),
              'Y': np.random.random((84, 100)).astype("float32")
          }
          self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+
+      def test_check_output(self):
+          self.check_output()
+          
+      def test_check_grad_normal(self):
+          self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+      def test_check_grad_ingore_x(self):
+          self.check_grad(
+              ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+      def test_check_grad_ingore_y(self):
+          self.check_grad(
+              ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
  ```
 Get its output, and compare it with the forward operator's own output.

 The code above first loads required packages. In addition, we have

- `self.type = "mul" ` defines the type that is identical to what the operator's registered type.
+- `self.op_type = "mul" ` defines the type that is identical to what the operator's registered type.
 - `self.inputs` defines input, with type `numpy.array` and initializes it.
 - `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script.

 ### Testing Backward Operators

-A backward operator unit test inherits `GradientChecker`, which inherits `unittest.TestCase`. As a result, **a backward operator unit test needs to be have the prefix `test_`**.
-
-```python
-class TestMulGradOp(GradientChecker):
-    def setUp(self):
-        self.op = create_op("mul")
-        self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
-            'Y': np.random.random((84, 100)).astype("float32")
-        }
-
-    def test_check_grad_normal(self):
-        # mul op will enlarge the relative error
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-```
-
-Some key points in the code above include:
+Some key points in checking gradient above include:

- `create_op("mul")` creates the backward operator's corresponding forward operator.
 - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
  - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
  - The second variable `"Out"` points to the network's final output target `Out`.
@@ -338,5 +326,5 @@ ctest -R test_mul_op

 - Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file.
 - The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures.
- If the operator does not implement a GPU kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
+- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
 - If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -3,12 +3,64 @@
 ##################

 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
-
+也可以利用PaddlePaddle 工具来编译文档，这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下

 如何构建文档
 ============

-PaddlePaddle的文档构建有两种方式。
+PaddlePaddle的文档构建有三种方式。
+
+
+使用PaddlePaddle.org工具
+--------------
+这个是目前推荐的使用方法。除了可以自动编译文档，也可以直接在网页预览文档。
+
+文件工具是使用Docker，需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后及可用以下命令启动工具
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
+之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+
+如果不想使用 Docker，你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+
+工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。
+之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+
+想了解更多PaddlePaddle.org工具的详细信息，可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。

 使用Docker构建
 --------------
@@ -47,17 +99,12 @@ PaddlePaddle的文档构建有两种方式。

 PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程进行书写。

-如何更新文档主题
-================
-
-PaddlePaddle文档主题在 `TO_YOUR_PADDLE_CLONE_PATH/doc_theme` 文件夹下，包含所有和前端网页设计相关的文件。
-
-如何更新doc.paddlepaddle.org
+如何更新www.paddlepaddle.org
 ============================

-更新的文档以PR的形式提交到github中，提交方式参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
-目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://doc.paddlepaddle.org/develop/doc_cn/>`_ 和
-`英文文档 <http://doc.paddlepaddle.org/develop/doc/>`_ 。
+更新的文档以PR的形式提交到github中，提交方式参见 `贡献文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ 和
+`英文文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。


 ..  _cmake: https://cmake.org/

--- a/doc/howto/dev/write_docs_en.rst
+++ b/doc/howto/dev/write_docs_en.rst
+##################
+Contribute Documentation
+##################
+
+PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
+Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
+When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
+
+How to Build Documentations
+============
+
+We recommend using PaddlePaddle.org tool to build documentation
+
+
+Use PaddlePaddle.org tool
+--------------
+This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
+
+The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories. You may only clone the contents you need
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command
+Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
+
+
+If you don't wish to use Docker, you can also activate the tool through Django. Use the following the commands to set up
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+
+Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
+
+If you want to learn more on the PaddlePaddle.org, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ 。
+
+How to write Documentations
+============
+
+PaddlePaddle uses `sphinx`_ to compile documentations，Please check sphinx official website for more detail.
+
+
+How to update www.paddlepaddle.org
+============================
+
+Please create PRs and submit them to github, please check `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
+PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
+`English Docs <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
+
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -9,9 +9,6 @@

  usage/cmd_parameter/index_cn.rst
  usage/cluster/cluster_train_cn.md
-  usage/k8s/k8s_basis_cn.md
-  usage/k8s/k8s_cn.md
-  usage/k8s/k8s_distributed_cn.md

 开发标准
 --------
@@ -19,6 +16,7 @@
 ..  toctree::
  :maxdepth: 1

+  dev/contribute_to_paddle_cn.md
  dev/write_docs_cn.rst

 模型配置

--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -9,8 +9,6 @@ Usage

  usage/cmd_parameter/index_en.rst
  usage/cluster/cluster_train_en.md
-  usage/k8s/k8s_en.md
-  usage/k8s/k8s_aws_en.md

 Development
 ------------
@@ -20,6 +18,7 @@ Development

  dev/new_layer_en.rst
  dev/contribute_to_paddle_en.md
+  dev/write_docs_en.rst

 Configuration
 -------------

--- a/doc/howto/read_source.md
+++ b/doc/howto/read_source.md
+# PaddlePaddle Fluid Source Code Overview
+
+Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book
+
+Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
+
+Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
+
+Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
+
+Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform
+
+# Compile Time
+
+The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
+
+```python
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+```
+
+- Variables: `x`,  `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#L93)
+- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/layers.py)
+  - Every Layer has one or more operators and variables/parameters
+    - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files:
+      - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
+      - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) 
+      - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
+- Optimizer: `fluid.optimizer.SGD`. It does the following
+  - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/backward.cc)]
+  - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py), [C++](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer)]
+
+# Run Time
+
+The following **evaluates** the NN. Instantiates all the variables, operators.
+
+```python
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+# Allocate memory. Initialize Parameter.
+exe.run(fluid.default_startup_program())
+
+# Allocate memory. Do computation.
+exe.run(fluid.default_main_program(),
+        feed=feeder.feed(data),
+        fetch_list=[avg_cost])
+```
+
+- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
+  - The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
+- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
+  - Feeds the data: `feed=feeder.feed(data)`
+  - Evaluates all the operators
+  - Fetches the result: `fetch_list=[avg_cost]`
+- Other worth looking files:
+  - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live
+    - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live
+      - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory)
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
-# PaddlePaddle分布式训练
-
-* [概述](#概述)
-* [环境准备](#环境准备)
-* [启动参数说明](#启动参数说明)
-  * [启动参数服务器](#启动参数服务器)
-  * [启动计算节点](#启动计算节点)
-  * [准备数据集](#准备数据集)
-  * [准备训练程序](#准备训练程序)
-* [使用分布式计算平台或工具](#使用分布式计算平台或工具)
-  * [使用Fabric启动集群作业](#使用fabric启动集群作业)
-     * [准备一个Linux集群](#准备一个linux集群)
-     * [启动集群作业](#启动集群作业)
-     * [终止集群作业](#终止集群作业)
-     * [检查集群训练结果](#检查集群训练结果)
-     * [检查模型输出](#检查模型输出)
-  * [在OpenMPI集群中提交训练作业](#在openmpi集群中提交训练作业)
-     * [准备OpenMPI集群](#准备OpenMPI集群)
-     * [启动集群作业](#启动集群作业-1)
-  * [在Kubernetes集群中提交训练作业](#在kubernetes集群中提交训练作业)
+# 分布式训练
+

 ## 概述
+
 本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：

 <img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
@@ -32,10 +15,11 @@

 在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。

+
 ## 环境准备

 1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被成为一个“节点”。
-1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install)的多种安装方式。我们推荐使用[Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)安装方式来快速安装PaddlePaddle。
+1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。

 安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
 ```bash
@@ -63,12 +47,12 @@ $ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradie
 $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
 ```

-| 参数  | 是否必选 | 默认值 | 说明 |
-| ------------- | ------------- | ------------- | ------------- |
-| port  | 必选 | 7164 | pserver监听的起始端口，根据ports_num决定<br>总端口个数，从起始端口监听多个端口用于通信  |
-| ports_num  | 必选 | 1 | 监听的端口个数  |
-| ports_num_for_sparse  | 必选 | 1 | 用于稀疏类型参数通信的端口个数  |
-| num_gradient_servers  | 必选 | 1 | 当前训练任务pserver总数 |
+参数说明
+
+- port：**必选，默认7164**，pserver监听的起始端口，根据ports_num决定总端口个数，从起始端口监听多个端口用于通信
+- ports_num：**必选，默认1**，监听的端口个数
+- ports_num_for_sparse：**必选，默认1**，用于稀疏类型参数通信的端口个数
+- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数

 ### 启动计算节点
 执行以下命令启动使用python编写的trainer程序（文件名为任意文件名，如train.py）
@@ -105,16 +89,16 @@ paddle.init(
        pservers="127.0.0.1")
 ```

-| 参数  | 是否必选 | 默认 | 说明 |
-| ------------- | ------------- | ------------- | ------------- |
-| use_gpu  | 可选 | False | 是否启用GPU训练 |
-| trainer_count  | 必选 | 1 | 当前训练任务trainer总个数 |
-| port  | 必选 | 7164 | 连接到pserver的端口  |
-| ports_num  | 必选 | 1 | 连接到pserver的端口个数  |
-| ports_num_for_sparse  | 必选 | 1 | 和pserver之间用于稀疏类型参数通信的端口个数  |
-| num_gradient_servers  | 必选 | 1 | 当前训练任务pserver总数 |
-| trainer_id  | 必选 | 0 | 每个trainer的唯一ID，从0开始的整数 |
-| pservers  | 必选 | 127.0.0.1 | 当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开 |
+参数说明
+
+- use_gpu： **可选，默认False**，是否启用GPU训练
+- trainer_count：**必选，默认1**，当前训练任务trainer总个数
+- port：**必选，默认7164**，连接到pserver的端口
+- ports_num：**必选，默认1**，连接到pserver的端口个数
+- ports_num_for_sparse：**必选，默认1**，和pserver之间用于稀疏类型参数通信的端口个数
+- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数
+- trainer_id：**必选，默认0**，每个trainer的唯一ID，从0开始的整数
+- pservers：**必选，默认127.0.0.1**，当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开


 ### 准备数据集
@@ -171,7 +155,7 @@ test.txt-00002

 - `my_lib.py`：会被`train.py`调用的一些用户定义的库函数，比如PIL库等。
 - `word_dict.pickle`：在`train.py`中会使用到的字典数据文件。
- `train.py`：训练程序，代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py)。***注意：*** 对于本样例代码，在使用不同的分布式计算平台时，您可能需要修改`train.py`开头的部分（如下），以便获得训练数据的位置和获取环境变量配置：
+- `train.py`：训练程序，代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意：*** 对于本样例代码，在使用不同的分布式计算平台时，您可能需要修改`train.py`开头的部分（如下），以便获得训练数据的位置和获取环境变量配置：

  ```python
  cluster_train_file = "./train_data_dir/train/train.txt"
@@ -195,91 +179,10 @@ PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务

 在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。

-### 使用Fabric启动集群作业
-
-#### 准备一个Linux集群
-可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下，执行`kubectl -f ssh_servers.yaml`启动一个测试集群，并使用`kubectl get po -o wide`获得这些节点的IP地址。
-
-#### 启动集群作业
-
-`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
-
-`paddle.py` 为方便作业启动提供了两个独特的命令选项。
-
-  `job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
-  `job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
-
-`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务，只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
-```
-sh run.sh
-```
-
-集群作业将会在几秒后启动。
-
-#### 终止集群作业
-`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
-
-#### 检查集群训练结果
-详细信息请检查 $workspace/log 里的日志，每一个节点都有相同的日志结构。
-
-`paddle_trainer.INFO`
-提供几乎所有训练的内部输出日志，与本地训练相同。这里检验运行时间模型的收敛。
-
-`paddle_pserver2.INFO`
-提供 pserver 运行日志，有助于诊断分布式错误。
-
-`server.log`
-提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
-
-`train.log`
-提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
-
-#### 检查模型输出
-运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
-工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
-
-### 在OpenMPI集群中提交训练作业
-
-#### 准备OpenMPI集群
-
-执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点：
-
-```bash
-paddle/scripts/cluster_train_v2/openmpi/docker_cluster
-kubectl create -f head.yaml
-kubectl create -f mpi-nodes.yaml
-```
-
-然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
-
-#### 启动集群作业
-
-您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务：
-
-```bash
-# 获得head和node节点的IP地址
-kubectl get po -o wide
-# 将node节点的IP地址保存到machines文件中
-kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
-# 拷贝必要的文件到head节点
-scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
-# ssh 登录到head节点
-ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
-# --------------- 以下操作均在head节点中执行 ---------------
-# 准备训练数据
-python prepare.py
-# 拷贝训练程序和字典文件到每台MPI节点
-cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
-# 创建日志目录
-mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
-# 拷贝训练数据到各自的节点
-scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
-scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
-scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
-# 启动训练任务
-mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
-```
-
-### 在Kubernetes集群中提交训练作业
+## 在不同集群中运行

-此部分的使用方法可以参考[here](../k8s/k8s_distributed_cn.md)。
+  - [fabric集群](fabric_cn.md)
+  - [openmpi集群](openmpi_cn.md)
+  - [kubernetes单机](k8s_cn.md)
+  - [kubernetes distributed分布式](k8s_distributed_cn.md)
+  - [AWS上运行kubernetes集群训练](k8s_aws_cn.md)
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
-# PaddlePaddle Distributed Training
-
-* [Introduction](#introduction)
-* [Preparations](#preparations)
-* [Command-line arguments](#command-line-arguments)
-   * [Starting parameter server](#starting-parameter-server)
-   * [Starting trainer](#starting-trainer)
-   * [Prepare Training Dataset](#prepare-training-dataset)
-   * [Prepare Training program](#prepare-training-program)
-* [Use cluster platforms or cluster management tools](#use-cluster-platforms-or-cluster-management-tools)
-   * [Cluster Training Using Fabric](#cluster-training-using-fabric)
-      * [Prepare a Linux cluster](#prepare-a-linux-cluster)
-      * [Launching Cluster Job](#launching-cluster-job)
-      * [Kill Cluster Job](#kill-cluster-job)
-      * [Check Cluster Training Result](#check-cluster-training-result)
-      * [Check Model Output](#check-model-output)
-   * [Cluster Training Using OpenMPI](#cluster-training-using-openmpi)
-      * [Prepare an OpenMPI cluster](#prepare-an-openmpi-cluster)
-      * [Launching Cluster Job](#launching-cluster-job-1)
-   * [Cluster Training Using Kubernetes](#cluster-training-using-kubernetes)
+# Distributed Training

 ## Introduction

@@ -35,7 +16,7 @@ When training with synchronize SGD, PaddlePaddle uses an internal "synchronize b

 ## Preparations
 1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
-2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install) document. We strongly recommend using [Docker installation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
+2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).

 After installation, you can check the version by typing the below command (run a docker container  if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):

@@ -67,12 +48,12 @@ If you wish to run parameter servers in background, and save a log file, you can
 $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
 ```

-| param  | required | default | description |
-| ------------- | ------------- | ------------- | ------------- |
-| port  | required | 7164 | port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput |
-| ports_num  | required | 1 | total number of ports will listen on  |
-| ports_num_for_sparse  | required | 1 | number of ports which serves sparse parameter update  |
-| num_gradient_servers  | required | 1 | total number of gradient servers |
+Parameter Description
+
+- port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput.
+- ports_num: **required, default 1**, total number of ports will listen on.
+- ports_num_for_sparse: **required, default 1**, number of ports which serves sparse parameter update.
+- num_gradient_servers: **required, default 1**, total number of gradient servers.

 ### Starting trainer
 Type the command below to start the trainer(name the file whatever you want, like "train.py")
@@ -111,16 +92,16 @@ paddle.init(
        pservers="127.0.0.1")
 ```

-| param  | required | default | description |
-| ------------- | ------------- | ------------- | ------------- |
-| use_gpu  | optional | False | set to "True" to enable GPU training |
-| trainer_count  | required | 1 | total count of trainers in the training job |
-| port  | required | 7164 | port to connect to parameter server  |
-| ports_num  | required | 1 | number of ports for communication |
-| ports_num_for_sparse  | required | 1 | number of ports for sparse type caculation |
-| num_gradient_servers  | required | 1 | total number of gradient server |
-| trainer_id  | required | 0 | ID for every trainer, start from 0 |
-| pservers  | required | 127.0.0.1 | list of IPs of parameter servers, separated by "," |
+Parameter Description
+
+- use_gpu: **optional, default False**, set to "True" to enable GPU training.
+- trainer_count: **required, default 1**, total count of trainers in the training job.
+- port: **required, default 7164**, port to connect to parameter server.
+- ports_num: **required, default 1**, number of ports for communication.
+- ports_num_for_sparse: **required, default 1**, number of ports for sparse type caculation.
+- num_gradient_servers: **required, default 1**, total number of gradient server.
+- trainer_id: **required, default 0**, ID for every trainer, start from 0.
+- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".

 ### Prepare Training Dataset

@@ -178,7 +159,7 @@ Your workspace may looks like:

 - `my_lib.py`: user defined libraries, like PIL libs. This is optional.
 - `word_dict.pickle`: dict file for training word embeding.
- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables:
+- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables:

  ```python
  cluster_train_file = "./train_data_dir/train/train.txt"
@@ -202,92 +183,9 @@ We'll introduce cluster job management on these platforms. The examples can be f

 These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.

-### Cluster Training Using Fabric
-
-#### Prepare a Linux cluster
-
-Run `kubectl -f ssh_servers.yaml` under the directory:  `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes.
-
-#### Launching Cluster Job
-`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
-
-`paddle.py`provides two distinguished command option for easy job launching.
-
- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying.
- `job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
-dispatch latency.
-
-`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
-```
-sh run.sh
-```
-
-The cluster Job will start in several seconds.
-
-#### Kill Cluster Job
-`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed.
-
-#### Check Cluster Training Result
-Check log in $workspace/log for details, each node owns same log structure.
-
-`paddle_trainer.INFO`
-It provides almost all internal output log for training,  same as local training. Check runtime model convergence here.
-
-`paddle_pserver2.INFO`
-It provides parameter server running log, which could help to diagnose distributed error.
-
-`server.log`
-It provides stderr and stdout of parameter server process. Check error log if training crashes.
-
-`train.log`
-It provides stderr and stdout of trainer process. Check error log if training crashes.
-
-#### Check Model Output
-After one pass finished, model files will be written in `output` directory in node 0.
-`nodefile` in workspace indicates the node id of current cluster job.
-
-### Cluster Training Using OpenMPI
-
-#### Prepare an OpenMPI cluster
-
-Run the following command to start a 3-node MPI cluster and one "head" node.
-
-```bash
-cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
-kubectl create -f head.yaml
-kubectl create -f mpi-nodes.yaml
-```
-
-Then you can log in to every OpenMPI node using ssh without input any passwords.
-
-#### Launching Cluster Job
-
-Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:\
-
-```bash
-# find out node IP addresses
-kubectl get po -o wide
-# generate a "machines" file containing node IP addresses
-kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
-# copy necessary files onto "head" node
-scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
-# login to head node using ssh
-ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
-# --------------- in head node ---------------
-# prepare training data
-python prepare.py
-# copy training data and dict file to MPI nodes
-cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
-# creat a directory for storing log files
-mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
-# copy training data to every node
-scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
-scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
-scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
-# start the job
-mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
-```
-
-### Cluster Training Using Kubernetes
+## Use different clusters

-The details can be found [here](../k8s/k8s_cn.md)
+  - [fabric](fabric_en.md)
+  - [openmpi](openmpi_en.md)
+  - [kubernetes](k8s_en.md)
+  - [kubernetes on AWS](k8s_aws_en.md)
--- a/doc/howto/usage/cluster/fabric_cn.md
+++ b/doc/howto/usage/cluster/fabric_cn.md
+# 使用fabric启动集群训练
+
+## 准备一个Linux集群
+可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下，执行`kubectl -f ssh_servers.yaml`启动一个测试集群，并使用`kubectl get po -o wide`获得这些节点的IP地址。
+
+## 启动集群作业
+
+`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
+
+`paddle.py` 为方便作业启动提供了两个独特的命令选项。
+
+-  `job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
+-  `job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
+
+`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务，只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
+```
+sh run.sh
+```
+
+集群作业将会在几秒后启动。
+
+## 终止集群作业
+`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
+
+## 检查集群训练结果
+详细信息请检查 $workspace/log 里的日志，每一个节点都有相同的日志结构。
+
+`paddle_trainer.INFO`
+提供几乎所有训练的内部输出日志，与本地训练相同。这里检验运行时间模型的收敛。
+
+`paddle_pserver2.INFO`
+提供 pserver 运行日志，有助于诊断分布式错误。
+
+`server.log`
+提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+`train.log`
+提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+## 检查模型输出
+运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
+工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
--- a/doc/howto/usage/cluster/fabric_en.md
+++ b/doc/howto/usage/cluster/fabric_en.md
+# Cluster Training Using Fabric
+
+## Prepare a Linux cluster
+
+Run `kubectl -f ssh_servers.yaml` under the directory:  `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes.
+
+## Launching Cluster Job
+`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
+
+`paddle.py`provides two distinguished command option for easy job launching.
+
+- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying.
+- `job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
+dispatch latency.
+
+`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
+```
+sh run.sh
+```
+
+The cluster Job will start in several seconds.
+
+## Kill Cluster Job
+`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed.
+
+## Check Cluster Training Result
+Check log in $workspace/log for details, each node owns same log structure.
+
+`paddle_trainer.INFO`
+It provides almost all internal output log for training,  same as local training. Check runtime model convergence here.
+
+`paddle_pserver2.INFO`
+It provides parameter server running log, which could help to diagnose distributed error.
+
+`server.log`
+It provides stderr and stdout of parameter server process. Check error log if training crashes.
+
+`train.log`
+It provides stderr and stdout of trainer process. Check error log if training crashes.
+
+## Check Model Output
+After one pass finished, model files will be written in `output` directory in node 0.
+`nodefile` in workspace indicates the node id of current cluster job.
--- a/doc/howto/usage/cluster/k8s_aws_cn.md
+++ b/doc/howto/usage/cluster/k8s_aws_cn.md
+k8s_aws_en.md
\ No newline at end of file
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -493,7 +493,7 @@ spec:
    spec:
      containers:
      - name: paddle-data
-        image: paddledev/paddle-tutorial:k8s_data
+        image: paddlepaddle/paddle-tutorial:k8s_data
        imagePullPolicy: Always
        volumeMounts:
        - mountPath: "/efs"
@@ -522,7 +522,7 @@ NAME          DESIRED   SUCCESSFUL   AGE
 paddle-data   1         1            6m
 ```

-Data preparation is done by docker image `paddledev/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.
+Data preparation is done by docker image `paddlepaddle/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.

 #### Start Training

@@ -545,7 +545,7 @@ spec:
          claimName: efsvol
      containers:
      - name: trainer
-        image: paddledev/paddle-tutorial:k8s_train
+        image: paddlepaddle/paddle-tutorial:k8s_train
        command: ["bin/bash",  "-c", "/root/start.sh"]
        env:
        - name: JOB_NAME
@@ -617,7 +617,7 @@ kubectl --kubeconfig=kubeconfig log -f POD_NAME

 Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes.

-The details for start `pserver` and `trainer` are hidden inside docker image `paddledev/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.
+The details for start `pserver` and `trainer` are hidden inside docker image `paddlepaddle/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.

 #### Inspect Training Output


--- a/doc/howto/usage/k8s/k8s_cn.md
+++ b/doc/howto/usage/k8s/k8s_cn.md
 # Kubernetes单机训练

-在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
+在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的PaddlePaddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。

 ## 制作Docker镜像

-在一个功能齐全的Kubernetes机群里，通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话，一个分布式Paddle训练任务中的每个进程都可以从Ceph读取数据。在这个例子里，我们只演示一个单机作业，所以可以简化对环境的要求，把训练数据直接放在
-Paddle的Docker image里。为此，我们需要制作一个包含训练数据的Paddle镜像。
+在一个功能齐全的Kubernetes机群里，通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话，一个分布式PaddlePaddle训练任务中
+的每个进程都可以从Ceph读取数据。在这个例子里，我们只演示一个单机作业，所以可以简化对环境的要求，把训练数据直接放在
+PaddlePaddle的Docker Image里。为此，我们需要制作一个包含训练数据的PaddlePaddle镜像。
+
+PaddlePaddle的 `paddlepaddle/paddle:cpu-demo-latest` 镜像里有PaddlePaddle的源码与demo，
+（请注意，默认的PaddlePaddle生产环境镜像 `paddlepaddle/paddle:latest` 是不包括源码的，PaddlePaddle的各版本镜像可以参考
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)），
+下面我们使用这个镜像来下载数据到Docker Container中，并把这个包含了训练数据的Container保存为一个新的镜像。

-Paddle 的 [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) 
-里介绍了用Paddle源码中的脚本下载训练数据的过程。
-而 `paddledev/paddle:cpu-demo-latest` 镜像里有 Paddle 源码与demo，（ 请注意，默认的
-Paddle镜像 `paddledev/paddle:cpu-latest` 是不包括源码的, Paddle的各版本镜像可以参考 [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html) ），所以我们使用这个镜像来下载训练数据到Docker container中，然后把这个包含了训练数据的container保存为一个新的镜像。
-  
 ### 运行容器

 ```
-$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
 ```

 ### 下载数据
@@ -103,7 +104,7 @@ spec:
      restartPolicy: Never
 ```

-### 创建Paddle Job
+### 创建PaddlePaddle Job

 使用上文创建的yaml文件创建Kubernetes Job，命令为：


--- a/doc/howto/usage/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
 # Kubernetes分布式训练

-前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
-
-有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群，可以参考[k8s_basis](./k8s_basis_cn.md)。
+前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cluster/cluster_train_cn.html)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。

 ## 整体方案

@@ -28,7 +26,7 @@ PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行
 - 拷贝训练文件到容器内
 - 生成`paddle pserver`与`paddle train`进程的启动参数，并且启动训练

-因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/src/k8s_train/Dockerfile)。
+因为官方镜像 `paddlepaddle/paddle:latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile)。

 ```bash
 $ cd doc/howto/usage/k8s/src/k8s_train
@@ -62,7 +60,7 @@ spec:
      hostNetwork: true
      containers:
      - name: paddle-data
-        image: paddledev/paddle-tutorial:k8s_data
+        image: paddlepaddle/paddle-tutorial:k8s_data
        imagePullPolicy: Always
        volumeMounts:
        - mountPath: "/mnt"
@@ -149,20 +147,19 @@ spec:

 文件中，`metadata`下的`name`表示这个job的名字。`parallelism，completions`字段表示这个job会同时开启3个PaddlePaddle节点，成功训练且退出的pod数目为3时，这个job才算成功结束。然后申明一个存储卷`jobpath`，代表宿主机目录`/home/work/mfs`，在对容器的描述`containers`字段中，将此目录挂载为容器的`/home/jobpath`目录，这样容器的`/home/jobpath`目录就成为了共享存储，放在这个目录里的文件其实是保存到了MFS上。

-`env`字段表示容器的环境变量，我们将`paddle`运行的一些参数通过这种方式传递到容器内。
+`env`字段表示容器的环境变量，我们将`paddle`运行的一些参数通过这种方式传递到容器内：
+

-环境变量 | 说明
--- | ---
-JOB_PATH | 共享存储挂在的路径
-JOB_NAME | Job的名字
-TRAIN_CONFIG_DIR | 本次训练文件所在目录，与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
-CONF_PADDLE_NIC | `paddle pserver`进程需要的`--nics`参数，即网卡名
-CONF_PADDLE_PORT | `paddle paserver`的`--port`参数
-CONF_PADDLE_PORTS_NUM | 稠密更新的端口数量，即`--ports_num`参数
-CONF_PADDLE_PORTS_NUM_SPARSE | 稀疏更新的端口数量，即`--ports_num_for_sparse`参数
-CONF_PADDLE_GRADIENT_NUM | 训练节点数量，即`--num_gradient_servers参数`
+- JOB_PATH：共享存储挂在的路径
+- JOB_NAME：Job的名字
+- TRAIN_CONFIG_DIR：本次训练文件所在目录，与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
+- CONF_PADDLE_NIC：`paddle pserver`进程需要的`--nics`参数，即网卡名
+- CONF_PADDLE_PORT：`paddle paserver`的`--port`参数
+- CONF_PADDLE_PORTS_NUM：稠密更新的端口数量，即`--ports_num`参数
+- CONF_PADDLE_PORTS_NUM_SPARSE：稀疏更新的端口数量，即`--ports_num_for_sparse`参数
+- CONF_PADDLE_GRADIENT_NUM：训练节点数量，即`--num_gradient_servers参数`

-这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。
+这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。

 编写完YAML文件后，可以使用Kubernetes的命令行工具创建job。


--- a/doc/howto/usage/k8s/k8s_en.md
+++ b/doc/howto/usage/k8s/k8s_en.md
-# Paddle On Kubernetes
+# PaddlePaddle On Kubernetes

->In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
+In this article, we will introduce how to run PaddlePaddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run PaddlePaddle training job on distributed cluster.

 ## Build Docker Image

-In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+In distributed Kubernetes cluster, we will use Ceph or other distributed
+storage system for storing training related data so that all processes in
+PaddlePaddle training can retrieve data from Ceph. In this example, we will
+only demo training job on single machine. In order to simplify the requirement
+of the environment, we will directly put training data into the PaddlePaddle Docker Image,
+so we need to create a PaddlePaddle Docker image that includes the training data.
+
+The production Docker Image `paddlepaddle/paddle:cpu-demo-latest` has the PaddlePaddle
+source code and demo. (Caution: Default PaddlePaddle Docker Image `paddlepaddle/paddle:latest` doesn't include
+the source code, PaddlePaddle's different versions of Docker Image can be referred here:
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_en.html)),
+so we run this Docker Image and download the training data, and then commit the whole
+Container to be a new Docker Image.

-Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
-And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
-  
 ### Run Docker Container

 ```
-$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
 ```

 ### Download Training Data
@@ -67,7 +76,7 @@ $ docker commit quick_start_data mypaddle/paddle:quickstart

 ## Use Kubernetes For Training

->We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.
+We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.

 ### Create Yaml Files

@@ -99,7 +108,7 @@ spec:
      restartPolicy: Never
 ```

-### Start Paddle Job
+### Start PaddlePaddle Job

 Using the above yaml file to start the Kubernetes job.


--- a/doc/howto/usage/cluster/openmpi_cn.md
+++ b/doc/howto/usage/cluster/openmpi_cn.md
+# 在OpenMPI集群中提交训练作业
+
+## 准备OpenMPI集群
+
+执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点：
+
+```bash
+paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
+
+## 启动集群作业
+
+您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务：
+
+```bash
+# 获得head和node节点的IP地址
+kubectl get po -o wide
+# 将node节点的IP地址保存到machines文件中
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# 拷贝必要的文件到head节点
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# ssh 登录到head节点
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- 以下操作均在head节点中执行 ---------------
+# 准备训练数据
+python prepare.py
+# 拷贝训练程序和字典文件到每台MPI节点
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# 创建日志目录
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# 拷贝训练数据到各自的节点
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# 启动训练任务
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
--- a/doc/howto/usage/cluster/openmpi_en.md
+++ b/doc/howto/usage/cluster/openmpi_en.md
--- a/doc/howto/usage/k8s/src/Dockerfile
+++ b/doc/howto/usage/k8s/src/Dockerfile
--- a/doc/howto/usage/k8s/src/add_security_group.png
+++ b/doc/howto/usage/k8s/src/add_security_group.png
--- a/doc/howto/usage/k8s/src/create_efs.png
+++ b/doc/howto/usage/k8s/src/create_efs.png
--- a/doc/howto/usage/k8s/src/efs_mount.png
+++ b/doc/howto/usage/k8s/src/efs_mount.png
--- a/doc/howto/usage/cluster/src/k8s-paddle-arch.png
+++ b/doc/howto/usage/cluster/src/k8s-paddle-arch.png
--- a/doc/howto/usage/k8s/src/k8s_data/Dockerfile
+++ b/doc/howto/usage/k8s/src/k8s_data/Dockerfile
--- a/doc/howto/usage/k8s/src/k8s_data/README.md
+++ b/doc/howto/usage/k8s/src/k8s_data/README.md
--- a/doc/howto/usage/k8s/src/k8s_data/get_data.sh
+++ b/doc/howto/usage/k8s/src/k8s_data/get_data.sh
--- a/doc/howto/usage/k8s/src/k8s_train/Dockerfile
+++ b/doc/howto/usage/k8s/src/k8s_train/Dockerfile
--- a/doc/howto/usage/k8s/src/k8s_train/README.md
+++ b/doc/howto/usage/k8s/src/k8s_train/README.md
--- a/doc/howto/usage/k8s/src/k8s_train/start.sh
+++ b/doc/howto/usage/k8s/src/k8s_train/start.sh
--- a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py
+++ b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py
--- a/doc/howto/usage/k8s/src/managed_policy.png
+++ b/doc/howto/usage/k8s/src/managed_policy.png
--- a/doc/howto/usage/k8s/src/pserver_and_trainer.png
+++ b/doc/howto/usage/k8s/src/pserver_and_trainer.png
--- a/doc/howto/usage/k8s/src/route53_create_recordset.png
+++ b/doc/howto/usage/k8s/src/route53_create_recordset.png
--- a/doc/howto/usage/k8s/src/route53_create_zone.png
+++ b/doc/howto/usage/k8s/src/route53_create_zone.png
--- a/doc/howto/usage/k8s/src/worker_security_group.png
+++ b/doc/howto/usage/k8s/src/worker_security_group.png
--- a/doc/howto/usage/k8s/k8s_basis_cn.md
+++ b/doc/howto/usage/k8s/k8s_basis_cn.md
--- a/doc/howto/usage/k8s/src/k8s-paddle-arch.png
+++ b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
--- a/doc/mobile/cross_compiling_for_ios_en.md
+++ b/doc/mobile/cross_compiling_for_ios_en.md
--- a/doc/mobile/index_en.rst
+++ b/doc/mobile/index_en.rst
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
--- a/paddle/capi/Main.cpp
+++ b/paddle/capi/Main.cpp
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
--- a/paddle/capi/error.cpp
+++ b/paddle/capi/error.cpp
--- a/paddle/capi/error.h
+++ b/paddle/capi/error.h
--- a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
+++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
--- a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
+++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
--- a/paddle/capi/main.h
+++ b/paddle/capi/main.h
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
--- a/paddle/framework/data_layout.h
+++ b/paddle/framework/data_layout.h
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
--- a/paddle/framework/init.h
+++ b/paddle/framework/init.h
--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
--- a/paddle/framework/library_type.h
+++ b/paddle/framework/library_type.h
--- a/paddle/framework/lod_rank_table.cc
+++ b/paddle/framework/lod_rank_table.cc
--- a/paddle/framework/lod_rank_table.h
+++ b/paddle/framework/lod_rank_table.h
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/framework/op_proto_maker.h
--- a/paddle/framework/op_proto_maker_test.cc
+++ b/paddle/framework/op_proto_maker_test.cc
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
--- a/paddle/framework/prune.h
+++ b/paddle/framework/prune.h
--- a/paddle/framework/prune_test.cc
+++ b/paddle/framework/prune_test.cc
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
--- a/paddle/framework/var_type_inference.h
+++ b/paddle/framework/var_type_inference.h
--- a/paddle/framework/var_type_inference_test.cc
+++ b/paddle/framework/var_type_inference_test.cc
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
--- a/paddle/gserver/layers/ConvTransProjection.cpp
+++ b/paddle/gserver/layers/ConvTransProjection.cpp
--- a/paddle/gserver/layers/MKLDNNLRNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
--- a/paddle/gserver/layers/MKLDNNLRNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.h
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
--- a/paddle/gserver/layers/PoolProjection.cpp
+++ b/paddle/gserver/layers/PoolProjection.cpp
--- a/paddle/gserver/layers/PoolProjection.h
+++ b/paddle/gserver/layers/PoolProjection.h
--- a/paddle/gserver/layers/ROIPoolLayer.cpp
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
--- a/paddle/gserver/layers/SequenceToBatch.cpp
+++ b/paddle/gserver/layers/SequenceToBatch.cpp
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
--- a/paddle/gserver/tests/mkldnn_simple_net.conf
+++ b/paddle/gserver/tests/mkldnn_simple_net.conf
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
--- a/paddle/math/NEONFunctions.cpp
+++ b/paddle/math/NEONFunctions.cpp
--- a/paddle/math/NEONFunctions.h
+++ b/paddle/math/NEONFunctions.h
--- a/paddle/math/float16.h
+++ b/paddle/math/float16.h
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
--- a/paddle/math/tests/test_float16.cpp
+++ b/paddle/math/tests/test_float16.cpp
--- a/paddle/math/tests/test_float16.cu
+++ b/paddle/math/tests/test_float16.cu
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
--- a/paddle/operators/adadelta_op.cu
+++ b/paddle/operators/adadelta_op.cu
--- a/paddle/operators/adadelta_op.h
+++ b/paddle/operators/adadelta_op.h
--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
--- a/paddle/operators/adagrad_op.h
+++ b/paddle/operators/adagrad_op.h
--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
--- a/paddle/operators/adam_op.cu
+++ b/paddle/operators/adam_op.cu
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
--- a/paddle/operators/adamax_op.cu
+++ b/paddle/operators/adamax_op.cu
--- a/paddle/operators/adamax_op.h
+++ b/paddle/operators/adamax_op.h
--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
--- a/paddle/operators/auc_op.cc
+++ b/paddle/operators/auc_op.cc
--- a/paddle/operators/auc_op.h
+++ b/paddle/operators/auc_op.h
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/operators/batch_norm_op.cu.cc
--- a/paddle/operators/batch_norm_op.h
+++ b/paddle/operators/batch_norm_op.h
--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
--- a/paddle/operators/beam_search_op.cc
+++ b/paddle/operators/beam_search_op.cc
--- a/paddle/operators/bilinear_tensor_product_op.cc
+++ b/paddle/operators/bilinear_tensor_product_op.cc
--- a/paddle/operators/bilinear_tensor_product_op.cu
+++ b/paddle/operators/bilinear_tensor_product_op.cu
--- a/paddle/operators/bilinear_tensor_product_op.h
+++ b/paddle/operators/bilinear_tensor_product_op.h
--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
--- a/paddle/operators/cast_op.cu
+++ b/paddle/operators/cast_op.cu
--- a/paddle/operators/cast_op.h
+++ b/paddle/operators/cast_op.h
--- a/paddle/operators/chunk_eval_op.cc
+++ b/paddle/operators/chunk_eval_op.cc
--- a/paddle/operators/chunk_eval_op.h
+++ b/paddle/operators/chunk_eval_op.h
--- a/paddle/operators/clip_by_norm_op.cc
+++ b/paddle/operators/clip_by_norm_op.cc
--- a/paddle/operators/clip_by_norm_op.cu
+++ b/paddle/operators/clip_by_norm_op.cu
--- a/paddle/operators/clip_by_norm_op.h
+++ b/paddle/operators/clip_by_norm_op.h
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
--- a/paddle/operators/clip_op.cu
+++ b/paddle/operators/clip_op.cu
--- a/paddle/operators/clip_op.h
+++ b/paddle/operators/clip_op.h
--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
--- a/paddle/operators/compare_op.cu
+++ b/paddle/operators/compare_op.cu
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
--- a/paddle/operators/concat_op.cu.cc
+++ b/paddle/operators/concat_op.cu.cc
--- a/paddle/operators/concat_op.h
+++ b/paddle/operators/concat_op.h
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/operators/conditional_block_op.cc
--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
--- a/paddle/operators/conv_shift_op.cc
+++ b/paddle/operators/conv_shift_op.cc
--- a/paddle/operators/conv_shift_op.cu
+++ b/paddle/operators/conv_shift_op.cu
--- a/paddle/operators/conv_shift_op.h
+++ b/paddle/operators/conv_shift_op.h
--- a/paddle/operators/conv_transpose_cudnn_op.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cc
--- a/paddle/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
--- a/paddle/operators/conv_transpose_op.cu.cc
+++ b/paddle/operators/conv_transpose_op.cu.cc
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
--- a/paddle/operators/cos_sim_op.cu
+++ b/paddle/operators/cos_sim_op.cu
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/operators/crf_decoding_op.cc
--- a/paddle/operators/crf_decoding_op.h
+++ b/paddle/operators/crf_decoding_op.h
--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
--- a/paddle/operators/crop_op.cu
+++ b/paddle/operators/crop_op.cu
--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
--- a/paddle/operators/decayed_adagrad_op.cc
+++ b/paddle/operators/decayed_adagrad_op.cc
--- a/paddle/operators/decayed_adagrad_op.cu
+++ b/paddle/operators/decayed_adagrad_op.cu
--- a/paddle/operators/decayed_adagrad_op.h
+++ b/paddle/operators/decayed_adagrad_op.h
--- a/paddle/operators/detail/recv_impl.cc
+++ b/paddle/operators/detail/recv_impl.cc
--- a/paddle/operators/detail/send_impl.cc
+++ b/paddle/operators/detail/send_impl.cc
--- a/paddle/operators/detail/send_recv.proto
+++ b/paddle/operators/detail/send_recv.proto
--- a/paddle/operators/detail/send_recv_impl.h
+++ b/paddle/operators/detail/send_recv_impl.h
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
--- a/paddle/operators/elementwise_add_op.cc
+++ b/paddle/operators/elementwise_add_op.cc
--- a/paddle/operators/elementwise_add_op.cu
+++ b/paddle/operators/elementwise_add_op.cu
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
--- a/paddle/operators/elementwise_div_op.cc
+++ b/paddle/operators/elementwise_div_op.cc
--- a/paddle/operators/elementwise_div_op.cu
+++ b/paddle/operators/elementwise_div_op.cu
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
--- a/paddle/operators/elementwise_sub_op.cc
+++ b/paddle/operators/elementwise_sub_op.cc
--- a/paddle/operators/elementwise_sub_op.cu
+++ b/paddle/operators/elementwise_sub_op.cu
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
--- a/paddle/operators/expand_op.cc
+++ b/paddle/operators/expand_op.cc
--- a/paddle/operators/expand_op.cu
+++ b/paddle/operators/expand_op.cu
--- a/paddle/operators/expand_op.h
+++ b/paddle/operators/expand_op.h
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
--- a/paddle/operators/fill_constant_batch_size_like_op.cu.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
--- a/paddle/operators/fill_constant_batch_size_like_op.h
+++ b/paddle/operators/fill_constant_batch_size_like_op.h
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
--- a/paddle/operators/fill_op.cc
+++ b/paddle/operators/fill_op.cc
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
--- a/paddle/operators/fill_zeros_like_op.cu.cc
+++ b/paddle/operators/fill_zeros_like_op.cu.cc
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
--- a/paddle/operators/ftrl_op.cc
+++ b/paddle/operators/ftrl_op.cc
--- a/paddle/operators/ftrl_op.cu
+++ b/paddle/operators/ftrl_op.cu
--- a/paddle/operators/ftrl_op.h
+++ b/paddle/operators/ftrl_op.h
--- a/paddle/operators/gather.cu.h
+++ b/paddle/operators/gather.cu.h
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
--- a/paddle/operators/gru_op.cc
+++ b/paddle/operators/gru_op.cc
--- a/paddle/operators/gru_op.cu.cc
+++ b/paddle/operators/gru_op.cu.cc
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
--- a/paddle/operators/gru_unit_op.cu
+++ b/paddle/operators/gru_unit_op.cu
--- a/paddle/operators/gru_unit_op.h
+++ b/paddle/operators/gru_unit_op.h
--- a/paddle/operators/hinge_loss_op.cc
+++ b/paddle/operators/hinge_loss_op.cc
--- a/paddle/operators/seq_expand_op.cu
+++ b/paddle/operators/seq_expand_op.cu
--- a/paddle/operators/hinge_loss_op.h
+++ b/paddle/operators/hinge_loss_op.h
--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
--- a/paddle/operators/huber_loss_op.cu
+++ b/paddle/operators/huber_loss_op.cu
--- a/paddle/operators/huber_loss_op.h
+++ b/paddle/operators/huber_loss_op.h
--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
--- a/paddle/operators/is_empty_op.cc
+++ b/paddle/operators/is_empty_op.cc
--- a/paddle/operators/l1_norm_op.cc
+++ b/paddle/operators/l1_norm_op.cc
--- a/paddle/operators/l1_norm_op.cu
+++ b/paddle/operators/l1_norm_op.cu
--- a/paddle/operators/l1_norm_op.h
+++ b/paddle/operators/l1_norm_op.h
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
--- a/paddle/operators/linear_chain_crf_op.cu
+++ b/paddle/operators/linear_chain_crf_op.cu
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
--- a/paddle/operators/lod_array_length_op.cc
+++ b/paddle/operators/lod_array_length_op.cc
--- a/paddle/operators/lod_rank_table_op.cc
+++ b/paddle/operators/lod_rank_table_op.cc
--- a/paddle/operators/lod_reset_op.cc
+++ b/paddle/operators/lod_reset_op.cc
--- a/paddle/operators/lod_reset_op.cu
+++ b/paddle/operators/lod_reset_op.cu
--- a/paddle/operators/lod_reset_op.h
+++ b/paddle/operators/lod_reset_op.h
--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
--- a/paddle/operators/log_loss_op.cc
+++ b/paddle/operators/log_loss_op.cc
--- a/paddle/operators/log_loss_op.cu
+++ b/paddle/operators/log_loss_op.cu
--- a/paddle/operators/log_loss_op.h
+++ b/paddle/operators/log_loss_op.h
--- a/paddle/operators/logical_op.cc
+++ b/paddle/operators/logical_op.cc
--- a/paddle/operators/logical_op.cu
+++ b/paddle/operators/logical_op.cu
--- a/paddle/operators/logical_op.h
+++ b/paddle/operators/logical_op.h
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
--- a/paddle/operators/lrn_op.cc
+++ b/paddle/operators/lrn_op.cc
--- a/paddle/operators/lrn_op.cu
+++ b/paddle/operators/lrn_op.cu
--- a/paddle/operators/lrn_op.h
+++ b/paddle/operators/lrn_op.h
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
--- a/paddle/operators/lstm_op.cu.cc
+++ b/paddle/operators/lstm_op.cu.cc
--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/operators/margin_rank_loss_op.cc
--- a/paddle/operators/margin_rank_loss_op.cu
+++ b/paddle/operators/margin_rank_loss_op.cu
--- a/paddle/operators/margin_rank_loss_op.h
+++ b/paddle/operators/margin_rank_loss_op.h
--- a/paddle/operators/math/context_project.cc
+++ b/paddle/operators/math/context_project.cc
--- a/paddle/operators/math/context_project.cu
+++ b/paddle/operators/math/context_project.cu
--- a/paddle/operators/math/context_project.h
+++ b/paddle/operators/math/context_project.h
--- a/paddle/operators/math/cross_entropy.cc
+++ b/paddle/operators/math/cross_entropy.cc
--- a/paddle/operators/math/cross_entropy.cu
+++ b/paddle/operators/math/cross_entropy.cu
--- a/paddle/operators/math/cross_entropy.h
+++ b/paddle/operators/math/cross_entropy.h
--- a/paddle/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_cpu_kernel.h
--- a/paddle/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_gpu_kernel.h
--- a/paddle/operators/math/detail/lstm_kernel.h
+++ b/paddle/operators/math/detail/lstm_kernel.h
--- a/paddle/operators/math/gru_compute.cc
+++ b/paddle/operators/math/gru_compute.cc
--- a/paddle/operators/math/gru_compute.cu
+++ b/paddle/operators/math/gru_compute.cu
--- a/paddle/operators/math/gru_compute.h
+++ b/paddle/operators/math/gru_compute.h
--- a/paddle/operators/math/im2col.cc
+++ b/paddle/operators/math/im2col.cc
--- a/paddle/operators/math/im2col.cu
+++ b/paddle/operators/math/im2col.cu
--- a/paddle/operators/math/im2col.h
+++ b/paddle/operators/math/im2col.h
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
--- a/paddle/operators/math/lstm_compute.cc
+++ b/paddle/operators/math/lstm_compute.cc
--- a/paddle/operators/math/lstm_compute.cu
+++ b/paddle/operators/math/lstm_compute.cu
--- a/paddle/operators/math/lstm_compute.h
+++ b/paddle/operators/math/lstm_compute.h
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
--- a/paddle/operators/math/math_function_test.cu
+++ b/paddle/operators/math/math_function_test.cu
--- a/paddle/operators/math/matmul.h
+++ b/paddle/operators/math/matmul.h
--- a/paddle/operators/math/maxouting.cc
+++ b/paddle/operators/math/maxouting.cc
--- a/paddle/operators/math/maxouting.cu
+++ b/paddle/operators/math/maxouting.cu
--- a/paddle/operators/math/maxouting.h
+++ b/paddle/operators/math/maxouting.h
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
--- a/paddle/operators/math/selected_rows_functor.cc
+++ b/paddle/operators/math/selected_rows_functor.cc
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
--- a/paddle/operators/math/selected_rows_functor.h
+++ b/paddle/operators/math/selected_rows_functor.h
--- a/paddle/operators/math/selected_rows_functor_test.cc
+++ b/paddle/operators/math/selected_rows_functor_test.cc
--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/operators/math/selected_rows_functor_test.cu
--- a/paddle/operators/math/sequence2batch.cc
+++ b/paddle/operators/math/sequence2batch.cc
--- a/paddle/operators/math/sequence2batch.cu
+++ b/paddle/operators/math/sequence2batch.cu
--- a/paddle/operators/math/sequence2batch.h
+++ b/paddle/operators/math/sequence2batch.h
--- a/paddle/operators/math/sequence_pooling.cc
+++ b/paddle/operators/math/sequence_pooling.cc
--- a/paddle/operators/math/sequence_pooling.cu
+++ b/paddle/operators/math/sequence_pooling.cu
--- a/paddle/operators/math/sequence_pooling.h
+++ b/paddle/operators/math/sequence_pooling.h
--- a/paddle/operators/math/softmax.cc
+++ b/paddle/operators/math/softmax.cc
--- a/paddle/operators/math/softmax.cu
+++ b/paddle/operators/math/softmax.cu
--- a/paddle/operators/math/softmax.h
+++ b/paddle/operators/math/softmax.h
--- a/paddle/operators/math/softmax_impl.h
+++ b/paddle/operators/math/softmax_impl.h
--- a/paddle/operators/math/unpooling.cc
+++ b/paddle/operators/math/unpooling.cc
--- a/paddle/operators/math/unpooling.cu
+++ b/paddle/operators/math/unpooling.cu
--- a/paddle/operators/math/unpooling.h
+++ b/paddle/operators/math/unpooling.h
--- a/paddle/operators/math/vol2col.cc
+++ b/paddle/operators/math/vol2col.cc
--- a/paddle/operators/math/vol2col.cu
+++ b/paddle/operators/math/vol2col.cu
--- a/paddle/operators/math/vol2col.h
+++ b/paddle/operators/math/vol2col.h
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
--- a/paddle/operators/matmul_op.cu.cc
+++ b/paddle/operators/matmul_op.cu.cc
--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
--- a/paddle/operators/max_sequence_len_op.cc
+++ b/paddle/operators/max_sequence_len_op.cc
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
--- a/paddle/operators/maxout_op.cu.cc
+++ b/paddle/operators/maxout_op.cu.cc
--- a/paddle/operators/maxout_op.h
+++ b/paddle/operators/maxout_op.h
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
--- a/paddle/operators/minus_op.cu
+++ b/paddle/operators/minus_op.cu
--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
--- a/paddle/operators/modified_huber_loss_op.cu
+++ b/paddle/operators/modified_huber_loss_op.cu
--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
--- a/paddle/operators/momentum_op.cu
+++ b/paddle/operators/momentum_op.cu
--- a/paddle/operators/momentum_op.h
+++ b/paddle/operators/momentum_op.h
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
--- a/paddle/operators/mul_op.cu.cc
+++ b/paddle/operators/mul_op.cu.cc
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
--- a/paddle/operators/multiplex_op.h
+++ b/paddle/operators/multiplex_op.h
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
--- a/paddle/operators/nccl_op.cu.cc
+++ b/paddle/operators/nccl_op.cu.cc
--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
--- a/paddle/operators/nce_op.cc
+++ b/paddle/operators/nce_op.cc
--- a/paddle/operators/nce_op.h
+++ b/paddle/operators/nce_op.h
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
--- a/paddle/operators/batch_norm_op.md
+++ b/paddle/operators/batch_norm_op.md
--- a/paddle/operators/name_convention.md
+++ b/paddle/operators/name_convention.md
--- a/paddle/operators/net_op_design.md
+++ b/paddle/operators/net_op_design.md
--- a/paddle/operators/op_documentation/op_markdown_format.md
+++ b/paddle/operators/op_documentation/op_markdown_format.md
--- a/paddle/operators/rnn_design.md
+++ b/paddle/operators/rnn_design.md
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
--- a/paddle/operators/pad_op.cu
+++ b/paddle/operators/pad_op.cu
--- a/paddle/operators/pad_op.h
+++ b/paddle/operators/pad_op.h
--- a/paddle/operators/pool_cudnn_op.cc
+++ b/paddle/operators/pool_cudnn_op.cc
--- a/paddle/operators/pool_cudnn_op.cu.cc
+++ b/paddle/operators/pool_cudnn_op.cu.cc
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
--- a/paddle/operators/pool_op.cu.cc
+++ b/paddle/operators/pool_op.cu.cc
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
--- a/paddle/operators/pool_with_index_op.cu.cc
+++ b/paddle/operators/pool_with_index_op.cu.cc
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
--- a/paddle/operators/positive_negative_pair_op.cc
+++ b/paddle/operators/positive_negative_pair_op.cc
--- a/paddle/operators/positive_negative_pair_op.h
+++ b/paddle/operators/positive_negative_pair_op.h
--- a/paddle/operators/precision_recall_op.cc
+++ b/paddle/operators/precision_recall_op.cc
--- a/paddle/operators/precision_recall_op.h
+++ b/paddle/operators/precision_recall_op.h
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
--- a/paddle/operators/prelu_op.cu
+++ b/paddle/operators/prelu_op.cu
--- a/paddle/operators/prelu_op.h
+++ b/paddle/operators/prelu_op.h
--- a/paddle/operators/proximal_adagrad_op.cc
+++ b/paddle/operators/proximal_adagrad_op.cc
--- a/paddle/operators/proximal_adagrad_op.cu
+++ b/paddle/operators/proximal_adagrad_op.cu
--- a/paddle/operators/proximal_adagrad_op.h
+++ b/paddle/operators/proximal_adagrad_op.h
--- a/paddle/operators/proximal_gd_op.cc
+++ b/paddle/operators/proximal_gd_op.cc
--- a/paddle/operators/proximal_gd_op.cu
+++ b/paddle/operators/proximal_gd_op.cu
--- a/paddle/operators/proximal_gd_op.h
+++ b/paddle/operators/proximal_gd_op.h
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
--- a/paddle/operators/rank_loss_op.cu
+++ b/paddle/operators/rank_loss_op.cu
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
--- a/paddle/operators/reduce_op.cu
+++ b/paddle/operators/reduce_op.cu
--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
--- a/paddle/operators/reshape_op.cu
+++ b/paddle/operators/reshape_op.cu
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/operators/rmsprop_op.cc
--- a/paddle/operators/rmsprop_op.cu
+++ b/paddle/operators/rmsprop_op.cu
--- a/paddle/operators/rmsprop_op.h
+++ b/paddle/operators/rmsprop_op.h
--- a/paddle/operators/rnn_memory_helper_op.cc
+++ b/paddle/operators/rnn_memory_helper_op.cc
--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
--- a/paddle/operators/roi_pool_op.cu
+++ b/paddle/operators/roi_pool_op.cu
--- a/paddle/operators/roi_pool_op.h
+++ b/paddle/operators/roi_pool_op.h
--- a/paddle/operators/row_conv_op.cc
+++ b/paddle/operators/row_conv_op.cc
--- a/paddle/operators/row_conv_op.cu
+++ b/paddle/operators/row_conv_op.cu
--- a/paddle/operators/row_conv_op.h
+++ b/paddle/operators/row_conv_op.h
--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
--- a/paddle/operators/scatter_op.cu
+++ b/paddle/operators/scatter_op.cu
--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
--- a/paddle/operators/send_recv_op_benchmark.cc
+++ b/paddle/operators/send_recv_op_benchmark.cc
--- a/paddle/operators/send_recv_op_test.cc
+++ b/paddle/operators/send_recv_op_test.cc
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
--- a/paddle/operators/sequence_concat_op.cu.cc
+++ b/paddle/operators/sequence_concat_op.cu.cc
--- a/paddle/operators/sequence_concat_op.h
+++ b/paddle/operators/sequence_concat_op.h
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
--- a/paddle/operators/sequence_conv_op.cu.cc
+++ b/paddle/operators/sequence_conv_op.cu.cc
--- a/paddle/operators/sequence_conv_op.h
+++ b/paddle/operators/sequence_conv_op.h
--- a/paddle/operators/seq_expand_op.cc
+++ b/paddle/operators/seq_expand_op.cc
--- a/paddle/operators/sequence_expand_op.cu
+++ b/paddle/operators/sequence_expand_op.cu
--- a/paddle/operators/seq_expand_op.h
+++ b/paddle/operators/seq_expand_op.h
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
--- a/paddle/operators/sequence_pool_op.cu
+++ b/paddle/operators/sequence_pool_op.cu
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
--- a/paddle/operators/sequence_slice_op.cu
+++ b/paddle/operators/sequence_slice_op.cu
--- a/paddle/operators/sequence_slice_op.h
+++ b/paddle/operators/sequence_slice_op.h
--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
--- a/paddle/operators/sequence_softmax_op.cu.cc
+++ b/paddle/operators/sequence_softmax_op.cu.cc
--- a/paddle/operators/sequence_softmax_op.h
+++ b/paddle/operators/sequence_softmax_op.h
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
--- a/paddle/operators/sign_op.cc
+++ b/paddle/operators/sign_op.cc
--- a/paddle/operators/sign_op.cu
+++ b/paddle/operators/sign_op.cu
--- a/paddle/operators/sign_op.h
+++ b/paddle/operators/sign_op.h
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
--- a/paddle/operators/smooth_l1_loss_op.cu
+++ b/paddle/operators/smooth_l1_loss_op.cu
--- a/paddle/operators/smooth_l1_loss_op.h
+++ b/paddle/operators/smooth_l1_loss_op.h
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
--- a/paddle/operators/softmax_op.cu.cc
+++ b/paddle/operators/softmax_op.cu.cc
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/operators/split_lod_tensor_op.cc
--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
--- a/paddle/operators/split_op.cu.cc
+++ b/paddle/operators/split_op.cu.cc
--- a/paddle/operators/split_op.h
+++ b/paddle/operators/split_op.h
--- a/paddle/operators/spp_op.cc
+++ b/paddle/operators/spp_op.cc
--- a/paddle/operators/spp_op.cu.cc
+++ b/paddle/operators/spp_op.cu.cc
--- a/paddle/operators/spp_op.h
+++ b/paddle/operators/spp_op.h
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
--- a/paddle/operators/squared_l2_distance_op.cu
+++ b/paddle/operators/squared_l2_distance_op.cu
--- a/paddle/operators/squared_l2_distance_op.h
+++ b/paddle/operators/squared_l2_distance_op.h
--- a/paddle/operators/squared_l2_norm_op.cc
+++ b/paddle/operators/squared_l2_norm_op.cc
--- a/paddle/operators/squared_l2_norm_op.cu
+++ b/paddle/operators/squared_l2_norm_op.cu
--- a/paddle/operators/squared_l2_norm_op.h
+++ b/paddle/operators/squared_l2_norm_op.h
--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/operators/strided_memcpy_test.cc
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
--- a/paddle/operators/transpose_op.cu.cc
+++ b/paddle/operators/transpose_op.cu.cc
--- a/paddle/operators/transpose_op.h
+++ b/paddle/operators/transpose_op.h
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
--- a/paddle/operators/unpool_op.cc
+++ b/paddle/operators/unpool_op.cc
--- a/paddle/operators/unpool_op.cu.cc
+++ b/paddle/operators/unpool_op.cu.cc
--- a/paddle/operators/unpool_op.h
+++ b/paddle/operators/unpool_op.h
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
--- a/paddle/platform/dynload/cudnn.cc
+++ b/paddle/platform/dynload/cudnn.cc
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
--- a/paddle/platform/dynload/dynamic_loader.cc
+++ b/paddle/platform/dynload/dynamic_loader.cc
--- a/paddle/platform/dynload/nccl.cc
+++ b/paddle/platform/dynload/nccl.cc
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
--- a/paddle/platform/place_test.cc
+++ b/paddle/platform/place_test.cc
--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
--- a/paddle/pybind/const_value.cc
+++ b/paddle/pybind/const_value.cc
--- a/paddle/pybind/const_value.h
+++ b/paddle/pybind/const_value.h
--- a/paddle/pybind/print_operators_doc.cc
+++ b/paddle/pybind/print_operators_doc.cc
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
--- a/paddle/scripts/check_env.sh
+++ b/paddle/scripts/check_env.sh
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
--- a/paddle/scripts/tools/build_docs/build_docs.sh
+++ b/paddle/scripts/tools/build_docs/build_docs.sh
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
--- a/python/.gitignore
+++ b/python/.gitignore
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
--- a/python/paddle/trainer_config_helpers/poolings.py
+++ b/python/paddle/trainer_config_helpers/poolings.py
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
--- a/python/paddle/v2/fluid/clip.py
+++ b/python/paddle/v2/fluid/clip.py
--- a/python/paddle/v2/fluid/data_feeder.py
+++ b/python/paddle/v2/fluid/data_feeder.py
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
--- a/python/paddle/v2/fluid/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
--- a/python/paddle/v2/fluid/layers/__init__.py
+++ b/python/paddle/v2/fluid/layers/__init__.py
--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
--- a/python/paddle/v2/fluid/layers/io.py
+++ b/python/paddle/v2/fluid/layers/io.py
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
--- a/python/paddle/v2/fluid/registry.py
+++ b/python/paddle/v2/fluid/registry.py
--- a/python/paddle/v2/fluid/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
--- a/python/paddle/v2/fluid/tests/.gitignore
+++ b/python/paddle/v2/fluid/tests/.gitignore
--- a/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py
+++ b/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
--- a/python/paddle/v2/fluid/tests/test_activation_op.py
+++ b/python/paddle/v2/fluid/tests/test_activation_op.py
--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
--- a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
+++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
--- a/python/paddle/v2/fluid/tests/test_const_value.py
+++ b/python/paddle/v2/fluid/tests/test_const_value.py
--- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
--- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
--- a/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
+++ b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
--- a/python/paddle/v2/fluid/tests/test_data_feeder.py
+++ b/python/paddle/v2/fluid/tests/test_data_feeder.py
--- a/python/paddle/v2/fluid/tests/test_dropout_op.py
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
--- a/python/paddle/v2/fluid/tests/test_dyn_rnn.py
+++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py
--- a/python/paddle/v2/fluid/tests/test_fill_op.py
+++ b/python/paddle/v2/fluid/tests/test_fill_op.py
--- a/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
+++ b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
--- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
--- a/python/paddle/v2/fluid/tests/test_hinge_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_hinge_loss_op.py
--- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
--- a/python/paddle/v2/fluid/tests/test_inference_model_io.py
+++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py
--- a/python/paddle/v2/fluid/tests/test_initializer.py
+++ b/python/paddle/v2/fluid/tests/test_initializer.py
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
--- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
--- a/python/paddle/v2/fluid/tests/test_lrn_op.py
+++ b/python/paddle/v2/fluid/tests/test_lrn_op.py
--- a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
+++ b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
--- a/python/paddle/v2/fluid/tests/test_operator.py
+++ b/python/paddle/v2/fluid/tests/test_operator.py
--- a/python/paddle/v2/fluid/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
--- a/python/paddle/v2/fluid/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
--- a/python/paddle/v2/fluid/tests/test_reduce_op.py
+++ b/python/paddle/v2/fluid/tests/test_reduce_op.py
--- a/python/paddle/v2/fluid/tests/test_registry.py
+++ b/python/paddle/v2/fluid/tests/test_registry.py
--- a/python/paddle/v2/fluid/tests/test_reshape_op.py
+++ b/python/paddle/v2/fluid/tests/test_reshape_op.py
--- a/python/paddle/v2/fluid/tests/test_row_conv_op.py
+++ b/python/paddle/v2/fluid/tests/test_row_conv_op.py
--- a/python/paddle/v2/fluid/tests/test_seq_expand.py
+++ b/python/paddle/v2/fluid/tests/test_seq_expand.py
--- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
--- a/python/paddle/v2/fluid/tests/test_spp_op.py
+++ b/python/paddle/v2/fluid/tests/test_spp_op.py
--- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
--- a/python/setup.py.in
+++ b/python/setup.py.in