remove conflict

dcf5e948 · chengduoZH · 479ee271 · 863661a3 · dcf5e948 · dcf5e948
319 changed file
--- a/README.md
+++ b/README.md
@@ -61,32 +61,32 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 ## Installation

 It is recommended to check out the
-[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).
+[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html).

 ## Documentation

-We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
-[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation.
+We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
+[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.

- [Deep Learning 101](http://book.paddlepaddle.org/index.html)
+- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)

  You might want to start from this online interactive book that can run in a Jupyter Notebook.

- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html)

  You can run distributed training jobs on MPI clusters.

- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html)

   You can also run distributed training jobs on Kubernetes clusters.

- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html)
+- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html)

   Our new API enables much shorter programs.

- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html)

   We appreciate your contributions!


--- a/benchmark/paddle/image/run_mkldnn_infer.sh
+++ b/benchmark/paddle/image/run_mkldnn_infer.sh
--- a/benchmark/paddle/image/run_mkldnn_train.sh
+++ b/benchmark/paddle/image/run_mkldnn_train.sh
@@ -28,6 +28,10 @@ function train() {
    --test_period=100 \
    --config_args=$args \
    2>&1 | tee ${log} 
+
+  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }

 if [ ! -f "train.list" ]; then

--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
+set -e
+
+function clock_to_seconds() {
+  hours=`echo $1 | awk -F ':' '{print $1}'`
+  mins=`echo $1 | awk -F ':' '{print $2}'`
+  secs=`echo $1 | awk -F ':' '{print $3}'`
+  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
+}
+
+function infer() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  thread=`nproc`
+  if [ $thread -gt $bs ]; then
+    thread=$bs
+  fi
+  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+
+  models_in="models/${topology}-${layer_num}/pass-00000/"
+  if [ ! -d $models_in ]; then
+    echo "./run_mkl_infer.sh to save the model first"
+    exit 0
+  fi
+  log_period=$((256 / bs))
+  paddle train --job=test \
+    --config="${topology}.py" \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=$log_period \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+    --init_model_path=$models_in \
+    2>&1 | tee ${log}
+
+  # calculate the last 5 logs period time of 1280 samples,
+  # the time before are burning time.
+  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  start_sec=`clock_to_seconds $start`
+  end_sec=`clock_to_seconds $end`
+  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+  echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# inference benchmark
+for batchsize in 1 2 4 8 16; do
+  infer googlenet v1 $batchsize
+  infer resnet 50 $batchsize
+  infer vgg 19 $batchsize
+done
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
+set -e
+
+function train() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  thread=`nproc`
+  # each trainer_count use only 1 core to avoid conflict
+  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+  args="batch_size=${bs},layer_num=${layer_num}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config=$config \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=10 \
+    --test_period=100 \
+    --config_args=$args \
+    2>&1 | tee ${log} 
+
+  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# training benchmark
+for batchsize in 64 128 256; do
+  train vgg 19 $batchsize
+  train resnet 50 $batchsize
+  train googlenet v1 $batchsize
+done
--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
@@ -7,3 +7,4 @@ API
    模型配置 <v2/model_configs.rst>
    数据访问 <v2/data.rst>
    训练与应用 <v2/run_logic.rst>
+    v2/fluid.rst
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -188,12 +188,6 @@ beam_search_decode
    :noindex:


-lstm
---------
-..  autofunction:: paddle.v2.fluid.layers.lstm
-    :noindex:
-
-
 lod_rank_table
 ---------
 ..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
@@ -300,3 +294,27 @@ conv2d_transpose
 ..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
    :noindex:

+
+sequence_expand
+---------
+..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+    :noindex:
+
+
+lstm_unit
+---------
+..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+    :noindex:
+
+
+sequence_softmax
+---------
+..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
+    :noindex:
+
+
+reduce_sum
+---------
+..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+    :noindex:
+
--- a/doc/design/executor.md
+++ b/doc/design/executor.md
 # Executor Design Doc

 ## Motivation
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
+[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).

-We use executor to do the runtime evaluation of a `ProgramDesc`.
+The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, executor explicitly executes the stored precompiled code.

 ## Overview

-An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instance, which is persistent throughout different runs.
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.

-### What does executor do?
+## Executor

-It evaluates all the operators in the `block_id`th block of a `ProgramDesc`.
+The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
+It is very similar to how a push stack frame works when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not however, have the stack frame pop process.

-### What does executor NOT do?
+### The interface
+```c++
+  Executor(places);
+```
+A executor does not own any computing resources, a user can only construct an executor using the specified places.

-It does not do runtime optimization, meaning intelligently parse the dependency of each op a choose which one to be run and in which order they should be run.
+### Running an Executor

-It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices.
-
-## Implementation
-
-`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then run all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)
+```
+  void Run(ProgramDesc, Scope, block_id, create_local_scope);
+```
+An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished.
--- a/doc/design/images/multigpu_allreduce.graffle
+++ b/doc/design/images/multigpu_allreduce.graffle
--- a/doc/design/images/multigpu_allreduce.png
+++ b/doc/design/images/multigpu_allreduce.png
--- a/doc/design/images/multigpu_before_convert.graffle
+++ b/doc/design/images/multigpu_before_convert.graffle
--- a/doc/design/images/multigpu_before_convert.png
+++ b/doc/design/images/multigpu_before_convert.png
--- a/doc/design/kernel_hint_design.md
+++ b/doc/design/kernel_hint_design.md
+## Problem
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
+
+In the current design, we use KernelType to describe one kernel.
+
+```cpp
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+ `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it.
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use.
+
+So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel.
+
+The problem is, how should we define and send the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choice
+1. Do nothing, let the user add the information they want to operator‘s attribute and get them inside `GetExpectedKernelType`, this can work properly. But there is a little problem that users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose CUDNN kernel.
+
+2. Pre-define all the needed option and use a single attr key such as `kernel_hint` for the user, this is not so flexible if the user wants to define some more kind of hint.
+
+### Final choice
+To provide enough flexibility while avoiding confusion definition, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, `use_mkldnn` for a user to choose.
+
+In C++
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+  if (Attr<bool>(kForceCPU)) {
+    return KernelType(CPUPlace, ...)
+  } else {
+    ...
+  }
+}
+```
+
+In Python code
+
+```python
+FORCE_CPU = core.kForceCPU()
+
+def xx_layer(..., force_cpu=false):
+  layer_helper = LayerHelper(...)
+  layer_helper.append_op(
+    type="xx",
+    attr={FORCE_CPU: force_cpu})
+```
--- a/doc/design/mkldnn/image/engine.png
+++ b/doc/design/mkldnn/image/engine.png
--- a/doc/design/mkldnn/image/gradients.png
+++ b/doc/design/mkldnn/image/gradients.png
--- a/doc/design/mkldnn/image/layers.png
+++ b/doc/design/mkldnn/image/layers.png
--- a/doc/design/mkldnn/image/matrix.png
+++ b/doc/design/mkldnn/image/matrix.png
--- a/doc/design/mkldnn/image/overview.png
+++ b/doc/design/mkldnn/image/overview.png
--- a/doc/design/mkl/mkl_packed.md
+++ b/doc/design/mkl/mkl_packed.md
+# Intel® MKL Packed on PaddlePaddle: Design Doc
+
+
+## Contents
+
+- [Overview](#overview)
+- [Key Points](#key-points) 
+   - [Background](#background)
+   - [Solution](#solution)
+- [Actions](#actions)
+    - [CMake](#cmake)
+	- [Layers](#layers)
+	- [Unit Tests](#unit-tests)
+	- [Python API](#python-api)
+	- [Benchmarking](#benchmarking)
+
+
+## Overview
+我们计划将 Intel® MKL 中引入的 GEMM Packed APIs\[[1](#references)\] 集成到 PaddlePaddle 中，充分发挥英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
+现阶段的优化主要针对 Recurrent Neural Network（以下简称RNN）相关层（包括`RecurrentLayer`, `GatedRecurrentLayer`和`LstmLayer`）， 以及 PaddlePaddle V1 API。
+
+## Key Points
+
+### Background
+目前PaddlePaddle采用了 Intel® MKL库的[cblas_?gemm](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm)函数，这个函数本身会在计算前将原数据转换为更适合英特尔平台的内部格式。
+
+1. 转换耗时 \
+这一数据格式的转换操作（Packing），在问题本身的计算量比较小的时候，显得相对来说较为耗时。例如在DeepSpeech2 \[[2](#references)\] 的Vanilla RNN部分中，矩阵大小是`batch_size * 2048`。
+2. 转换冗余 \
+由于在现有的某些情况下（例如RNN），多次调用 cblas_?gemm 会使用相同的原数据，因此，每次调用时对原数据的重复Packing便成为了冗余。
+
+为了最大程度减少多次调用 cblas_?gemm 在Packing上的耗时，Intel® MKL 引入了以下四个API:
+   * [cblas_?gemm_alloc](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-alloc)
+   * [cblas_?gemm_pack](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-pack)
+   * [cblas_?gemm_compute](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-compute)
+   * [cblas_?gemm_free](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-free)
+
+通过使用这些API，我们可以先完成对原数据的Packing操作，再把已转换为Packed格式的数据传递给那些复用同一数据的gemm_compute函数，从而避免了Packing冗余。
+
+### Solution
+在RNN的情况下，同一次前向、后向（forward/backward）过程中所有时间步（time step）共享同一个权重（weight）。当只做推断（inference）时，各次前向之间也都使用了相同的权重，没有必要在每次前向中每个时间步的计算时对权重进行重复的Packing操作。
+
+我们通过使用新引入的GEMM Packed APIs，在层初始化的时候，先完成对权重的Packing操作，然后在前向，后向时复用已经转换过的权重，并在每次权重更新后，对新的权重进行转换用于下次迭代。
+
+* 优化前，对于序列长度（sequence length）为`T`的网络模型（model）, `N`次迭代执行的转换次数为：
+  - `inference`： `N * T`  
+  - `training`： `2 * N * T`
+* 优化后，对于同样设置的网络模型，其转换次数减少至：
+  - `inference`： `1`    
+  - `training`： `2 * N`
+
+## Actions
+
+添加的相关文件和目录结构如下：
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+└── paddle/
+    ├── ...
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   ├── MKLPackedRecurrentLayer.*
+        |   ├── MKLPackedGatedRecurrentLayer.*
+        |   ├── MKLPackedLstmLayer.*
+        |   └── MKLPackedGemm.h
+        └── tests/
+            ├── ...
+            └── test_MKLPacked.cpp
+```
+
+### CMake
+在对应的`CMakeLists.txt`中根据`WITH_MKL`是否打开，来决定是否开启MKL Packed相关功能。
+
+### Layers
+所有的`MKLPacked*Layer`都继承于PaddlePaddle的基类`Layer`, 并添加头文件 `MKLPackedGemm.h`，该文件对相关GEMM Packed APIs做了封装。
+
+### Unit Tests
+我们会添加`test_MKLPacked.cpp`用于MKL Packed优化后layer的测试。
+对于每一个新加的RNN layer，我们会对比如下2个方面：
+1. 对比优化后layer自身，sequence mode（`rnn_use_batch=false`）与batch mode(`rnn_use_batch=true`)的结果。
+2. 对比优化后layer与相对应的PaddlePaddle原有layer, 在batch mode下的结果。
+
+### Python API
+计划在`paddle/utils.Flags`中添加`use_mkl_packed`的flag，用于选择是否使用相关功能，并且当编译时`WITH_MKL=ON`的情况下，默认设置为`true`。
+
+同时，在`python/paddle/trainer/config_parser.py`中对应的layer处，添加`use_mkl_packed`这个选择，方便用户在Python端选择是否启用这个功能。
+
+具体实现方式比如：
+
+```python
+use_mkl_packed = bool(int(g_command_config_args.get("use_mkl_packed", 0)))
+if use_mkl_packed:
+    self.layer_type = mkl_packed_*
+```
+
+所有相关的`layer_type`会以*mkl_packed_*开头，这些会在`MKLPacked*Layer`注册layer的时候保证，以示区分。 
+
+
+### Benchmarking
+会添加相应的脚本用于测试和对比在使用MKL Packed recurrent layers 前后的网络性能。
+
+## References 
+1. [Introducing the new Packed APIs for GEMM](https://software.intel.com/en-us/articles/introducing-the-new-packed-apis-for-gemm)
+2. [DeepSpeech2 on PaddlePaddle](https://github.com/PaddlePaddle/DeepSpeech#deepspeech2-on-paddlepaddle)
+
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -208,4 +208,3 @@ if use_mkldnn
 但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
 4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`，所以不存在这个问题)。
 所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
-
--- a/doc/design/paddle_nccl.md
+++ b/doc/design/paddle_nccl.md
+# Design Doc: NCCL support in Paddle Fluid
+
+## Abstract
+
+This Design Doc refers to the NCCL feature in  paddle.  We propose an approach to support NCCL library both on a single machine and multiple machines. We wrapper the NCCL primitives `Broadcast`, `Allreduce`, `Reduce` as operators to utilize Multi-GPU powers in one script.
+
+
+## Motivation
+
+[NCCL](https://developer.nvidia.com/nccl) is a NVIDIA library support Multi-GPU communicating and optimized for NVIDIA GPUs, it provides routines such as all-gather, all-reduce, broadcast, reduce, reduce-scatter, that can achieve high bandwidth over PCIe and NVLink high-speed interconnect. With NCCL library, we can easily accelerate the training in parallel. 
+
+- Pros
+1. easily plug-in with [NCCL2](https://developer.nvidia.com/nccl) library.
+1. high performance in NVIDIA GPUs.
+1. MPI like primitives, which have low learning cost for users.
+
+- Cons
+1. Only design for NVIDIA GPUs, not a general multi-device solution.
+1. Although NCCL1 is opensourced under BSD license, but NCCL2 is not opensourced anymore.
+
+At the beginning of training, the framework needs to distribute the same parameters to every GPU, and merge the gradients at any time user interests.
+
+As a result, during training, we need the operations of peer to peer copy between different GPUs, aggregating gradients/parameters from GPUs, and broadcasting parameters to GPUs. Every GPU only need to run the operator with correct place information.
+
+Besides, it needs interfaces to synchronize model update with each different GPU Cards. 
+
+## Implementation
+
+As mentioned above, we wrap the NCCL routines as several kinds of operators. Need to note that NCCL need to create Communicator between gpu at the beginning, so there is a NCCLInit operator created.
+
+### Transpiler
+
+To be compatible with [parameter server design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md), the transpiler compiles the user defined operation graph into sub-graphs to be executed on different devices.
+
+1. The user-defined model will be a single device program
+
+2. Broadcast/Reduce operators between GPUs will be inserted into the program, even for the multi-node, may insert the `Send`, `Recv` operator.
+
+   *Broadcast, AllReduce in a single machine. And Broadcast, AllReduce, [Send, Recv](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md#graph-converter) in multiple machines*
+
+   <img src="images/multigpu_before_convert.png" width="300"/>
+
+After compiling, the graph as shows
+
+<img src="images/multigpu_allreduce.png" width="1000"/>
+
+Operators are added to the sub-graphs. Every GPU assigned a role of `rank0`, `rank1` etc. 
+
+- **Broadcast**. Broadcast operator distribute initialized parameter to all the GPUs from the GPU who owns it. e.g. from`rank0` GPU.
+- **AllReduce**. AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce implemented in the Ring-Based  communicating method, avoid of the bottle neck in a single GPU.
+
+Need to notice that AllReduce operator force GPUs synchronized at that point. The whole training process in asynchronous or synchronous mode depends on the AllReduce point in the graph.
+
+As it shown in the picture, when each GPU compute the gradient of `W`, followed with a `AllReduce` operator, accumulate the `dW` to full batch of data, then run the optimize process individually and apply the gradient to its `W`.
+
+- **AllReduce**
+  Need to note that our AllReduce operator is a ring-base AllReduce implementation. If we use the NCCL2 AllReduce primitive, every GPU optimized full batch of data, wasted (n-1) GPU compute resources. In addition, NCCL2 built-in AllReduce will only utilize the communicating resource during synchronization, then update the gradient will be a subsequent phase. In fact, we can amortize the update gradient time cost into the communicating phase. The process is
+1. Every parameter has its root card. That card will responsible for aggregating the gradients from GPUs.
+2. The whole model's parameter will be hashed to different root card, ensure the load balance between GPUs.
+3. Logically neighberhood card will start send parameter to the next one. After one round, the parameter main card will aggregate the full gradients.
+4. Then the root card will optimize the parameter.
+5. This parameter card will send its optimized result to its neighberhood, then the neighberhood will send parameter to its next one.
+6. Finish the sychronization round.
+
+The total time cost will be 2 * (n-1) * per-parameter-send-time, we reach the goal of amortize the upgrade time into communicating phase.
--- a/doc/design/refactor/multi_cpu.md
+++ b/doc/design/refactor/multi_cpu.md
+# Design Doc: Execute the Program with Multi CPU
+
+## Abstract
+
+This Design Doc propose an approach to make the user-defined Op graph
+running with multi-CPU, we will use an auto transpiler to convert the user-defined
+Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph.
+
+## Transpiler
+
+<img src="src/multi-threads/single-thread@3x.png" width="300">
+
+After converted:
+
+<img src="src/multi-threads/multi-threads@3x.png" width="1000">
+
+## Implement
+
+- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph
+  which would be executed with multi-threads.
+- `BlockingCounter` will `Init/Decrement` an atomic counter, and Blocking `Wait`
+  for the atomic counter become `0`:
+  ```cpp
+  BlockingCounter bc(thread_count);
+  for (int i = 0; i < thread_count; ++i) {
+    thread_pool->Start([&bc] {bc.DecrementCount(); })
+  }
+  bc.Wait();
+  ```
+- `ParallelDo` Operator
+  - Initialize a thread pool which is a Singleton.
+  - Use a block id as the input, and create run the specify Block on independent scope
+    with multi-threads.
+  - Initialize a `BlockingCounter` instance and wait until all threads are done.
+- `Split` Operator will split the Input Tensor into a TensorArray.
+- `Merge` merge all the gradients which calculated in different threads
+  with `mean/sum/max/min...` method, and then run the Optimizer Op to optimize `W`.
+
+## TODO
+
+- Improve the optimizer stage with multi-threads, since we could
+  assign the parameters to the different threads and execute
+  optimizer with multi-threads.
--- a/doc/design/refactor/src/multi-threads.graffle
+++ b/doc/design/refactor/src/multi-threads.graffle
--- a/doc/design/refactor/src/multi-threads/multi-threads@3x.png
+++ b/doc/design/refactor/src/multi-threads/multi-threads@3x.png
--- a/doc/design/refactor/src/multi-threads/single-thread@3x.png
+++ b/doc/design/refactor/src/multi-threads/single-thread@3x.png
--- a/doc/design/switch_kernel.md
+++ b/doc/design/switch_kernel.md
+## Background
+Every operator has many kernels because there are multiple data types, places, data layout that Fluid supports. We use the `KernelType` to describe kernel types that operators can hold. 
+
+The `KernelType` is as follows.
+
+```
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+
+The `place_` is a descriptor of the device and the computational library, e.g., `MKLDNNPlace`, `CUDAPlace`.
+
+The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, it will be a major `data_type`. For example, the `cross_entropy` takes `int64` as it label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float`/`double`.
+
+The `layout` is useful for some computational library. One example is that MKLDNN uses many kinds of layout, such as `nChw8c`. Each kind of layout will invoke the different kernel.
+
+## Problem
+
+We register a kernel for every operator and every kernel type ideally. However, it is impracticable for the following situations.
+
+1. Some operators, like CRF, are complicated and inefficient to be implemented on GPU. The CRF operator will only have a CPU kernel.
+2. Some operators will take too many memory. It is better to force them into CPU. However, the rest of operators in this neural network will be performed on GPU, i.e., model parallel problem.
+3. Some layout and place are particular. One example is that MKLDNN uses `nChw8` and there is no other library uses `nChw8c`.
+
+Problems under these situations are similar. We can formalise this problem as follow.
+
+We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, which the $kt_{?} \notin KT$. How to cast the input of this operator from $kt_{?}$ to any of kernel type in $KT$.
+
+## Solution
+
+It is clearly that transforming inputs of an operator toadapt another kernel type is not related to the particular operator. So we should register these transformation methods as global methods.
+
+We can infer a kernel type from the inputs of an operators. We let this kernel type as `actual kernel type`, which means this kernel type is the actually kernel type that operator should be performed.
+
+We can get a kernel type by 1) The configuration of operator description. (Users may want to force use `MKL` for `conv` operator). 2) The place of the current executor. (Executor is running on GPU). This kernel type is what we expect the operator will be performed on. We let this kernel type as `expect kernel type`.
+
+We transform the input data from `actual` to `expect` if the expect kernel type is not as same as actual kernel type.
+
+The algorithm is described as follow
+
+```cpp
+using DataTransformationFN = std::function<void(const Tensor& in, Tensor* out)>;
+using KernelTypePair = std::pair<KernelType, KernelType>;
+
+map<KernelTypePair, DataTransformationFN> g_data_transformation_;
+
+void OpWithKernel::Run() {
+  vec<Tensor> inputs = ...
+  auto actual_kernel_type = GetActualKernelType(inputs);
+  
+  // The expected kernel type is related to actual kernel type.
+  // For the most operators, the expected kernel type is as same as
+  // actual kernel type.
+  //
+  // So we pass `actual_kernel_type` as a parameter of 
+  // GetExpectedKernelType
+  auto expect_kernel_type = GetExpectedKernelType(actual_kernel_type);
+  
+  auto trans = g_data_transformation_[{actual_kernel_type, expect_kernel_type}];
+  
+  kernel.run(trans(inputs));
+}
+```
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -128,7 +128,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note

 AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
 是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
-`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。
+`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。

 以下指令能检查Linux电脑是否支持AVX：


--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -137,7 +137,7 @@ GPU driver installed before move on.
 AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations.
 The latest PaddlePaddle Docker image turns AVX on by default, so, if your
 computer doesn't support AVX, you'll probably need to
-`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`.
+`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`.

 The following command will tell you whether your computer supports AVX.


--- a/doc/getstarted/concepts/src/infer.py
+++ b/doc/getstarted/concepts/src/infer.py
+import paddle.v2 as paddle
+import numpy as np
+
+paddle.init(use_gpu=False)
+x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
+y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+# loading the model which generated by training
+with open('params_pass_90.tar', 'r') as f:
+    parameters = paddle.parameters.Parameters.from_tar(f)
+
+# Input multiple sets of data，Output the infer result in a array.
+i = [[[1, 2]], [[3, 4]], [[5, 6]]]
+print paddle.infer(output_layer=y_predict, parameters=parameters, input=i)
+# Will print:
+# [[ -3.24491572]
+#  [ -6.94668722]
+#  [-10.64845848]]
--- a/doc/getstarted/concepts/src/train.py
+++ b/doc/getstarted/concepts/src/train.py
@@ -26,6 +26,11 @@ def event_handler(event):
        if event.batch_id % 1 == 0:
            print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id,
                                                  event.cost)
+    # product model every 10 pass
+    if isinstance(event, paddle.event.EndPass):
+        if event.pass_id % 10 == 0:
+            with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
+                trainer.save_parameter_to_tar(f)


 # define training dataset reader

--- a/doc/getstarted/concepts/use_concepts_cn.rst
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
@@ -147,4 +147,9 @@ PaddlePaddle支持不同类型的输入数据，主要包括四种类型，和
 ..  literalinclude:: src/train.py
    :linenos:

+使用以上训练好的模型进行预测，取其中一个模型params_pass_90.tar，输入需要预测的向量组，然后打印输出：
+
+..  literalinclude:: src/infer.py
+    :linenos:
+
 有关线性回归的实际应用，可以参考PaddlePaddle book的 `第一章节 <http://book.paddlepaddle.org/index.html>`_。
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -53,7 +53,7 @@ Kernel实现       | CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU
 ```cpp
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
@@ -82,7 +82,7 @@ The equation is: Out = X * Y
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of scale operator.").NotInGradient();
    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();

--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -50,7 +50,7 @@ First, define `ProtoMaker` to describe the Operator's input, output, and additio
 ```cpp
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
@@ -79,7 +79,7 @@ An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/de
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of scale operator.").NotInGradient();
    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();

--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -9,9 +9,6 @@

  usage/cmd_parameter/index_cn.rst
  usage/cluster/cluster_train_cn.md
-  usage/k8s/k8s_basis_cn.md
-  usage/k8s/k8s_cn.md
-  usage/k8s/k8s_distributed_cn.md

 开发标准
 --------

--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -9,8 +9,6 @@ Usage

  usage/cmd_parameter/index_en.rst
  usage/cluster/cluster_train_en.md
-  usage/k8s/k8s_en.md
-  usage/k8s/k8s_aws_en.md

 Development
 ------------

--- a/doc/howto/read_source.md
+++ b/doc/howto/read_source.md
@@ -6,10 +6,10 @@ Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework

 Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators

-Optimizer: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer
-
 Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory

+Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform
+
 # Compile Time

 The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).

--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
-# PaddlePaddle分布式训练
-
-* [概述](#概述)
-* [环境准备](#环境准备)
-* [启动参数说明](#启动参数说明)
-  * [启动参数服务器](#启动参数服务器)
-  * [启动计算节点](#启动计算节点)
-  * [准备数据集](#准备数据集)
-  * [准备训练程序](#准备训练程序)
-* [使用分布式计算平台或工具](#使用分布式计算平台或工具)
-  * [使用Fabric启动集群作业](#使用fabric启动集群作业)
-     * [准备一个Linux集群](#准备一个linux集群)
-     * [启动集群作业](#启动集群作业)
-     * [终止集群作业](#终止集群作业)
-     * [检查集群训练结果](#检查集群训练结果)
-     * [检查模型输出](#检查模型输出)
-  * [在OpenMPI集群中提交训练作业](#在openmpi集群中提交训练作业)
-     * [准备OpenMPI集群](#准备OpenMPI集群)
-     * [启动集群作业](#启动集群作业-1)
-  * [在Kubernetes集群中提交训练作业](#在kubernetes集群中提交训练作业)
+# 分布式训练
+

 ## 概述
+
 本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：

 <img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
@@ -32,10 +15,11 @@

 在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。

+
 ## 环境准备

 1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被成为一个“节点”。
-1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install)的多种安装方式。我们推荐使用[Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)安装方式来快速安装PaddlePaddle。
+1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。

 安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
 ```bash
@@ -63,12 +47,12 @@ $ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradie
 $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
 ```

-| 参数  | 是否必选 | 默认值 | 说明 |
-| ------------- | ------------- | ------------- | ------------- |
-| port  | 必选 | 7164 | pserver监听的起始端口，根据ports_num决定<br>总端口个数，从起始端口监听多个端口用于通信  |
-| ports_num  | 必选 | 1 | 监听的端口个数  |
-| ports_num_for_sparse  | 必选 | 1 | 用于稀疏类型参数通信的端口个数  |
-| num_gradient_servers  | 必选 | 1 | 当前训练任务pserver总数 |
+参数说明
+
+- port：**必选，默认7164**，pserver监听的起始端口，根据ports_num决定总端口个数，从起始端口监听多个端口用于通信
+- ports_num：**必选，默认1**，监听的端口个数
+- ports_num_for_sparse：**必选，默认1**，用于稀疏类型参数通信的端口个数
+- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数

 ### 启动计算节点
 执行以下命令启动使用python编写的trainer程序（文件名为任意文件名，如train.py）
@@ -105,16 +89,16 @@ paddle.init(
        pservers="127.0.0.1")
 ```

-| 参数  | 是否必选 | 默认 | 说明 |
-| ------------- | ------------- | ------------- | ------------- |
-| use_gpu  | 可选 | False | 是否启用GPU训练 |
-| trainer_count  | 必选 | 1 | 当前训练任务trainer总个数 |
-| port  | 必选 | 7164 | 连接到pserver的端口  |
-| ports_num  | 必选 | 1 | 连接到pserver的端口个数  |
-| ports_num_for_sparse  | 必选 | 1 | 和pserver之间用于稀疏类型参数通信的端口个数  |
-| num_gradient_servers  | 必选 | 1 | 当前训练任务pserver总数 |
-| trainer_id  | 必选 | 0 | 每个trainer的唯一ID，从0开始的整数 |
-| pservers  | 必选 | 127.0.0.1 | 当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开 |
+参数说明
+
+- use_gpu： **可选，默认False**，是否启用GPU训练
+- trainer_count：**必选，默认1**，当前训练任务trainer总个数
+- port：**必选，默认7164**，连接到pserver的端口
+- ports_num：**必选，默认1**，连接到pserver的端口个数
+- ports_num_for_sparse：**必选，默认1**，和pserver之间用于稀疏类型参数通信的端口个数
+- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数
+- trainer_id：**必选，默认0**，每个trainer的唯一ID，从0开始的整数
+- pservers：**必选，默认127.0.0.1**，当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开


 ### 准备数据集
@@ -171,7 +155,7 @@ test.txt-00002

 - `my_lib.py`：会被`train.py`调用的一些用户定义的库函数，比如PIL库等。
 - `word_dict.pickle`：在`train.py`中会使用到的字典数据文件。
- `train.py`：训练程序，代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py)。***注意：*** 对于本样例代码，在使用不同的分布式计算平台时，您可能需要修改`train.py`开头的部分（如下），以便获得训练数据的位置和获取环境变量配置：
+- `train.py`：训练程序，代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意：*** 对于本样例代码，在使用不同的分布式计算平台时，您可能需要修改`train.py`开头的部分（如下），以便获得训练数据的位置和获取环境变量配置：

  ```python
  cluster_train_file = "./train_data_dir/train/train.txt"
@@ -195,91 +179,10 @@ PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务

 在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。

-### 使用Fabric启动集群作业
-
-#### 准备一个Linux集群
-可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下，执行`kubectl -f ssh_servers.yaml`启动一个测试集群，并使用`kubectl get po -o wide`获得这些节点的IP地址。
-
-#### 启动集群作业
-
-`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
-
-`paddle.py` 为方便作业启动提供了两个独特的命令选项。
-
-  `job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
-  `job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
-
-`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务，只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
-```
-sh run.sh
-```
-
-集群作业将会在几秒后启动。
-
-#### 终止集群作业
-`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
-
-#### 检查集群训练结果
-详细信息请检查 $workspace/log 里的日志，每一个节点都有相同的日志结构。
-
-`paddle_trainer.INFO`
-提供几乎所有训练的内部输出日志，与本地训练相同。这里检验运行时间模型的收敛。
-
-`paddle_pserver2.INFO`
-提供 pserver 运行日志，有助于诊断分布式错误。
-
-`server.log`
-提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
-
-`train.log`
-提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
-
-#### 检查模型输出
-运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
-工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
-
-### 在OpenMPI集群中提交训练作业
-
-#### 准备OpenMPI集群
-
-执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点：
-
-```bash
-paddle/scripts/cluster_train_v2/openmpi/docker_cluster
-kubectl create -f head.yaml
-kubectl create -f mpi-nodes.yaml
-```
-
-然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
-
-#### 启动集群作业
-
-您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务：
-
-```bash
-# 获得head和node节点的IP地址
-kubectl get po -o wide
-# 将node节点的IP地址保存到machines文件中
-kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
-# 拷贝必要的文件到head节点
-scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
-# ssh 登录到head节点
-ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
-# --------------- 以下操作均在head节点中执行 ---------------
-# 准备训练数据
-python prepare.py
-# 拷贝训练程序和字典文件到每台MPI节点
-cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
-# 创建日志目录
-mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
-# 拷贝训练数据到各自的节点
-scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
-scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
-scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
-# 启动训练任务
-mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
-```
-
-### 在Kubernetes集群中提交训练作业
+## 在不同集群中运行

-此部分的使用方法可以参考[here](../k8s/k8s_distributed_cn.md)。
+  - [fabric集群](fabric_cn.md)
+  - [openmpi集群](openmpi_cn.md)
+  - [kubernetes单机](k8s_cn.md)
+  - [kubernetes distributed分布式](k8s_distributed_cn.md)
+  - [AWS上运行kubernetes集群训练](k8s_aws_cn.md)
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
-# PaddlePaddle Distributed Training
-
-* [Introduction](#introduction)
-* [Preparations](#preparations)
-* [Command-line arguments](#command-line-arguments)
-   * [Starting parameter server](#starting-parameter-server)
-   * [Starting trainer](#starting-trainer)
-   * [Prepare Training Dataset](#prepare-training-dataset)
-   * [Prepare Training program](#prepare-training-program)
-* [Use cluster platforms or cluster management tools](#use-cluster-platforms-or-cluster-management-tools)
-   * [Cluster Training Using Fabric](#cluster-training-using-fabric)
-      * [Prepare a Linux cluster](#prepare-a-linux-cluster)
-      * [Launching Cluster Job](#launching-cluster-job)
-      * [Kill Cluster Job](#kill-cluster-job)
-      * [Check Cluster Training Result](#check-cluster-training-result)
-      * [Check Model Output](#check-model-output)
-   * [Cluster Training Using OpenMPI](#cluster-training-using-openmpi)
-      * [Prepare an OpenMPI cluster](#prepare-an-openmpi-cluster)
-      * [Launching Cluster Job](#launching-cluster-job-1)
-   * [Cluster Training Using Kubernetes](#cluster-training-using-kubernetes)
+# Distributed Training

 ## Introduction

@@ -35,7 +16,7 @@ When training with synchronize SGD, PaddlePaddle uses an internal "synchronize b

 ## Preparations
 1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
-2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install) document. We strongly recommend using [Docker installation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
+2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).

 After installation, you can check the version by typing the below command (run a docker container  if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):

@@ -67,12 +48,12 @@ If you wish to run parameter servers in background, and save a log file, you can
 $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
 ```

-| param  | required | default | description |
-| ------------- | ------------- | ------------- | ------------- |
-| port  | required | 7164 | port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput |
-| ports_num  | required | 1 | total number of ports will listen on  |
-| ports_num_for_sparse  | required | 1 | number of ports which serves sparse parameter update  |
-| num_gradient_servers  | required | 1 | total number of gradient servers |
+Parameter Description
+
+- port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput.
+- ports_num: **required, default 1**, total number of ports will listen on.
+- ports_num_for_sparse: **required, default 1**, number of ports which serves sparse parameter update.
+- num_gradient_servers: **required, default 1**, total number of gradient servers.

 ### Starting trainer
 Type the command below to start the trainer(name the file whatever you want, like "train.py")
@@ -111,16 +92,16 @@ paddle.init(
        pservers="127.0.0.1")
 ```

-| param  | required | default | description |
-| ------------- | ------------- | ------------- | ------------- |
-| use_gpu  | optional | False | set to "True" to enable GPU training |
-| trainer_count  | required | 1 | total count of trainers in the training job |
-| port  | required | 7164 | port to connect to parameter server  |
-| ports_num  | required | 1 | number of ports for communication |
-| ports_num_for_sparse  | required | 1 | number of ports for sparse type caculation |
-| num_gradient_servers  | required | 1 | total number of gradient server |
-| trainer_id  | required | 0 | ID for every trainer, start from 0 |
-| pservers  | required | 127.0.0.1 | list of IPs of parameter servers, separated by "," |
+Parameter Description
+
+- use_gpu: **optional, default False**, set to "True" to enable GPU training.
+- trainer_count: **required, default 1**, total count of trainers in the training job.
+- port: **required, default 7164**, port to connect to parameter server.
+- ports_num: **required, default 1**, number of ports for communication.
+- ports_num_for_sparse: **required, default 1**, number of ports for sparse type caculation.
+- num_gradient_servers: **required, default 1**, total number of gradient server.
+- trainer_id: **required, default 0**, ID for every trainer, start from 0.
+- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".

 ### Prepare Training Dataset

@@ -178,7 +159,7 @@ Your workspace may looks like:

 - `my_lib.py`: user defined libraries, like PIL libs. This is optional.
 - `word_dict.pickle`: dict file for training word embeding.
- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables:
+- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables:

  ```python
  cluster_train_file = "./train_data_dir/train/train.txt"
@@ -202,92 +183,9 @@ We'll introduce cluster job management on these platforms. The examples can be f

 These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.

-### Cluster Training Using Fabric
-
-#### Prepare a Linux cluster
-
-Run `kubectl -f ssh_servers.yaml` under the directory:  `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes.
-
-#### Launching Cluster Job
-`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
-
-`paddle.py`provides two distinguished command option for easy job launching.
-
- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying.
- `job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
-dispatch latency.
-
-`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
-```
-sh run.sh
-```
-
-The cluster Job will start in several seconds.
-
-#### Kill Cluster Job
-`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed.
-
-#### Check Cluster Training Result
-Check log in $workspace/log for details, each node owns same log structure.
-
-`paddle_trainer.INFO`
-It provides almost all internal output log for training,  same as local training. Check runtime model convergence here.
-
-`paddle_pserver2.INFO`
-It provides parameter server running log, which could help to diagnose distributed error.
-
-`server.log`
-It provides stderr and stdout of parameter server process. Check error log if training crashes.
-
-`train.log`
-It provides stderr and stdout of trainer process. Check error log if training crashes.
-
-#### Check Model Output
-After one pass finished, model files will be written in `output` directory in node 0.
-`nodefile` in workspace indicates the node id of current cluster job.
-
-### Cluster Training Using OpenMPI
-
-#### Prepare an OpenMPI cluster
-
-Run the following command to start a 3-node MPI cluster and one "head" node.
-
-```bash
-cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
-kubectl create -f head.yaml
-kubectl create -f mpi-nodes.yaml
-```
-
-Then you can log in to every OpenMPI node using ssh without input any passwords.
-
-#### Launching Cluster Job
-
-Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:\
-
-```bash
-# find out node IP addresses
-kubectl get po -o wide
-# generate a "machines" file containing node IP addresses
-kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
-# copy necessary files onto "head" node
-scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
-# login to head node using ssh
-ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
-# --------------- in head node ---------------
-# prepare training data
-python prepare.py
-# copy training data and dict file to MPI nodes
-cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
-# creat a directory for storing log files
-mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
-# copy training data to every node
-scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
-scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
-scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
-# start the job
-mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
-```
-
-### Cluster Training Using Kubernetes
+## Use different clusters

-The details can be found [here](../k8s/k8s_cn.md)
+  - [fabric](fabric_en.md)
+  - [openmpi](openmpi_en.md)
+  - [kubernetes](k8s_en.md)
+  - [kubernetes on AWS](k8s_aws_en.md)
--- a/doc/howto/usage/cluster/fabric_cn.md
+++ b/doc/howto/usage/cluster/fabric_cn.md
+# 使用fabric启动集群训练
+
+## 准备一个Linux集群
+可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下，执行`kubectl -f ssh_servers.yaml`启动一个测试集群，并使用`kubectl get po -o wide`获得这些节点的IP地址。
+
+## 启动集群作业
+
+`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
+
+`paddle.py` 为方便作业启动提供了两个独特的命令选项。
+
+-  `job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
+-  `job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
+
+`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务，只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
+```
+sh run.sh
+```
+
+集群作业将会在几秒后启动。
+
+## 终止集群作业
+`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
+
+## 检查集群训练结果
+详细信息请检查 $workspace/log 里的日志，每一个节点都有相同的日志结构。
+
+`paddle_trainer.INFO`
+提供几乎所有训练的内部输出日志，与本地训练相同。这里检验运行时间模型的收敛。
+
+`paddle_pserver2.INFO`
+提供 pserver 运行日志，有助于诊断分布式错误。
+
+`server.log`
+提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+`train.log`
+提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+## 检查模型输出
+运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
+工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
--- a/doc/howto/usage/cluster/fabric_en.md
+++ b/doc/howto/usage/cluster/fabric_en.md
+# Cluster Training Using Fabric
+
+## Prepare a Linux cluster
+
+Run `kubectl -f ssh_servers.yaml` under the directory:  `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes.
+
+## Launching Cluster Job
+`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
+
+`paddle.py`provides two distinguished command option for easy job launching.
+
+- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying.
+- `job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
+dispatch latency.
+
+`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
+```
+sh run.sh
+```
+
+The cluster Job will start in several seconds.
+
+## Kill Cluster Job
+`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed.
+
+## Check Cluster Training Result
+Check log in $workspace/log for details, each node owns same log structure.
+
+`paddle_trainer.INFO`
+It provides almost all internal output log for training,  same as local training. Check runtime model convergence here.
+
+`paddle_pserver2.INFO`
+It provides parameter server running log, which could help to diagnose distributed error.
+
+`server.log`
+It provides stderr and stdout of parameter server process. Check error log if training crashes.
+
+`train.log`
+It provides stderr and stdout of trainer process. Check error log if training crashes.
+
+## Check Model Output
+After one pass finished, model files will be written in `output` directory in node 0.
+`nodefile` in workspace indicates the node id of current cluster job.
--- a/doc/howto/usage/cluster/k8s_aws_cn.md
+++ b/doc/howto/usage/cluster/k8s_aws_cn.md
+k8s_aws_en.md
\ No newline at end of file
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -493,7 +493,7 @@ spec:
    spec:
      containers:
      - name: paddle-data
-        image: paddledev/paddle-tutorial:k8s_data
+        image: paddlepaddle/paddle-tutorial:k8s_data
        imagePullPolicy: Always
        volumeMounts:
        - mountPath: "/efs"
@@ -522,7 +522,7 @@ NAME          DESIRED   SUCCESSFUL   AGE
 paddle-data   1         1            6m
 ```

-Data preparation is done by docker image `paddledev/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.
+Data preparation is done by docker image `paddlepaddle/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.

 #### Start Training

@@ -545,7 +545,7 @@ spec:
          claimName: efsvol
      containers:
      - name: trainer
-        image: paddledev/paddle-tutorial:k8s_train
+        image: paddlepaddle/paddle-tutorial:k8s_train
        command: ["bin/bash",  "-c", "/root/start.sh"]
        env:
        - name: JOB_NAME
@@ -617,7 +617,7 @@ kubectl --kubeconfig=kubeconfig log -f POD_NAME

 Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes.

-The details for start `pserver` and `trainer` are hidden inside docker image `paddledev/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.
+The details for start `pserver` and `trainer` are hidden inside docker image `paddlepaddle/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.

 #### Inspect Training Output


--- a/doc/howto/usage/k8s/k8s_cn.md
+++ b/doc/howto/usage/k8s/k8s_cn.md
 # Kubernetes单机训练

-在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
+在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的PaddlePaddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。

 ## 制作Docker镜像

-在一个功能齐全的Kubernetes机群里，通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话，一个分布式Paddle训练任务中的每个进程都可以从Ceph读取数据。在这个例子里，我们只演示一个单机作业，所以可以简化对环境的要求，把训练数据直接放在
-Paddle的Docker image里。为此，我们需要制作一个包含训练数据的Paddle镜像。
+在一个功能齐全的Kubernetes机群里，通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话，一个分布式PaddlePaddle训练任务中
+的每个进程都可以从Ceph读取数据。在这个例子里，我们只演示一个单机作业，所以可以简化对环境的要求，把训练数据直接放在
+PaddlePaddle的Docker Image里。为此，我们需要制作一个包含训练数据的PaddlePaddle镜像。

-Paddle 的 [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) 
-里介绍了用Paddle源码中的脚本下载训练数据的过程。
-而 `paddledev/paddle:cpu-demo-latest` 镜像里有 Paddle 源码与demo，（ 请注意，默认的
-Paddle镜像 `paddledev/paddle:cpu-latest` 是不包括源码的, Paddle的各版本镜像可以参考 [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html) ），所以我们使用这个镜像来下载训练数据到Docker container中，然后把这个包含了训练数据的container保存为一个新的镜像。
+PaddlePaddle的 `paddlepaddle/paddle:cpu-demo-latest` 镜像里有PaddlePaddle的源码与demo，
+（请注意，默认的PaddlePaddle生产环境镜像 `paddlepaddle/paddle:latest` 是不包括源码的，PaddlePaddle的各版本镜像可以参考
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)），
+下面我们使用这个镜像来下载数据到Docker Container中，并把这个包含了训练数据的Container保存为一个新的镜像。

 ### 运行容器

 ```
-$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
 ```

 ### 下载数据
@@ -103,7 +104,7 @@ spec:
      restartPolicy: Never
 ```

-### 创建Paddle Job
+### 创建PaddlePaddle Job

 使用上文创建的yaml文件创建Kubernetes Job，命令为：


--- a/doc/howto/usage/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
 # Kubernetes分布式训练

-前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
-
-有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群，可以参考[k8s_basis](./k8s_basis_cn.md)。
+前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cluster/cluster_train_cn.html)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。

 ## 整体方案

@@ -28,7 +26,7 @@ PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行
 - 拷贝训练文件到容器内
 - 生成`paddle pserver`与`paddle train`进程的启动参数，并且启动训练

-因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/src/k8s_train/Dockerfile)。
+因为官方镜像 `paddlepaddle/paddle:latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile)。

 ```bash
 $ cd doc/howto/usage/k8s/src/k8s_train
@@ -62,7 +60,7 @@ spec:
      hostNetwork: true
      containers:
      - name: paddle-data
-        image: paddledev/paddle-tutorial:k8s_data
+        image: paddlepaddle/paddle-tutorial:k8s_data
        imagePullPolicy: Always
        volumeMounts:
        - mountPath: "/mnt"
@@ -149,20 +147,19 @@ spec:

 文件中，`metadata`下的`name`表示这个job的名字。`parallelism，completions`字段表示这个job会同时开启3个PaddlePaddle节点，成功训练且退出的pod数目为3时，这个job才算成功结束。然后申明一个存储卷`jobpath`，代表宿主机目录`/home/work/mfs`，在对容器的描述`containers`字段中，将此目录挂载为容器的`/home/jobpath`目录，这样容器的`/home/jobpath`目录就成为了共享存储，放在这个目录里的文件其实是保存到了MFS上。

-`env`字段表示容器的环境变量，我们将`paddle`运行的一些参数通过这种方式传递到容器内。
+`env`字段表示容器的环境变量，我们将`paddle`运行的一些参数通过这种方式传递到容器内：
+

-环境变量 | 说明
--- | ---
-JOB_PATH | 共享存储挂在的路径
-JOB_NAME | Job的名字
-TRAIN_CONFIG_DIR | 本次训练文件所在目录，与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
-CONF_PADDLE_NIC | `paddle pserver`进程需要的`--nics`参数，即网卡名
-CONF_PADDLE_PORT | `paddle paserver`的`--port`参数
-CONF_PADDLE_PORTS_NUM | 稠密更新的端口数量，即`--ports_num`参数
-CONF_PADDLE_PORTS_NUM_SPARSE | 稀疏更新的端口数量，即`--ports_num_for_sparse`参数
-CONF_PADDLE_GRADIENT_NUM | 训练节点数量，即`--num_gradient_servers参数`
+- JOB_PATH：共享存储挂在的路径
+- JOB_NAME：Job的名字
+- TRAIN_CONFIG_DIR：本次训练文件所在目录，与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
+- CONF_PADDLE_NIC：`paddle pserver`进程需要的`--nics`参数，即网卡名
+- CONF_PADDLE_PORT：`paddle paserver`的`--port`参数
+- CONF_PADDLE_PORTS_NUM：稠密更新的端口数量，即`--ports_num`参数
+- CONF_PADDLE_PORTS_NUM_SPARSE：稀疏更新的端口数量，即`--ports_num_for_sparse`参数
+- CONF_PADDLE_GRADIENT_NUM：训练节点数量，即`--num_gradient_servers参数`

-这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。
+这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。

 编写完YAML文件后，可以使用Kubernetes的命令行工具创建job。


--- a/doc/howto/usage/k8s/k8s_en.md
+++ b/doc/howto/usage/k8s/k8s_en.md
-# Paddle On Kubernetes
+# PaddlePaddle On Kubernetes

->In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
+In this article, we will introduce how to run PaddlePaddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run PaddlePaddle training job on distributed cluster.

 ## Build Docker Image

-In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+In distributed Kubernetes cluster, we will use Ceph or other distributed
+storage system for storing training related data so that all processes in
+PaddlePaddle training can retrieve data from Ceph. In this example, we will
+only demo training job on single machine. In order to simplify the requirement
+of the environment, we will directly put training data into the PaddlePaddle Docker Image,
+so we need to create a PaddlePaddle Docker image that includes the training data.

-Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
-And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
+The production Docker Image `paddlepaddle/paddle:cpu-demo-latest` has the PaddlePaddle
+source code and demo. (Caution: Default PaddlePaddle Docker Image `paddlepaddle/paddle:latest` doesn't include
+the source code, PaddlePaddle's different versions of Docker Image can be referred here:
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_en.html)),
+so we run this Docker Image and download the training data, and then commit the whole
+Container to be a new Docker Image.

 ### Run Docker Container

 ```
-$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
 ```

 ### Download Training Data
@@ -67,7 +76,7 @@ $ docker commit quick_start_data mypaddle/paddle:quickstart

 ## Use Kubernetes For Training

->We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.
+We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.

 ### Create Yaml Files

@@ -99,7 +108,7 @@ spec:
      restartPolicy: Never
 ```

-### Start Paddle Job
+### Start PaddlePaddle Job

 Using the above yaml file to start the Kubernetes job.


--- a/doc/howto/usage/cluster/openmpi_cn.md
+++ b/doc/howto/usage/cluster/openmpi_cn.md
+# 在OpenMPI集群中提交训练作业
+
+## 准备OpenMPI集群
+
+执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点：
+
+```bash
+paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
+
+## 启动集群作业
+
+您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务：
+
+```bash
+# 获得head和node节点的IP地址
+kubectl get po -o wide
+# 将node节点的IP地址保存到machines文件中
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# 拷贝必要的文件到head节点
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# ssh 登录到head节点
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- 以下操作均在head节点中执行 ---------------
+# 准备训练数据
+python prepare.py
+# 拷贝训练程序和字典文件到每台MPI节点
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# 创建日志目录
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# 拷贝训练数据到各自的节点
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# 启动训练任务
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
--- a/doc/howto/usage/cluster/openmpi_en.md
+++ b/doc/howto/usage/cluster/openmpi_en.md
+# Cluster Training Using OpenMPI
+
+## Prepare an OpenMPI cluster
+
+Run the following command to start a 3-node MPI cluster and one "head" node.
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+Then you can log in to every OpenMPI node using ssh without input any passwords.
+
+## Launching Cluster Job
+
+Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:\
+
+```bash
+# find out node IP addresses
+kubectl get po -o wide
+# generate a "machines" file containing node IP addresses
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# copy necessary files onto "head" node
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# login to head node using ssh
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- in head node ---------------
+# prepare training data
+python prepare.py
+# copy training data and dict file to MPI nodes
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# creat a directory for storing log files
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# copy training data to every node
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# start the job
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
--- a/doc/howto/usage/k8s/src/Dockerfile
+++ b/doc/howto/usage/k8s/src/Dockerfile
-FROM paddledev/paddle:cpu-latest
+FROM paddlepaddle/paddle:latest

 MAINTAINER zjsxzong89@gmail.com


--- a/doc/howto/usage/k8s/src/add_security_group.png
+++ b/doc/howto/usage/k8s/src/add_security_group.png
--- a/doc/howto/usage/k8s/src/create_efs.png
+++ b/doc/howto/usage/k8s/src/create_efs.png
--- a/doc/howto/usage/k8s/src/efs_mount.png
+++ b/doc/howto/usage/k8s/src/efs_mount.png
--- a/doc/howto/usage/cluster/src/k8s-paddle-arch.png
+++ b/doc/howto/usage/cluster/src/k8s-paddle-arch.png
--- a/doc/howto/usage/k8s/src/k8s_data/Dockerfile
+++ b/doc/howto/usage/k8s/src/k8s_data/Dockerfile
--- a/doc/howto/usage/k8s/src/k8s_data/README.md
+++ b/doc/howto/usage/k8s/src/k8s_data/README.md
--- a/doc/howto/usage/k8s/src/k8s_data/get_data.sh
+++ b/doc/howto/usage/k8s/src/k8s_data/get_data.sh
--- a/doc/howto/usage/k8s/src/k8s_train/Dockerfile
+++ b/doc/howto/usage/k8s/src/k8s_train/Dockerfile
-FROM paddledev/paddle:cpu-latest
+FROM paddlepaddle/paddle:latest

 COPY start.sh /root/
 COPY start_paddle.py /root/

--- a/doc/howto/usage/k8s/src/k8s_train/README.md
+++ b/doc/howto/usage/k8s/src/k8s_train/README.md
--- a/doc/howto/usage/k8s/src/k8s_train/start.sh
+++ b/doc/howto/usage/k8s/src/k8s_train/start.sh
--- a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py
+++ b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py
--- a/doc/howto/usage/k8s/src/managed_policy.png
+++ b/doc/howto/usage/k8s/src/managed_policy.png
--- a/doc/howto/usage/k8s/src/pserver_and_trainer.png
+++ b/doc/howto/usage/k8s/src/pserver_and_trainer.png
--- a/doc/howto/usage/k8s/src/route53_create_recordset.png
+++ b/doc/howto/usage/k8s/src/route53_create_recordset.png
--- a/doc/howto/usage/k8s/src/route53_create_zone.png
+++ b/doc/howto/usage/k8s/src/route53_create_zone.png
--- a/doc/howto/usage/k8s/src/worker_security_group.png
+++ b/doc/howto/usage/k8s/src/worker_security_group.png
--- a/doc/howto/usage/k8s/k8s_basis_cn.md
+++ b/doc/howto/usage/k8s/k8s_basis_cn.md
-# Kubernetes 简介
-
-[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统，其提供应用部署、维护、扩展机制等功能，利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行，且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws)，[Azure](http://kubernetes.io/docs/getting-started-guides/azure/)，[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前，需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识，下面先简要介绍一下本文用到的几个Kubernetes概念。
-
- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点，这个节点可以是物理机或者虚拟机，Kubernetes集群就是由node节点与master节点组成的。
-
- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器，pod是Kubernetes的最小调度单元，一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET，PID，IPC，UTS等Linux namespace。由于容器之间共享NET namespace，所以它们使用同一个IP地址，可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
-
- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 描述Kubernetes上运行的作业，一次作业称为一个job，通常每个job包括一个或者多个pods，job启动后会创建这些pod并开始执行一个程序，等待这个程序执行成功并返回0则成功退出，如果执行失败，也可以配置不同的重试机制。
-
- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷，是pod内的容器都可以访问的共享目录，也是容器与node之间共享文件的方式，因为容器内的文件都是暂时存在的，当容器因为各种原因被销毁时，其内部的文件也会随之消失。通过volume，就可以将这些文件持久化存储。Kubernetes支持多种volume，例如hostPath(宿主机目录)，gcePersistentDisk，awsElasticBlockStore等。
-
- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) 命名空间，在kubernetes中创建的所有资源对象(例如上文的pod，job)等都属于一个命名空间，在同一个命名空间中，资源对象的名字是唯一的，不同空间的资源名可以重复，命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。
-
- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合，将外部的存储服务在Kubernetes中描述成为统一的资源形式，便于存储资源管理和Pod引用。
-
-## 部署Kubernetes集群
-
-Kubernetes提供了多种集群部署的方案，本文档内不重复介绍。这里给出集中常见的部署方法：
-
- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): 快速在本地启动一个单机的kubernetes服务器，便于本地验证和测试。
- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): 在不同操作系统，不同主机(Bare-Metal, AWS, GCE)条件下，快速部署集群。
- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): 在aws上快速部署集群。
- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): 在物理机上手动部署。
-
-可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。
-
-## 选择存储方案
-
-容器不会保留在运行时生成的数据，job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务，需要有一个外部的存储服务来保存训练所需数据和训练输出。
-常见的可选存储服务包括：
-
- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): 可以将磁盘上某个目录共享给网络中其他机器访问。部署和配置比较简单，可以用于小量数据的验证。不提供分布式存储，高可用，冗余等功能。NFS的部署方法可以参考[这里](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/)。
- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): 网络分布式文件系统，可以在Kubernetes中按照[这个](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs)例子使用。
- [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统，支持rbd，POSIX API接口(ceph fs)和对象存储API，参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。
- [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。
-
-## 配置kubectl
-
-### 安装kubectl
-```
-# OS X
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl
-
-# Linux
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
-
-# Windows
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe
-```
-
-### 配置kubectl访问你的kubernetes集群
-
-编辑`~/.kube/config`这个配置文件，修改`Master-IP`的地址。如果使用SSL认证，则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问（比如通过8080端口），也可以去掉这些证书的配置。
-```
-apiVersion: v1
-clusters:
- cluster:
-    certificate-authority: /path/to/ca.crt
-    server: https://[Master-IP]:443
-  name: minikube
-contexts:
- context:
-    cluster: minikube
-    user: minikube
-  name: minikube
-current-context: minikube
-kind: Config
-preferences: {}
-users:
- name: minikube
-  user:
-    client-certificate: /path/to/apiserver.crt
-    client-key: /Users/wuyi/.minikube/apiserver.key
-```
--- a/doc/howto/usage/k8s/src/k8s-paddle-arch.png
+++ b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -18,11 +18,11 @@ PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/

 - `CMAKE_SYSTEM_NAME`，CMake编译的目标平台，必须设置为`iOS`。在设置`CMAKE_SYSTEM_NAME=iOS`后，PaddlePaddle的CMake系统会自动编译所有的第三方依赖库，并且强制设置一些PaddlePaddle参数的值（`WITH_C_API=ON`、`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`）。
 - `WITH_C_API`，是否编译C-API预测库，必须设置为ON。在iOS平台上只支持使用C-API来预测。
- `WITH_SWIG_PY`，必须设置为ON。在iOS平台上不支持通过swig调用来训练或者预测。
+- `WITH_SWIG_PY`，必须设置为`OFF`。在iOS平台上不支持通过swig调用来训练或者预测。

 iOS平台可选配置参数：

- `IOS_PLATFORM`，可设置为`OS/SIMULATOR`，默认值为`OS`。
+- `IOS_PLATFORM`，可设置为`OS`（默认值）或`SIMULATOR`。
  - `OS`，构建目标为`arm`架构的iPhone或者iPad等物理设备。
  - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
 - `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示，默认编译所有架构：

--- a/doc/mobile/cross_compiling_for_ios_en.md
+++ b/doc/mobile/cross_compiling_for_ios_en.md
+# PaddlePaddle Compiling Guide for iOS
+
+This tutorial will walk you through cross compiling the PaddlePaddle library for iOS from the source in MacOS.
+
+## Preparation
+
+Apple provides Xcode for cross-compiling and IDE for iOS development. Download from App store or [here](https://developer.apple.com/cn/xcode/). To verify your installation, run command as follows
+
+```bash
+$ xcodebuild -version
+Xcode 9.0
+Build version 9A235
+```
+
+## Cross-compiling configurations
+
+PaddlePaddle provides cross-compiling toolchain configuration documentation [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake), which has some default settings for frequently used compilers.
+
+There are some mandatory environment variables need to be set before cross compiling PaddlePaddle for iOS:
+
+- `CMAKE_SYSTEM_NAME`, CMake compiling target platform name, has to be `iOS`. PaddlePaddle CMake will compile all the third party dependencies and enforce some parameters (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`,`WITH_RDMA=OFF`) when this variable is set with value `iOS`.
+
+- `WITH_C_API`, Whether to compile inference C-API library, has to be `ON`, since C-API is the only supported interface for inferencing in iOS.
+- `WITH_SWIG_PY`, has to be `OFF`. It's not supported to inference or train via swig in iOS.
+
+Optional environment variables for iOS are:
+
+- `IOS_PLATFORM`, either `OS` (default) or `SIMULATOR`.
+  - `OS`, build targets ARM-based physical devices like iPhone or iPad.
+  - `SIMULATOR`, build targets x86 architecture simulators.
+- `IOS_ARCH`, target architecture. By default, all architecture types will be compiled. If you need to specify the architecture to compile for, please find valid values for different `IOS_PLATFORM` settings from the table below:
+
+    <table class="docutils">
+    <colgroup>
+      <col width="35%" />
+      <col width="65%" />
+    </colgroup>
+    <thead valign="bottom">
+      <tr class="row-odd">
+      <th class="head">IOS_PLATFORM</th>
+      <th class="head">IOS_ARCH</th>
+    </tr>
+    </thead>
+    <tbody valign="top">
+      <tr class="row-even">
+      <td>OS</td>
+      <td>armv7, armv7s, arm64 </td>
+    </tr>
+    <tr class="row-odd">
+      <td>SIMULATOR</td>
+      <td>i386, x86_64 </td>
+    </tr>
+    </tbody>
+    </table>
+
+- `IOS_DEPLOYMENT_TARGET`, minimum iOS version to deployment, `7.0` by default.
+- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3), values can be `ON/OFF`, `ON` by default.
+- `IOS_USE_VECLIB_FOR_BLAS`, whether to use [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS computing. values can be `ON/OFF`, `OFF` by default.
+- `IOS_DEVELOPMENT_ROOT`, the path to `Developer` directory, can be explicitly set with your `/path/to/platform/Developer`. If left blank, PaddlePaddle will automatically pick the Xcode corresponding `platform`'s `Developer` directory based on your `IOS_PLATFORM` value.
+- `IOS_SDK_ROOT`, the path to `SDK` root, can be explicitly set with your  `/path/to/platform/Developer/SDKs/SDK`. if left black, PaddlePaddle will pick the latest SDK in the directory of `IOS_DEVELOPMENT_ROOT`.
+
+other settings：
+
+- `USE_EIGEN_FOR_BLAS`, whether to use Eigen for matrix computing. effective when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Values can be `ON/OFF`, `OFF` by default.
+- `HOST_C/CXX_COMPILER`, host C/C++ compiler. Uses value from environment variable `CC/CXX` by default or `cc/c++` if `CC/CXX` doesn't exist.
+
+some typical cmake configurations:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=OS \
+      -DIOS_ARCH="armv7;arm64" \
+      -DIOS_ENABLE_BITCODE=ON \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=SIMULATOR \
+      -DIOS_ARCH="x86_64" \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+You can set other compiling parameters for your own need. I.E. if you are trying to minimize the library size, set `CMAKE_BUILD_TYPE` with `MinSizeRel`; or if the performance is your concern, set `CMAKE_BUILD_TYPE` with `Release`. You can even manipulate the PaddlePaddle compiling procedure by manually set `CMAKE_C/CXX_FLAGS` values.
+
+**TIPS for a better performance**:
+
+- set `CMAKE_BUILD_TYPE` with `Release`
+- set `IOS_USE_VECLIB_FOR_BLAS` with `ON`
+
+## Compile and install
+
+After CMake, run following commands, PaddlePaddle will download the compile 3rd party dependencies, compile and install PaddlePaddle inference library.
+
+```
+$ make
+$ make install
+```
+
+Please Note: if you compiled PaddlePaddle in the source directory for other platforms, do remove `third_party` and `build` directory within the source with `rm -rf` to ensure that all the 3rd party libraries dependencies and PaddlePaddle is newly compiled with current CMake configuration.
+
+`your/path/to/install` directory will have following directories after `compile` and `install`:
+
+- `include`, contains all the C-API header files.
+- `lib`, contains PaddlePaddle C-API static library.
+- `third_party` contains all the 3rd party libraries.
+
+Please note: if PaddlePaddle library need to support both physical devices and simulators, you will need to compile correspondingly, then merge fat library with `lipo`.
+
+Now you will have PaddlePaddle library compiled and installed, the fat library can be used in deep learning related iOS APPs. Please refer to C-API documentation for usage guides.
--- a/doc/mobile/index_en.rst
+++ b/doc/mobile/index_en.rst
@@ -5,4 +5,5 @@ MOBILE
  :maxdepth: 1

  cross_compiling_for_android_en.md
+  cross_compiling_for_ios_en.md
  cross_compiling_for_raspberry_en.md
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -58,3 +58,6 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
        proto_desc)
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
+
+cc_library(init SRCS init.cc DEPS gflags executor place stringpiece)
+cc_test(init_test SRCS init_test.cc DEPS init)
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@@ -19,42 +19,42 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
+Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
  switch (attr_desc.type()) {
-    case framework::AttrType::BOOLEAN: {
+    case proto::AttrType::BOOLEAN: {
      return attr_desc.b();
    }
-    case framework::AttrType::INT: {
+    case proto::AttrType::INT: {
      return attr_desc.i();
    }
-    case framework::AttrType::FLOAT: {
+    case proto::AttrType::FLOAT: {
      return attr_desc.f();
    }
-    case framework::AttrType::STRING: {
+    case proto::AttrType::STRING: {
      return attr_desc.s();
    }
-    case framework::AttrType::BOOLEANS: {
+    case proto::AttrType::BOOLEANS: {
      std::vector<bool> val(attr_desc.bools_size());
      for (int i = 0; i < attr_desc.bools_size(); ++i) {
        val[i] = attr_desc.bools(i);
      }
      return val;
    }
-    case framework::AttrType::INTS: {
+    case proto::AttrType::INTS: {
      std::vector<int> val(attr_desc.ints_size());
      for (int i = 0; i < attr_desc.ints_size(); ++i) {
        val[i] = attr_desc.ints(i);
      }
      return val;
    }
-    case framework::AttrType::FLOATS: {
+    case proto::AttrType::FLOATS: {
      std::vector<float> val(attr_desc.floats_size());
      for (int i = 0; i < attr_desc.floats_size(); ++i) {
        val[i] = attr_desc.floats(i);
      }
      return val;
    }
-    case framework::AttrType::STRINGS: {
+    case proto::AttrType::STRINGS: {
      std::vector<std::string> val(attr_desc.strings_size());
      for (int i = 0; i < attr_desc.strings_size(); ++i) {
        val[i] = attr_desc.strings(i);

--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -27,12 +27,12 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 template <typename T>
-inline AttrType AttrTypeID() {
+inline proto::AttrType AttrTypeID() {
  Attribute tmp = T();
-  return static_cast<AttrType>(tmp.which() - 1);
+  return static_cast<proto::AttrType>(tmp.which() - 1);
 }

-Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
+Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc);

 class AttrReader {
 public:

--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -341,7 +341,7 @@ static void CreateGradVarInBlock(
      auto* param = block_desc->FindVarRecursive(pname);
      auto* grad = block_desc->FindVar(arg);
      if (param == nullptr) {
-        grad->SetDataType(DataType::FP32);
+        grad->SetDataType(proto::DataType::FP32);
      } else {
        grad->SetDataType(param->GetDataType());
      }

--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -166,7 +166,7 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {

 class SumOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SumOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "the input tensors of sum operator.").AsDuplicable();
    AddOutput("Out", "the output tensor of sum operator.");

--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -128,22 +128,22 @@ BlockDescBind *BlockDescBind::ParentBlock() const {
  return prog_->MutableBlock(static_cast<size_t>(this->desc_->parent_idx()));
 }

-BlockDesc *BlockDescBind::Proto() {
+proto::BlockDesc *BlockDescBind::Proto() {
  Flush();
  return desc_;
 }

-BlockDescBind::BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
+BlockDescBind::BlockDescBind(ProgramDescBind *prog, proto::BlockDesc *desc)
    : prog_(prog), desc_(desc), need_update_(false) {
-  for (const VarDesc &var_desc : desc_->vars()) {
+  for (const proto::VarDesc &var_desc : desc_->vars()) {
    vars_[var_desc.name()].reset(new VarDescBind(var_desc));
  }
-  for (const OpDesc &op_desc : desc_->ops()) {
+  for (const proto::OpDesc &op_desc : desc_->ops()) {
    ops_.emplace_back(new OpDescBind(op_desc, prog));
  }
 }

-BlockDescBind::BlockDescBind(const BlockDescBind &other, BlockDesc *desc,
+BlockDescBind::BlockDescBind(const BlockDescBind &other, proto::BlockDesc *desc,
                             ProgramDescBind *prog)
    : prog_(prog), desc_(desc) {
  need_update_ = true;

--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -36,9 +36,9 @@ class ProgramDescBind;

 class BlockDescBind {
 public:
-  BlockDescBind(ProgramDescBind *prog, BlockDesc *desc);
+  BlockDescBind(ProgramDescBind *prog, proto::BlockDesc *desc);

-  BlockDescBind(const BlockDescBind &other, BlockDesc *desc,
+  BlockDescBind(const BlockDescBind &other, proto::BlockDesc *desc,
                ProgramDescBind *prog);

  ~BlockDescBind() {
@@ -88,7 +88,7 @@ class BlockDescBind {

  void Flush();

-  BlockDesc *Proto();
+  proto::BlockDesc *Proto();

  ProgramDescBind *Program() { return this->prog_; }

@@ -98,7 +98,7 @@ class BlockDescBind {

 private:
  ProgramDescBind *prog_;   // not_own
-  BlockDesc *desc_;        // not_own
+  proto::BlockDesc *desc_;  // not_own
  bool need_update_;

  std::deque<std::unique_ptr<OpDescBind>> ops_;

--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -20,7 +20,8 @@
 namespace paddle {
 namespace framework {

-inline DataType ToDataType(std::type_index type) {
+inline proto::DataType ToDataType(std::type_index type) {
+  using namespace paddle::framework::proto;
  if (typeid(float).hash_code() == type.hash_code()) {
    return DataType::FP32;
  } else if (typeid(double).hash_code() == type.hash_code()) {
@@ -36,7 +37,8 @@ inline DataType ToDataType(std::type_index type) {
  }
 }

-inline std::type_index ToTypeIndex(DataType type) {
+inline std::type_index ToTypeIndex(proto::DataType type) {
+  using namespace paddle::framework::proto;
  switch (type) {
    case DataType::FP32:
      return typeid(float);
@@ -54,7 +56,8 @@ inline std::type_index ToTypeIndex(DataType type) {
 }

 template <typename Visitor>
-inline void VisitDataType(DataType type, Visitor visitor) {
+inline void VisitDataType(proto::DataType type, Visitor visitor) {
+  using namespace paddle::framework::proto;
  switch (type) {
    case DataType::FP32:
      visitor.template operator()<float>();

--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 #include <sstream>
 #include <vector>


--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
@@ -90,7 +90,7 @@ struct OpInfoFiller<T, kOperator> {
 template <typename T>
 struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
  void operator()(const char* op_type, OpInfo* info) const {
-    info->proto_ = new OpProto;
+    info->proto_ = new proto::OpProto;
    info->checker_ = new OpAttrChecker();
    auto maker = T(info->proto_, info->checker_);
    maker.Validate();

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -33,48 +33,28 @@ namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";

-Executor::Executor(const std::vector<platform::Place>& places) : own_(true) {
-  PADDLE_ENFORCE_GT(places.size(), 0);
-  device_contexts_.resize(places.size());
-  for (size_t i = 0; i < places.size(); i++) {
-    if (platform::is_cpu_place(places[i])) {
-      device_contexts_[i] = new platform::CPUDeviceContext(
-          boost::get<platform::CPUPlace>(places[i]));
-    } else if (platform::is_gpu_place(places[i])) {
-#ifdef PADDLE_WITH_CUDA
-      device_contexts_[i] = new platform::CUDADeviceContext(
-          boost::get<platform::GPUPlace>(places[i]));
-#else
-      PADDLE_THROW(
-          "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
-          "option");
-#endif
-    }
-  }
-}
+DeviceContextPool* DeviceContextPool::pool = nullptr;

-Executor::~Executor() {
-  if (own_) {
-    for (auto& device_context : device_contexts_) {
-      delete device_context;
-    }
-  }
+Executor::Executor(const std::vector<platform::Place>& places) {
+  DeviceContextPool& pool = DeviceContextPool::Get();
+  auto borrowed_contexts = pool.Borrow(places);
+  device_contexts_.swap(borrowed_contexts);
 }

-static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
-  if (var_type == VarDesc::LOD_TENSOR) {
+static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
+  if (var_type == proto::VarDesc::LOD_TENSOR) {
    var->GetMutable<LoDTensor>();
-  } else if (var_type == VarDesc::SELECTED_ROWS) {
+  } else if (var_type == proto::VarDesc::SELECTED_ROWS) {
    var->GetMutable<SelectedRows>();
-  } else if (var_type == VarDesc::FEED_MINIBATCH) {
+  } else if (var_type == proto::VarDesc::FEED_MINIBATCH) {
    var->GetMutable<FeedFetchList>();
-  } else if (var_type == VarDesc::FETCH_LIST) {
+  } else if (var_type == proto::VarDesc::FETCH_LIST) {
    var->GetMutable<FeedFetchList>();
-  } else if (var_type == VarDesc::STEP_SCOPES) {
+  } else if (var_type == proto::VarDesc::STEP_SCOPES) {
    var->GetMutable<std::vector<framework::Scope>>();
-  } else if (var_type == VarDesc::LOD_RANK_TABLE) {
+  } else if (var_type == proto::VarDesc::LOD_RANK_TABLE) {
    var->GetMutable<LoDRankTable>();
-  } else if (var_type == VarDesc::LOD_TENSOR_ARRAY) {
+  } else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) {
    var->GetMutable<LoDTensorArray>();
  } else {
    PADDLE_THROW(
@@ -132,8 +112,5 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
  }
 }

-Executor::Executor(const platform::DeviceContext& device)
-    : device_contexts_({&device}), own_(false) {}
-
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -14,19 +14,98 @@ limitations under the License. */

 #pragma once

+#include <map>
+#include <unordered_map>
+
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace framework {

+class DeviceContextPool {
+ public:
+  static DeviceContextPool& Get() {
+    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
+    return *pool;
+  }
+
+  static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
+    if (pool == nullptr) {
+      pool = new DeviceContextPool(places);
+    }
+    return *pool;
+  }
+
+  std::vector<const platform::DeviceContext*> Borrow(
+      const std::vector<platform::Place>& places) {
+    PADDLE_ENFORCE_GT(places.size(), 0);
+    PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
+    std::vector<const platform::DeviceContext*> borrowed_contexts;
+    for (auto& place : places) {
+      auto range = device_contexts_.equal_range(place);
+      if (range.first == range.second) {
+        PADDLE_THROW(
+            "'Place' is not supported, Please re-compile with WITH_GPU "
+            "option");
+      }
+      // TODO(dzhwinter) : assign the first found device. Will enhanced later.
+      // device load balancer maybe useful here.
+      borrowed_contexts.emplace_back(range.first->second);
+    }
+    return borrowed_contexts;
+  }
+
+  explicit DeviceContextPool(const std::vector<platform::Place>& places) {
+    PADDLE_ENFORCE_GT(places.size(), 0);
+    for (size_t i = 0; i < places.size(); i++) {
+      if (platform::is_cpu_place(places[i])) {
+        device_contexts_.emplace(
+            places[i], new platform::CPUDeviceContext(
+                           boost::get<platform::CPUPlace>(places[i])));
+      } else if (platform::is_gpu_place(places[i])) {
+#ifdef PADDLE_WITH_CUDA
+        device_contexts_.emplace(
+            places[i], new platform::CUDADeviceContext(
+                           boost::get<platform::GPUPlace>(places[i])));
+#else
+        PADDLE_THROW(
+            "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
+            "option");
+#endif
+      }
+    }
+  }
+
+  ~DeviceContextPool() {}
+
+ private:
+  static DeviceContextPool* pool;
+  struct Hash {
+    std::hash<int> hash_;
+    size_t operator()(const platform::Place& place) const {
+      return hash_(place.which());
+    }
+  };
+  std::unordered_multimap<const platform::Place, const platform::DeviceContext*,
+                          Hash>
+      device_contexts_;
+  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
+};
+
 class Executor {
 public:
+  // TODO(dzhwinter) : Do not rely on this function, it will be removed
+  explicit Executor(const platform::DeviceContext& device)
+      : Executor(std::vector<platform::Place>({device.GetPlace()})) {}
+
+  explicit Executor(const platform::Place& place)
+      : Executor(std::vector<platform::Place>({place})) {}
+
  explicit Executor(const std::vector<platform::Place>& places);
-  explicit Executor(const platform::DeviceContext& devices);
-  ~Executor();

  /* @Brief
   * Runtime evaluation of the given ProgramDesc under certain Scope
@@ -39,7 +118,6 @@ class Executor {

 private:
  std::vector<const platform::DeviceContext*> device_contexts_;
-  bool own_;
 };

 }  // namespace framework

--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -14,7 +14,7 @@ limitations under the License. */

 syntax = "proto2";
 option optimize_for = LITE_RUNTIME;
-package paddle.framework;
+package paddle.framework.proto;

 enum AttrType {
  INT = 0;

--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <algorithm>
+#include <string>
+
+#include "paddle/framework/executor.h"
+#include "paddle/framework/init.h"
+#include "paddle/platform/place.h"
+#include "paddle/string/piece.h"
+
+namespace paddle {
+namespace framework {
+
+std::once_flag gflags_init_flag;
+
+// TODO(qijun) move init gflags to init.cc
+void InitGflags(std::vector<std::string> &argv) {
+  std::call_once(gflags_init_flag, [&]() {
+    int argc = argv.size();
+    char **arr = new char *[argv.size()];
+    std::string line;
+    for (size_t i = 0; i < argv.size(); i++) {
+      arr[i] = &argv[i][0];
+      line += argv[i];
+      line += ' ';
+    }
+    google::ParseCommandLineFlags(&argc, &arr, true);
+    VLOG(1) << "Init commandline: " << line;
+  });
+}
+
+bool InitDevices(const std::vector<std::string> &devices) {
+  // device format
+  // CPU
+  // GPU:1
+  // TODO(dzhwinter) : add device format annotation for users.
+  std::vector<platform::Place> places;
+  for (auto &device : devices) {
+    auto p = string::Piece(device);
+    if (string::Find(p, ':', 0) == string::Piece::npos) {
+      places.emplace_back(platform::CPUPlace());
+    } else if (string::HasPrefix(p, "GPU")) {
+#ifdef PADDLE_WITH_CUDA
+      auto pos = string::RFind(p, ':', string::Piece::npos);
+      auto number = device.substr(pos + 1);
+      places.emplace_back(platform::GPUPlace(std::stoi(number)));
+#else
+      LOG(WARNING)
+          << "'GPU' is not supported, Please re-compile with WITH_GPU option";
+#endif
+    } else {
+      return false;
+    }
+  }
+
+  if (std::find_if(places.begin(), places.end(),
+                   [&](const platform::Place &place) {
+                     return platform::is_cpu_place(place);
+                   }) == places.end()) {
+    places.emplace_back(platform::CPUPlace());
+    LOG(WARNING) << "Not specified any device, use CPU by Default.";
+  }
+  DeviceContextPool::Create(places);
+  return true;
+  return true;
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/init.h
+++ b/paddle/framework/init.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <mutex>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+namespace paddle {
+namespace framework {
+
+void InitGflags(std::vector<std::string> &argv);
+
+bool InitDevices(const std::vector<std::string> &devices);
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+
+TEST(Init, InitDevices) {
+  using paddle::framework::InitDevices;
+  std::vector<std::string> ds1 = {"CPU"};
+  ASSERT_EQ(InitDevices(ds1), true);
+
+#ifdef PADDLE_WITH_CUDA
+  std::vector<std::string> ds2 = {"CPU", "GPU:0", "GPU:1"};
+  ASSERT_EQ(InitDevices(ds2), true);
+#endif
+}
--- a/paddle/framework/lod_rank_table.cc
+++ b/paddle/framework/lod_rank_table.cc
@@ -46,4 +46,13 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
 }

 }  // namespace framework
+
+std::ostream& operator<<(std::ostream& out,
+                         const framework::LoDRankTable& table) {
+  out << "NumOfSequence " << table.items().size() << "\n";
+  for (auto& each_item : table.items()) {
+    out << "\tSeq #" << each_item.index << ", Len=" << each_item.length << "\n";
+  }
+  return out;
+}
 }  // namespace paddle
--- a/paddle/framework/lod_rank_table.h
+++ b/paddle/framework/lod_rank_table.h
@@ -13,6 +13,7 @@
   limitations under the License. */

 #pragma once
+#include <iosfwd>
 #include "paddle/framework/lod_tensor.h"

 namespace paddle {
@@ -52,4 +53,8 @@ class LoDRankTable {
 };

 }  // namespace framework
+
+std::ostream& operator<<(std::ostream& out,
+                         const framework::LoDRankTable& table);
+
 }  // namespace paddle
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -197,7 +197,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
  {  // the 2nd field, tensor description
     // int32_t  size
     // void*    protobuf message
-    framework::TensorDesc desc;
+    proto::TensorDesc desc;
    desc.set_data_type(framework::ToDataType(tensor.type()));
    auto dims = framework::vectorize(tensor.dims());
    auto *pb_dims = desc.mutable_dims();
@@ -262,7 +262,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
  uint32_t version;
  is.read(reinterpret_cast<char *>(&version), sizeof(version));
  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  framework::TensorDesc desc;
+  proto::TensorDesc desc;
  {  // int32_t size
     // proto buffer
    int32_t size;
@@ -281,16 +281,16 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
    void *buf;
    platform::Place cpu = platform::CPUPlace();
    switch (desc.data_type()) {
-      case framework::FP32:
+      case proto::FP32:
        buf = tensor->mutable_data<float>(cpu);
        break;
-      case framework::FP64:
+      case proto::FP64:
        buf = tensor->mutable_data<double>(cpu);
        break;
-      case framework::INT32:
+      case proto::INT32:
        buf = tensor->mutable_data<int>(cpu);
        break;
-      case framework::INT64:
+      case proto::INT64:
        buf = tensor->mutable_data<int64_t>(cpu);
        break;
      default:

--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -184,6 +184,18 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
  return tensor;
 }

+// Get the absolute offset of a lod[start_level][start_idx:end_idx] and
+// relative length of details for every levels(i.e., [start_level: ]).
+//
+// For example,
+//   lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]]
+//   start_level = 0
+//   start_idx = 1
+//   end_idx = 3
+//
+// Returns:
+//  LoD = [[1, 4], [2, 4, 2, 3, 2]]
+//  pair<size_t, size_t> = {11, 24}
 std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
    const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);


--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -58,11 +58,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
    PADDLE_ENFORCE_LT(j, Outputs(out).size());
    auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
    auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
-    if (in_var->GetType() != VarDesc::LOD_TENSOR) {
+    if (in_var->GetType() != proto::VarDesc::LOD_TENSOR) {
      VLOG(3) << "input " << in << " is not LodTensor";
      return;
    }
-    PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
+    PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR,
                      "The %d-th output of Output(%s) must be LoDTensor.", j,
                      out);
    out_var->SetLoDLevel(in_var->GetLodLevel());
@@ -70,7 +70,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
  bool IsRuntime() const override;

 protected:
-  VarDesc::VarType GetVarType(const std::string &name) const override;
+  proto::VarDesc::VarType GetVarType(const std::string &name) const override;

  DDim GetDim(const std::string &name) const override;

@@ -90,12 +90,12 @@ OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
  need_update_ = true;
 }

-OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
+OpDescBind::OpDescBind(const proto::OpDesc &desc, ProgramDescBind *prog)
    : desc_(desc), need_update_(false) {
  // restore inputs_
  int input_size = desc_.inputs_size();
  for (int i = 0; i < input_size; ++i) {
-    const OpDesc::Var &var = desc_.inputs(i);
+    const proto::OpDesc::Var &var = desc_.inputs(i);
    std::vector<std::string> &args = inputs_[var.parameter()];
    int argu_size = var.arguments_size();
    args.reserve(argu_size);
@@ -106,7 +106,7 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
  // restore outputs_
  int output_size = desc_.outputs_size();
  for (int i = 0; i < output_size; ++i) {
-    const OpDesc::Var &var = desc_.outputs(i);
+    const proto::OpDesc::Var &var = desc_.outputs(i);
    std::vector<std::string> &args = outputs_[var.parameter()];
    int argu_size = var.arguments_size();
    args.reserve(argu_size);
@@ -115,9 +115,9 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
    }
  }
  // restore attrs_
-  for (const OpDesc::Attr &attr : desc_.attrs()) {
+  for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
    std::string attr_name = attr.name();
-    if (attr.type() != AttrType::BLOCK) {
+    if (attr.type() != proto::AttrType::BLOCK) {
      attrs_[attr_name] = GetAttrValue(attr);
    } else {
      auto bid = attr.block_idx();
@@ -126,7 +126,7 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
  }
 }

-OpDesc *OpDescBind::Proto() {
+proto::OpDesc *OpDescBind::Proto() {
  Flush();
  return &desc_;
 }
@@ -175,10 +175,10 @@ void OpDescBind::SetOutput(const std::string &param_name,
  this->outputs_[param_name] = args;
 }

-AttrType OpDescBind::GetAttrType(const std::string &name) const {
+proto::AttrType OpDescBind::GetAttrType(const std::string &name) const {
  auto it = attrs_.find(name);
  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
-  return static_cast<AttrType>(it->second.which() - 1);
+  return static_cast<proto::AttrType>(it->second.which() - 1);
 }

 std::vector<std::string> OpDescBind::AttrNames() const {
@@ -253,8 +253,8 @@ void OpDescBind::RenameInput(const std::string &old_name,
 }

 struct SetAttrDescVisitor : public boost::static_visitor<void> {
-  explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
-  mutable OpDesc::Attr *attr_;
+  explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {}
+  mutable proto::OpDesc::Attr *attr_;
  void operator()(int v) const { attr_->set_i(v); }
  void operator()(float v) const { attr_->set_f(v); }
  void operator()(const std::string &v) const { attr_->set_s(v); }
@@ -272,7 +272,9 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
  void operator()(const std::vector<bool> &v) const {
    VectorToRepeated(v, attr_->mutable_bools());
  }
-  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->idx()); }
+  void operator()(proto::BlockDesc *desc) const {
+    attr_->set_block_idx(desc->idx());
+  }
  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
 };

@@ -297,7 +299,7 @@ void OpDescBind::Flush() {
      auto *attr_desc = desc_.add_attrs();
      attr_desc->set_name(attr.first);
      attr_desc->set_type(
-          static_cast<framework::AttrType>(attr.second.which() - 1));
+          static_cast<proto::AttrType>(attr.second.which() - 1));
      SetAttrDescVisitor visitor(attr_desc);
      boost::apply_visitor(visitor, attr.second);
    }
@@ -375,7 +377,7 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
    for (auto &out_pair : this->outputs_) {
      for (auto &out_var_name : out_pair.second) {
        block->FindRecursiveOrCreateVar(out_var_name)
-            ->SetType(VarDesc::LOD_TENSOR);
+            ->SetType(proto::VarDesc::LOD_TENSOR);
      }
    }
  }
@@ -484,7 +486,7 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
 }
 bool CompileTimeInferShapeContext::IsRuntime() const { return false; }

-VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
+proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
    const std::string &name) const {
  return block_.FindVarRecursive(name)->GetType();
 }

--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -33,9 +33,9 @@ class OpDescBind {
  OpDescBind(const std::string &type, const VariableNameMap &inputs,
             const VariableNameMap &outputs, const AttributeMap &attrs);

-  OpDescBind(const OpDesc &desc, ProgramDescBind *prog);
+  OpDescBind(const proto::OpDesc &desc, ProgramDescBind *prog);

-  OpDesc *Proto();
+  proto::OpDesc *Proto();

  std::string Type() const { return desc_.type(); }

@@ -59,7 +59,7 @@ class OpDescBind {
    return attrs_.find(name) != attrs_.end();
  }

-  AttrType GetAttrType(const std::string &name) const;
+  proto::AttrType GetAttrType(const std::string &name) const;

  std::vector<std::string> AttrNames() const;

@@ -126,7 +126,7 @@ class OpDescBind {
    return ret_val;
  }

-  OpDesc desc_;
+  proto::OpDesc desc_;
  VariableNameMap inputs_;
  VariableNameMap outputs_;
  AttributeMap attrs_;

--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -34,7 +34,7 @@ class InferShapeBase {
 struct OpInfo {
  OpCreator creator_;
  GradOpMakerFN grad_op_maker_;
-  OpProto* proto_{nullptr};
+  proto::OpProto* proto_{nullptr};
  OpAttrChecker* checker_{nullptr};
  InferVarTypeFN infer_var_type_;
  InferShapeFN infer_shape_;
@@ -43,7 +43,7 @@ struct OpInfo {
    return proto_ != nullptr && checker_ != nullptr;
  }

-  const OpProto& Proto() const {
+  const proto::OpProto& Proto() const {
    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
    PADDLE_ENFORCE(proto_->IsInitialized(),
                   "Operator Proto must be initialized in op info");

--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/framework/op_proto_maker.h
@@ -22,6 +22,8 @@ namespace framework {
 // this class not only make proto but also init attribute checkers.
 class OpProtoAndCheckerMaker {
 public:
+  using OpProto = proto::OpProto;
+  using OpAttrChecker = framework::OpAttrChecker;
  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : proto_(proto), op_checker_(op_checker) {}

@@ -80,7 +82,7 @@ class OpProtoAndCheckerMaker {

 class NOPMaker : public OpProtoAndCheckerMaker {
 public:
-  NOPMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  NOPMaker(OpProto* proto, framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {}
 };


--- a/paddle/framework/op_proto_maker_test.cc
+++ b/paddle/framework/op_proto_maker_test.cc
@@ -18,7 +18,7 @@ limitations under the License. */

 class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 public:
-  TestAttrProtoMaker(paddle::framework::OpProto* proto,
+  TestAttrProtoMaker(paddle::framework::proto::OpProto* proto,
                     paddle::framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddAttr<float>("scale", "scale of test op");
@@ -27,7 +27,7 @@ class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };

 TEST(ProtoMaker, DuplicatedAttr) {
-  paddle::framework::OpProto op_proto;
+  paddle::framework::proto::OpProto op_proto;
  paddle::framework::OpAttrChecker op_checker;
  auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
@@ -35,7 +35,7 @@ TEST(ProtoMaker, DuplicatedAttr) {

 class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 public:
-  TestInOutProtoMaker(paddle::framework::OpProto* proto,
+  TestInOutProtoMaker(paddle::framework::proto::OpProto* proto,
                      paddle::framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("input", "input of test op");
@@ -44,7 +44,7 @@ class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };

 TEST(ProtoMaker, DuplicatedInOut) {
-  paddle::framework::OpProto op_proto;
+  paddle::framework::proto::OpProto op_proto;
  paddle::framework::OpAttrChecker op_checker;
  auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);

--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@@ -31,7 +31,8 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
 }

 static VariableNameMap ConvertOpDescVarsToVarNameMap(
-    const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars) {
+    const google::protobuf::RepeatedPtrField<proto::OpDesc::Var>&
+        op_desc_vars) {
  VariableNameMap ret_val;
  for (auto& var : op_desc_vars) {
    auto& var_names = ret_val[var.parameter()];
@@ -43,7 +44,8 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap(
  return ret_val;
 }

-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
+    const proto::OpDesc& op_desc) {
  VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be"
             "used in unit tests. Use CreateOp(const OpDescBind& op_desc) "
             "instead.";

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -77,7 +77,7 @@ class OpRegistry {
                                                const VariableNameMap& outputs,
                                                AttributeMap attrs);

-  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
+  static std::unique_ptr<OperatorBase> CreateOp(const proto::OpDesc& op_desc);

  static std::unique_ptr<OperatorBase> CreateOp(const OpDescBind& op_desc);
 };

--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -51,7 +51,7 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {

 static void BuildVar(const std::string& param_name,
                     std::initializer_list<const char*> arguments,
-                     paddle::framework::OpDesc::Var* var) {
+                     paddle::framework::proto::OpDesc::Var* var) {
  var->set_parameter(param_name);
  for (auto& arg_name : arguments) {
    var->add_arguments(arg_name);
@@ -63,7 +63,7 @@ REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp,
                             paddle::framework::MyTestOpProtoAndCheckerMaker);

 TEST(OpRegistry, CreateOp) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("cos_sim");
  BuildVar("input", {"aa"}, op_desc.add_inputs());
  BuildVar("output", {"bb"}, op_desc.add_outputs());
@@ -71,7 +71,7 @@ TEST(OpRegistry, CreateOp) {
  float scale = 3.3;
  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(scale);

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
@@ -83,14 +83,14 @@ TEST(OpRegistry, CreateOp) {
 }

 TEST(OpRegistry, IllegalAttr) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("cos_sim");
  BuildVar("input", {"aa"}, op_desc.add_inputs());
  BuildVar("output", {"bb"}, op_desc.add_outputs());

  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(-2.0);

  bool caught = false;
@@ -108,7 +108,7 @@ TEST(OpRegistry, IllegalAttr) {
 }

 TEST(OpRegistry, DefaultValue) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("cos_sim");
  BuildVar("input", {"aa"}, op_desc.add_inputs());
  BuildVar("output", {"bb"}, op_desc.add_outputs());
@@ -123,7 +123,7 @@ TEST(OpRegistry, DefaultValue) {
 }

 TEST(OpRegistry, CustomChecker) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("my_test_op");
  BuildVar("input", {"ii"}, op_desc.add_inputs());
  BuildVar("output", {"oo"}, op_desc.add_outputs());
@@ -145,7 +145,7 @@ TEST(OpRegistry, CustomChecker) {
  // set 'test_attr' set to an illegal value
  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("test_attr");
-  attr->set_type(paddle::framework::AttrType::INT);
+  attr->set_type(paddle::framework::proto::AttrType::INT);
  attr->set_i(3);
  caught = false;
  try {
@@ -164,7 +164,7 @@ TEST(OpRegistry, CustomChecker) {
  op_desc.mutable_attrs()->Clear();
  attr = op_desc.mutable_attrs()->Add();
  attr->set_name("test_attr");
-  attr->set_type(paddle::framework::AttrType::INT);
+  attr->set_type(paddle::framework::proto::AttrType::INT);
  attr->set_i(4);
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  paddle::platform::CPUDeviceContext dev_ctx;

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -377,7 +377,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
    }
  }

-  VarDesc::VarType GetVarType(const std::string& name) const override {
+  proto::VarDesc::VarType GetVarType(const std::string& name) const override {
    auto* var = scope_.FindVar(name);
    return ToVarType(var->Type());
  }
@@ -417,7 +417,7 @@ OpKernelType OperatorWithKernel::GetKernelType(
    const ExecutionContext& ctx) const {
  return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
 }
-DataType OperatorWithKernel::IndicateDataType(
+proto::DataType OperatorWithKernel::IndicateDataType(
    const ExecutionContext& ctx) const {
  auto& scope = ctx.scope();
  int data_type = -1;
@@ -443,7 +443,7 @@ DataType OperatorWithKernel::IndicateDataType(
    }
  }
  PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
-  return static_cast<DataType>(data_type);
+  return static_cast<proto::DataType>(data_type);
 }

 }  // namespace framework

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -358,12 +358,13 @@ struct OpKernelType {
  };

  platform::Place place_;
-  DataType data_type_;
+  proto::DataType data_type_;

-  OpKernelType(DataType data_type, platform::Place place)
+  OpKernelType(proto::DataType data_type, platform::Place place)
      : place_(place), data_type_(data_type) {}

-  OpKernelType(DataType data_type, const platform::DeviceContext& dev_ctx)
+  OpKernelType(proto::DataType data_type,
+               const platform::DeviceContext& dev_ctx)
      : place_(dev_ctx.GetPlace()), data_type_(data_type) {}

  bool operator==(const OpKernelType& o) const {
@@ -409,7 +410,7 @@ class OperatorWithKernel : public OperatorBase {
 private:
  // indicate kernel DataType by input data. Defaultly all input data must be
  // same.
-  DataType IndicateDataType(const ExecutionContext& ctx) const;
+  proto::DataType IndicateDataType(const ExecutionContext& ctx) const;
 };

 std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key);

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -58,7 +58,7 @@ class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {

 static void BuildVar(const std::string& param_name,
                     std::initializer_list<const char*> arguments,
-                     paddle::framework::OpDesc::Var* var) {
+                     paddle::framework::proto::OpDesc::Var* var) {
  var->set_parameter(param_name);
  for (auto& arg_name : arguments) {
    *var->mutable_arguments()->Add() = arg_name;
@@ -70,14 +70,14 @@ REGISTER_OP_WITHOUT_GRADIENT(
    paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker);

 TEST(OperatorBase, all) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("test_operator");
  BuildVar("input", {"IN1"}, op_desc.add_inputs());
  BuildVar("output", {"OUT1"}, op_desc.add_outputs());

  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(3.14);

  paddle::platform::CPUDeviceContext device_context;
@@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel {
 protected:
  void InferShape(framework::InferShapeContext* ctx) const override {}
  OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
-    return OpKernelType(DataType::FP32, ctx.GetPlace());
+    return OpKernelType(proto::DataType::FP32, ctx.GetPlace());
  }
 };

@@ -195,14 +195,14 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,

 // test with single input
 TEST(OpKernel, all) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("op_with_kernel");
  BuildVar("x", {"IN1"}, op_desc.add_inputs());
  BuildVar("y", {"OUT1"}, op_desc.add_outputs());

  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(3.14);

  paddle::platform::CPUDeviceContext cpu_device_context;
@@ -224,7 +224,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
 TEST(OpKernel, multi_inputs) {
  using namespace paddle::framework;

-  OpDesc op_desc;
+  proto::OpDesc op_desc;
  op_desc.set_type("op_multi_inputs_with_kernel");
  BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
  BuildVar("k", {"k0"}, op_desc.add_inputs());
@@ -232,7 +232,7 @@ TEST(OpKernel, multi_inputs) {

  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(3.14);

  paddle::platform::CPUDeviceContext cpu_device_context;

--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -26,7 +26,7 @@ BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) {
  return blocks_.back().get();
 }

-ProgramDesc *ProgramDescBind::Proto() {
+proto::ProgramDesc *ProgramDescBind::Proto() {
  for (auto &block : blocks_) {
    block->Flush();
  }
@@ -49,7 +49,7 @@ ProgramDescBind::ProgramDescBind(const ProgramDescBind &o) {
  }
 }

-ProgramDescBind::ProgramDescBind(const ProgramDesc &desc) {
+ProgramDescBind::ProgramDescBind(const proto::ProgramDesc &desc) {
  desc_ = desc;
  for (auto &block_desc : *desc_.mutable_blocks()) {
    blocks_.emplace_back(new BlockDescBind(this, &block_desc));

--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -29,7 +29,7 @@ class ProgramDescBind {
 public:
  ProgramDescBind();

-  explicit ProgramDescBind(const ProgramDesc &desc);
+  explicit ProgramDescBind(const proto::ProgramDesc &desc);

  ProgramDescBind(const ProgramDescBind &o);

@@ -43,10 +43,10 @@ class ProgramDescBind {

  size_t Size() const { return blocks_.size(); }

-  ProgramDesc *Proto();
+  proto::ProgramDesc *Proto();

 private:
-  ProgramDesc desc_;
+  proto::ProgramDesc desc_;

  std::vector<std::unique_ptr<BlockDescBind>> blocks_;
 };

--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
@@ -22,15 +22,15 @@ TEST(ProgramDesc, copy_ctor) {
  ProgramDescBind program;
  auto* global_block = program.MutableBlock(0);
  auto* x = global_block->Var("X");
-  x->SetType(VarDesc_VarType_LOD_TENSOR);
+  x->SetType(proto::VarDesc_VarType_LOD_TENSOR);
  x->SetLoDLevel(0);
-  x->SetDataType(FP32);
+  x->SetDataType(proto::FP32);
  x->SetShape({1000, 784});

  auto* y = global_block->Var("Y");
-  y->SetType(VarDesc_VarType_LOD_TENSOR);
+  y->SetType(proto::VarDesc_VarType_LOD_TENSOR);
  y->SetLoDLevel(0);
-  y->SetDataType(FP32);
+  y->SetDataType(proto::FP32);
  y->SetShape({784, 100});

  auto* op = global_block->AppendOp();
@@ -39,7 +39,7 @@ TEST(ProgramDesc, copy_ctor) {
  op->SetInput("Y", {y->Name()});

  auto* out = global_block->Var("Out");
-  out->SetType(VarDesc_VarType_LOD_TENSOR);
+  out->SetType(proto::VarDesc_VarType_LOD_TENSOR);
  op->SetOutput("Y", {out->Name()});

  ProgramDescBind program_copy(program);
@@ -84,15 +84,15 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
  ProgramDescBind program_origin;
  auto* global_block = program_origin.MutableBlock(0);
  auto* x = global_block->Var("X");
-  x->SetType(VarDesc_VarType_LOD_TENSOR);
+  x->SetType(proto::VarDesc_VarType_LOD_TENSOR);
  x->SetLoDLevel(0);
-  x->SetDataType(FP32);
+  x->SetDataType(proto::FP32);
  x->SetShape({1000, 784});

  auto* y = global_block->Var("Y");
-  y->SetType(VarDesc_VarType_LOD_TENSOR);
+  y->SetType(proto::VarDesc_VarType_LOD_TENSOR);
  y->SetLoDLevel(0);
-  y->SetDataType(FP32);
+  y->SetDataType(proto::FP32);
  y->SetShape({784, 100});

  auto* op = global_block->AppendOp();
@@ -101,7 +101,7 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
  op->SetInput("Y", {y->Name()});

  auto* out = global_block->Var("Out");
-  out->SetType(VarDesc_VarType_LOD_TENSOR);
+  out->SetType(proto::VarDesc_VarType_LOD_TENSOR);
  op->SetOutput("Y", {out->Name()});

  std::string binary_str;

--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
@@ -29,7 +29,7 @@ const std::string kFetchOpType = "fetch";
 const std::string kDropOutOpType = "dropout";
 const std::string kBatchNormOpType = "batch_norm";

-bool HasDependentVar(const OpDesc& op_desc,
+bool HasDependentVar(const proto::OpDesc& op_desc,
                     const std::set<std::string>& dependent_vars) {
  for (auto& var : op_desc.outputs()) {
    for (auto& argu : var.arguments()) {
@@ -41,14 +41,15 @@ bool HasDependentVar(const OpDesc& op_desc,
  return false;
 }

-bool IsTarget(const OpDesc& op_desc) {
+bool IsTarget(const proto::OpDesc& op_desc) {
  if (op_desc.has_is_target()) {
    return op_desc.is_target();
  }
  return false;
 }

-void prune_impl(const ProgramDesc& input, ProgramDesc* output, int block_id) {
+void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
+                int block_id) {
  // TODO(tonyyang-svail):
  //    - will change to use multiple blocks for RNN op and Cond Op

@@ -104,12 +105,12 @@ void prune_impl(const ProgramDesc& input, ProgramDesc* output, int block_id) {
 }

 // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
-void Prune(const ProgramDesc& input, ProgramDesc* output) {
+void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
  prune_impl(input, output, 0);
 }

-void inference_optimize_impl(const ProgramDesc& input, ProgramDesc* output,
-                             int block_id) {
+void inference_optimize_impl(const proto::ProgramDesc& input,
+                             proto::ProgramDesc* output, int block_id) {
  *output = input;
  auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
  for (auto& op_desc : *op_field) {
@@ -125,7 +126,8 @@ void inference_optimize_impl(const ProgramDesc& input, ProgramDesc* output,
  }
 }

-void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output) {
+void InferenceOptimize(const proto::ProgramDesc& input,
+                       proto::ProgramDesc* output) {
  inference_optimize_impl(input, output, 0);
 }


--- a/paddle/framework/prune.h
+++ b/paddle/framework/prune.h
@@ -20,9 +20,10 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-void Prune(const ProgramDesc& input, ProgramDesc* output);
+void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output);

-void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output);
+void InferenceOptimize(const proto::ProgramDesc& input,
+                       proto::ProgramDesc* output);

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/prune_test.cc
+++ b/paddle/framework/prune_test.cc
@@ -34,7 +34,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
  for (auto kv : outputs) {
    for (auto v : kv.second) {
      auto var = block->Var(v);
-      var->SetDataType(paddle::framework::DataType::FP32);
+      var->SetDataType(paddle::framework::proto::DataType::FP32);
    }
  }

@@ -57,14 +57,14 @@ TEST(Prune, one_operator) {
  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
        block);

-  f::ProgramDesc *pdesc = program.Proto();
-  f::ProgramDesc pruned;
+  f::proto::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc pruned;

-  Prune(*pdesc, &pruned);
+  f::Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 0);

  pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true);
-  Prune(*pdesc, &pruned);
+  f::Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 1);
 }

@@ -81,12 +81,12 @@ TEST(Prune, forward) {
  AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{},
        block);

-  f::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc *pdesc = program.Proto();

  for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) {
-    f::ProgramDesc pruned;
+    f::proto::ProgramDesc pruned;
    pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true);
-    Prune(*pdesc, &pruned);
+    f::Prune(*pdesc, &pruned);
    PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), i + 1);
  }
 }
@@ -104,11 +104,11 @@ TEST(Prune, multi_input_op) {
  AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}},
        f::AttributeMap{}, block);

-  f::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc *pdesc = program.Proto();
  pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true);

-  f::ProgramDesc pruned;
-  Prune(*pdesc, &pruned);
+  f::proto::ProgramDesc pruned;
+  f::Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 4);
 }

@@ -123,11 +123,11 @@ TEST(Prune, multi_output_op) {
  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
        block);

-  f::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc *pdesc = program.Proto();
  pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);

-  f::ProgramDesc pruned;
-  Prune(*pdesc, &pruned);
+  f::proto::ProgramDesc pruned;
+  f::Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 2);
 }

@@ -142,11 +142,11 @@ TEST(Prune, multi_target) {
  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
        block);

-  f::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc *pdesc = program.Proto();
  pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
  pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);

-  f::ProgramDesc pruned;
-  Prune(*pdesc, &pruned);
+  f::proto::ProgramDesc pruned;
+  f::Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 3);
 }
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -57,17 +57,17 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
    SetDim(names[i], dims[i]);
  }
 }
-std::vector<VarDesc::VarType> InferShapeContext::GetInputsVarType(
+std::vector<proto::VarDesc::VarType> InferShapeContext::GetInputsVarType(
    const std::string &name) const {
  return GetVarTypes(Inputs(name));
 }
-std::vector<VarDesc::VarType> InferShapeContext::GetOutputsVarType(
+std::vector<proto::VarDesc::VarType> InferShapeContext::GetOutputsVarType(
    const std::string &name) const {
  return GetVarTypes(Outputs(name));
 }
-std::vector<VarDesc::VarType> InferShapeContext::GetVarTypes(
+std::vector<proto::VarDesc::VarType> InferShapeContext::GetVarTypes(
    const std::vector<std::string> &names) const {
-  std::vector<VarDesc::VarType> retv;
+  std::vector<proto::VarDesc::VarType> retv;
  retv.resize(names.size());
  std::transform(names.begin(), names.end(), retv.begin(),
                 std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,

--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -27,8 +27,9 @@ class InferShapeContext {
  virtual bool HasInput(const std::string &name) const = 0;
  virtual bool HasOutput(const std::string &name) const = 0;

-  std::vector<VarDesc::VarType> GetInputsVarType(const std::string &name) const;
-  std::vector<VarDesc::VarType> GetOutputsVarType(
+  std::vector<proto::VarDesc::VarType> GetInputsVarType(
+      const std::string &name) const;
+  std::vector<proto::VarDesc::VarType> GetOutputsVarType(
      const std::string &name) const;

  virtual bool HasInputs(const std::string &name) const = 0;
@@ -65,10 +66,10 @@ class InferShapeContext {
  std::vector<framework::DDim> GetDims(
      const std::vector<std::string> &names) const;

-  std::vector<VarDesc::VarType> GetVarTypes(
+  std::vector<proto::VarDesc::VarType> GetVarTypes(
      const std::vector<std::string> &names) const;

-  virtual VarDesc::VarType GetVarType(const std::string &name) const = 0;
+  virtual proto::VarDesc::VarType GetVarType(const std::string &name) const = 0;
 };

 }  // namespace framework

--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -18,15 +18,17 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); }
+proto::VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); }

-void VarDescBind::SetType(VarDesc::VarType type) { desc_.set_type(type); }
+void VarDescBind::SetType(proto::VarDesc::VarType type) {
+  desc_.set_type(type);
+}

 void VarDescBind::SetShape(const std::vector<int64_t> &dims) {
  VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
 }

-void VarDescBind::SetDataType(DataType data_type) {
+void VarDescBind::SetDataType(proto::DataType data_type) {
  mutable_tensor_desc()->set_data_type(data_type);
 }

@@ -34,14 +36,16 @@ std::vector<int64_t> VarDescBind::Shape() const {
  return RepeatedToVector(tensor_desc().dims());
 }

-DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); }
+proto::DataType VarDescBind::GetDataType() const {
+  return tensor_desc().data_type();
+}

 void VarDescBind::SetLoDLevel(int32_t lod_level) {
  switch (desc_.type()) {
-    case VarDesc::LOD_TENSOR:
+    case proto::VarDesc::LOD_TENSOR:
      desc_.mutable_lod_tensor()->set_lod_level(lod_level);
      break;
-    case VarDesc::LOD_TENSOR_ARRAY:
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
      desc_.mutable_tensor_array()->set_lod_level(lod_level);
      break;
    default:
@@ -52,9 +56,9 @@ void VarDescBind::SetLoDLevel(int32_t lod_level) {

 int32_t VarDescBind::GetLodLevel() const {
  switch (desc_.type()) {
-    case VarDesc::LOD_TENSOR:
+    case proto::VarDesc::LOD_TENSOR:
      return desc_.lod_tensor().lod_level();
-    case VarDesc::LOD_TENSOR_ARRAY:
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
      return desc_.tensor_array().lod_level();
    default:
      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
@@ -62,29 +66,29 @@ int32_t VarDescBind::GetLodLevel() const {
  }
 }

-const TensorDesc &VarDescBind::tensor_desc() const {
+const proto::TensorDesc &VarDescBind::tensor_desc() const {
  PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type");
  switch (desc_.type()) {
-    case VarDesc::SELECTED_ROWS:
+    case proto::VarDesc::SELECTED_ROWS:
      return desc_.selected_rows();
-    case VarDesc::LOD_TENSOR:
+    case proto::VarDesc::LOD_TENSOR:
      return desc_.lod_tensor().tensor();
-    case VarDesc::LOD_TENSOR_ARRAY:
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
      return desc_.tensor_array().tensor();
    default:
      PADDLE_THROW("Unexpected branch.");
  }
 }

-TensorDesc *VarDescBind::mutable_tensor_desc() {
+proto::TensorDesc *VarDescBind::mutable_tensor_desc() {
  PADDLE_ENFORCE(desc_.has_type(),
                 "invoke MutableTensorDesc must after set type");
  switch (desc_.type()) {
-    case VarDesc::SELECTED_ROWS:
+    case proto::VarDesc::SELECTED_ROWS:
      return desc_.mutable_selected_rows();
-    case VarDesc::LOD_TENSOR:
+    case proto::VarDesc::LOD_TENSOR:
      return desc_.mutable_lod_tensor()->mutable_tensor();
-    case VarDesc::LOD_TENSOR_ARRAY:
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
      return desc_.mutable_tensor_array()->mutable_tensor();
    default:
      PADDLE_THROW("Unexpected branch.");

--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -57,40 +57,40 @@ class VarDescBind {
 public:
  explicit VarDescBind(const std::string &name) {
    desc_.set_name(name);
-    desc_.set_type(VarDesc::LOD_TENSOR);
+    desc_.set_type(proto::VarDesc::LOD_TENSOR);
  }

-  explicit VarDescBind(const VarDesc &desc) : desc_(desc) {}
+  explicit VarDescBind(const proto::VarDesc &desc) : desc_(desc) {}

-  VarDesc *Proto() { return &desc_; }
+  proto::VarDesc *Proto() { return &desc_; }

  std::string Name() const { return desc_.name(); }

  void SetShape(const std::vector<int64_t> &dims);

-  void SetDataType(DataType data_type);
+  void SetDataType(proto::DataType data_type);

  std::vector<int64_t> Shape() const;

-  DataType GetDataType() const;
+  proto::DataType GetDataType() const;

  void SetLoDLevel(int32_t lod_level);

  int32_t GetLodLevel() const;

-  VarDesc::VarType GetType() const;
+  proto::VarDesc::VarType GetType() const;

-  void SetType(VarDesc::VarType type);
+  void SetType(proto::VarDesc::VarType type);

  bool Persistable() const { return desc_.persistable(); }

  void SetPersistable(bool persistable) { desc_.set_persistable(persistable); }

 private:
-  const TensorDesc &tensor_desc() const;
-  TensorDesc *mutable_tensor_desc();
+  const proto::TensorDesc &tensor_desc() const;
+  proto::TensorDesc *mutable_tensor_desc();

-  VarDesc desc_;
+  proto::VarDesc desc_;
 };
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
@@ -20,15 +20,15 @@

 namespace paddle {
 namespace framework {
-inline VarDesc::VarType ToVarType(std::type_index type) {
+inline proto::VarDesc::VarType ToVarType(std::type_index type) {
  if (type.hash_code() == typeid(LoDTensor).hash_code()) {
-    return VarDesc_VarType_LOD_TENSOR;
+    return proto::VarDesc_VarType_LOD_TENSOR;
  } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
-    return VarDesc_VarType_LOD_RANK_TABLE;
+    return proto::VarDesc_VarType_LOD_RANK_TABLE;
  } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
-    return VarDesc_VarType_LOD_TENSOR_ARRAY;
+    return proto::VarDesc_VarType_LOD_TENSOR_ARRAY;
  } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
-    return VarDesc_VarType_SELECTED_ROWS;
+    return proto::VarDesc_VarType_SELECTED_ROWS;
  } else {
    PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
  }
@@ -37,16 +37,16 @@ inline VarDesc::VarType ToVarType(std::type_index type) {
 template <typename Visitor>
 inline void VisitVarType(const Variable& var, Visitor visitor) {
  switch (ToVarType(var.Type())) {
-    case VarDesc_VarType_LOD_TENSOR:
+    case proto::VarDesc_VarType_LOD_TENSOR:
      visitor(var.Get<framework::LoDTensor>());
      return;
-    case VarDesc_VarType_LOD_RANK_TABLE:
+    case proto::VarDesc_VarType_LOD_RANK_TABLE:
      visitor(var.Get<LoDRankTable>());
      return;
-    case VarDesc_VarType_LOD_TENSOR_ARRAY:
+    case proto::VarDesc_VarType_LOD_TENSOR_ARRAY:
      visitor(var.Get<LoDTensorArray>());
      return;
-    case VarDesc_VarType_SELECTED_ROWS:
+    case proto::VarDesc_VarType_SELECTED_ROWS:
      visitor(var.Get<SelectedRows>());
      return;
    default:

--- a/paddle/framework/var_type_inference_test.cc
+++ b/paddle/framework/var_type_inference_test.cc
@@ -36,14 +36,14 @@ class SumOpVarTypeInference : public VarTypeInference {
  void operator()(const OpDescBind &op_desc,
                  BlockDescBind *block) const override {
    auto &inputs = op_desc.Input("X");
-    auto default_var_type = VarDesc::SELECTED_ROWS;
+    auto default_var_type = proto::VarDesc::SELECTED_ROWS;

    bool any_input_is_lod_tensor = std::any_of(
        inputs.begin(), inputs.end(), [block](const std::string &name) {
-          return block->Var(name)->GetType() == VarDesc::LOD_TENSOR;
+          return block->Var(name)->GetType() == proto::VarDesc::LOD_TENSOR;
        });
    if (any_input_is_lod_tensor) {
-      default_var_type = VarDesc::LOD_TENSOR;
+      default_var_type = proto::VarDesc::LOD_TENSOR;
    }

    auto out_var_name = op_desc.Output("Out").front();
@@ -68,19 +68,19 @@ TEST(InferVarType, sum_op) {
  op->SetInput("X", {"test_a", "test_b", "test_c"});
  op->SetOutput("Out", {"test_out"});

-  prog.MutableBlock(0)->Var("test_a")->SetType(VarDesc::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test_b")->SetType(VarDesc::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test_c")->SetType(VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarDesc::SELECTED_ROWS);
  prog.MutableBlock(0)->Var("test_out");

  op->InferVarType(prog.MutableBlock(0));

-  ASSERT_EQ(VarDesc::SELECTED_ROWS,
+  ASSERT_EQ(proto::VarDesc::SELECTED_ROWS,
            prog.MutableBlock(0)->Var("test_out")->GetType());

-  prog.MutableBlock(0)->Var("test_b")->SetType(VarDesc::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::LOD_TENSOR);
  op->InferVarType(prog.MutableBlock(0));
-  ASSERT_EQ(VarDesc::LOD_TENSOR,
+  ASSERT_EQ(proto::VarDesc::LOD_TENSOR,
            prog.MutableBlock(0)->Var("test_out")->GetType());
 }

@@ -91,14 +91,14 @@ TEST(InferVarType, sum_op_without_infer_var_type) {
  op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
  op->SetOutput("Out", {"test2_out"});

-  prog.MutableBlock(0)->Var("test2_a")->SetType(VarDesc::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test2_b")->SetType(VarDesc::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test2_c")->SetType(VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarDesc::SELECTED_ROWS);
  prog.MutableBlock(0)->Var("test2_out");

  op->InferVarType(prog.MutableBlock(0));

-  ASSERT_EQ(VarDesc_VarType_LOD_TENSOR,
+  ASSERT_EQ(proto::VarDesc_VarType_LOD_TENSOR,
            prog.MutableBlock(0)->Var("test2_out")->GetType());
 }


--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -126,6 +126,11 @@ public:
      inputData += inputChannels * inputHeight * inputWidth;
      outputData += outputChannels * outputHeight * outputWidth;
    }
+#ifdef PADDLE_MOBILE_INFERENCE
+    if (Device == DEVICE_TYPE_CPU) {
+      memory_.reset();
+    }
+#endif
  }
 };


--- a/paddle/gserver/layers/ROIPoolLayer.cpp
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -84,12 +84,15 @@ void ROIPoolLayer::forward(PassType passType) {
  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;

  real* outputData = outputValue->getData();
+  real* argmaxData = nullptr;
+  if (passType != PASS_TEST) {
    Matrix::resizeOrCreate(maxIdxs_,
                           numROIs,
                           channels_ * pooledHeight_ * pooledWidth_,
                           false,
                           false);
-  real* argmaxData = maxIdxs_->getData();
+    argmaxData = maxIdxs_->getData();
+  }

  for (size_t n = 0; n < numROIs; ++n) {
    // the first five elememts of each RoI should be:
@@ -128,23 +131,29 @@ void ROIPoolLayer::forward(PassType passType) {
          bool isEmpty = (hend <= hstart) || (wend <= wstart);
          size_t poolIndex = ph * pooledWidth_ + pw;
          outputData[poolIndex] = isEmpty ? 0 : -FLT_MAX;
+          if (argmaxData) {
            argmaxData[poolIndex] = -1;
+          }

          for (size_t h = hstart; h < hend; ++h) {
            for (size_t w = wstart; w < wend; ++w) {
              size_t index = h * width_ + w;
              if (batchData[index] > outputData[poolIndex]) {
                outputData[poolIndex] = batchData[index];
+                if (argmaxData) {
                  argmaxData[poolIndex] = index;
                }
              }
            }
          }
        }
+      }
      batchData += channelOffset;
      outputData += poolChannelOffset;
+      if (argmaxData) {
        argmaxData += poolChannelOffset;
      }
+    }
    bottomROIs += roiOffset;
  }
  if (useGpu_) {

--- a/paddle/gserver/layers/SequenceToBatch.cpp
+++ b/paddle/gserver/layers/SequenceToBatch.cpp
@@ -171,12 +171,31 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch,
    hl_sequence2batch_copy(
        batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
  } else {
-    for (int i = 0; i < batchCount; ++i) {
    if (seq2batch) {
+#ifdef PADDLE_USE_MKLML
+      const int blockMemSize = 8 * 1024;
+      const int blockSize = blockMemSize / sizeof(real);
+#pragma omp parallel for collapse(2)
+      for (int i = 0; i < batchCount; ++i) {
+        for (int j = 0; j < seqWidth; j += blockSize) {
+          memcpy(batch.rowBuf(i) + j,
+                 sequence.rowBuf(idxData[i]) + j,
+                 (j + blockSize > seqWidth) ? (seqWidth - j) * sizeof(real)
+                                            : blockMemSize);
+        }
+      }
+#else
+      for (int i = 0; i < batchCount; ++i) {
        memcpy(batch.rowBuf(i),
               sequence.rowBuf(idxData[i]),
               seqWidth * sizeof(real));
+      }
+#endif
    } else {
+#ifdef PADDLE_USE_MKLML
+#pragma omp parallel for
+#endif
+      for (int i = 0; i < batchCount; ++i) {
        memcpy(sequence.rowBuf(idxData[i]),
               batch.rowBuf(i),
               seqWidth * sizeof(real));

--- a/paddle/math/float16.h
+++ b/paddle/math/float16.h
@@ -79,7 +79,7 @@ public:
 #ifdef PADDLE_CUDA_FP16
  HOSTDEVICE inline explicit float16(const half& h) {
 #if CUDA_VERSION >= 9000
-    x = reinterpret_cast<__half_raw*>(&h)->x;
+    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
 #else
    x = h.x;
 #endif  // CUDA_VERSION >= 9000
@@ -145,7 +145,7 @@ public:
 #ifdef PADDLE_CUDA_FP16
  HOSTDEVICE inline float16& operator=(const half& rhs) {
 #if CUDA_VERSION >= 9000
-    x = reinterpret_cast<__half_raw*>(&rhs)->x;
+    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
 #else
    x = rhs.x;
 #endif

--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -19,6 +19,7 @@ limitations under the License. */

 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock
+#include <algorithm>   // for std::max

 #include "gflags/gflags.h"

@@ -28,7 +29,7 @@ limitations under the License. */
 // of memory available to the system for paging.  So, by default, we
 // should set false to use_pinned_memory.
 DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
-
+DECLARE_double(fraction_of_gpu_memory_to_use);
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -77,45 +78,20 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
  // CUDA documentation doesn't explain if cudaMalloc returns nullptr
  // if size is 0.  We just make sure it does.
  if (size <= 0) return nullptr;
-
-  size_t available = 0;
-  size_t capacity = 0;
-  paddle::platform::GpuMemoryUsage(available, capacity);
-
-  // Reserve memory for page tables, etc.
-  size_t reserving = 0.05 * capacity + paddle::platform::GpuMinChunkSize();
-  size_t usable = available > reserving ? available - reserving : 0;
-
-  // If remaining size no less than expected size, using general
-  // cudaMalloc to allocate GPU memory.
-  void* p = 0;
-  if (size <= usable) {
+  void* p;
  cudaError_t result = cudaMalloc(&p, size);
  if (result == cudaSuccess) {
    index = 0;
    gpu_alloc_size_ += size;
    return p;
-    }
-  }
-
-  // If remaining size less than expected size or cudaMalloc failed,
-  // cudaMallocHost will be considered as a fallback allocator.
-  //
-  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
-  // of host fallback allocation. Allocates too much would reduce
-  // the amount of memory available to the underlying system for paging.
-  usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
-
-  if (size > usable) return nullptr;
-
-  cudaError_t result = cudaMallocHost(&p, size);
-  if (result == cudaSuccess) {
-    index = 1;
-    fallback_alloc_size_ += size;
-    return p;
-  }
-
+  } else {
+    LOG(WARNING)
+        << "Cannot malloc " << size / 1024.0 / 1024.0
+        << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use "
+           "environment variable to a lower value. Current value is "
+        << FLAGS_fraction_of_gpu_memory_to_use;
    return nullptr;
+  }
 }

 void GPUAllocator::Free(void* p, size_t size, size_t index) {

--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -63,8 +63,7 @@ class AccuracyOp : public framework::OperatorWithKernel {

 class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AccuracyOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  AccuracyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // TODO(typhoonzero): support both inference value and indices.
    AddInput("Out", "The network output of topk (inferences)");

--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -26,7 +26,7 @@ template <int BlockSize>
 __global__ void AccuracyCudaKernel(const int N, const int D,
                                   const int64_t* Xdata,
                                   const int64_t* labeldata, int* correct_data,
-                                   float* accuracy) {
+                                   float* accuracy, int* total_data) {
  int count = 0;
  __shared__ int total[BlockSize];

@@ -47,6 +47,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D,
  if (threadIdx.x == 0) {
    *correct_data = result;
    *accuracy = static_cast<float>(result) / static_cast<float>(N);
+    *total_data = N;
  }
 }

@@ -80,22 +81,11 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
    if (num_samples == 0) {
      return;
    }
-    platform::GpuMemcpyAsync(total_data, &num_samples, sizeof(int),
-                             cudaMemcpyHostToDevice, stream);

    AccuracyCudaKernel<
        PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
        num_samples, infer_width, indices_data, label_data, correct_data,
-        accuracy_data);
-
-    int d_num_samples, d_num_correct;
-    float d_accuracy;
-    platform::GpuMemcpyAsync(&d_num_correct, correct_data, sizeof(int),
-                             cudaMemcpyDeviceToHost, stream);
-    platform::GpuMemcpyAsync(&d_num_samples, total_data, sizeof(int),
-                             cudaMemcpyDeviceToHost, stream);
-    platform::GpuMemcpyAsync(&d_accuracy, accuracy_data, sizeof(float),
-                             cudaMemcpyDeviceToHost, stream);
+        accuracy_data, total_data);
  }
 };


--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -38,9 +38,8 @@ class ActivationOpGrad : public framework::OperatorWithKernel {

 class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SigmoidOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Sigmoid operator");
    AddOutput("Y", "Output of Sigmoid operator");
    AddComment(R"DOC(
@@ -54,9 +53,8 @@ $$y = \frac{1}{1 + e^{-x}}$$

 class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LogSigmoidOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  LogSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of LogSigmoid operator");
    AddOutput("Y", "Output of LogSigmoid operator");
    AddComment(R"DOC(
@@ -70,8 +68,8 @@ $$y = \log \frac{1}{1 + e^{-x}}$$

 class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ExpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ExpOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Exp operator");
    AddOutput("Y", "Output of Exp operator");
    AddComment(R"DOC(
@@ -85,8 +83,8 @@ $y = e^x$

 class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Relu operator");
    AddOutput("Y", "Output of Relu operator");
    AddComment(R"DOC(
@@ -100,9 +98,8 @@ $y = \max(x, 0)$

 class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LeakyReluOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of LeakyRelu operator");
    AddOutput("Y", "Output of LeakyRelu operator");
    AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
@@ -117,9 +114,8 @@ $y = \max(x, \alpha * x)$

 class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftShrinkOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Softshrink operator");
    AddOutput("Y", "Output of Softshrink operator");
    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
@@ -140,8 +136,8 @@ $$

 class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  TanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  TanhOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Tanh operator");
    AddOutput("Y", "Output of Tanh operator");
    AddComment(R"DOC(
@@ -155,9 +151,8 @@ $$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

 class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  TanhShrinkOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  TanhShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of TanhShrink operator");
    AddOutput("Y", "Output of TanhShrink operator");
    AddComment(R"DOC(
@@ -171,9 +166,8 @@ $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

 class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  HardShrinkOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of HardShrink operator");
    AddOutput("Y", "Output of HardShrink operator");
    AddAttr<float>("threshold", "The value of threshold for HardShrink")
@@ -195,8 +189,8 @@ $$

 class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SqrtOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Sqrt operator");
    AddOutput("Y", "Output of Sqrt operator");
    AddComment(R"DOC(
@@ -210,8 +204,8 @@ $y = \sqrt{x}$

 class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AbsOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  AbsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Abs operator");
    AddOutput("Y", "Output of Abs operator");
    AddComment(R"DOC(
@@ -225,8 +219,8 @@ $y = |x|$

 class CeilOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CeilOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  CeilOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Ceil operator");
    AddOutput("Y", "Output of Ceil operator");
    AddComment(R"DOC(
@@ -240,8 +234,8 @@ $y = ceil(x)$

 class FloorOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FloorOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  FloorOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Floor operator");
    AddOutput("Y", "Output of Floor operator");
    AddComment(R"DOC(
@@ -255,8 +249,8 @@ $y = floor(x)$

 class RoundOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RoundOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Round operator");
    AddOutput("Y", "Output of Round operator");
    AddComment(R"DOC(
@@ -270,9 +264,8 @@ $y = [x]$

 class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ReciprocalOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ReciprocalOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Reciprocal operator");
    AddOutput("Y", "Output of Reciprocal operator");
    AddComment(R"DOC(
@@ -286,8 +279,8 @@ $$y = \frac{1}{x}$$

 class LogOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LogOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  LogOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Log operator");
    AddOutput("Y", "Output of Log operator");
    AddComment(R"DOC(
@@ -303,8 +296,8 @@ Natural logarithm of x.

 class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SquareOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SquareOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Square operator");
    AddOutput("Y", "Output of Square operator");
    AddComment(R"DOC(
@@ -318,9 +311,8 @@ $y = x^2$

 class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftplusOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SoftplusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Softplus operator");
    AddOutput("Y", "Output of Softplus operator");
    AddComment(R"DOC(
@@ -334,9 +326,8 @@ $y = \ln(1 + e^{x})$

 class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftsignOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SoftsignOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Softsign operator");
    AddOutput("Y", "Output of Softsign operator");
    AddComment(R"DOC(
@@ -350,8 +341,8 @@ $$y = \frac{x}{1 + |x|}$$

 class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  BReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  BReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of BRelu operator");
    AddOutput("Y", "Output of BRelu operator");
    AddAttr<float>("t_min", "The min marginal value of BRelu")
@@ -369,9 +360,8 @@ $y = \max(\min(x, t_{min}), t_{max})$

 class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftReluOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SoftReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of SoftRelu operator");
    AddOutput("Y", "Output of SoftRelu operator");
    AddAttr<float>("threshold", "The threshold value of SoftRelu")
@@ -387,8 +377,8 @@ $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$

 class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ELUOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of ELU operator");
    AddOutput("Y", "Output of ELU operator");
    AddAttr<float>("alpha", "The alpha value of ELU").SetDefault(1.0f);
@@ -406,8 +396,8 @@ $y = \max(0, x) + \min(0, \alpha * (e^x - 1))$

 class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Relu6OpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  Relu6OpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Relu6 operator");
    AddOutput("Y", "Output of Relu6 operator");
    AddAttr<float>("threshold", "The threshold value of Relu6")
@@ -423,8 +413,8 @@ $y = \min(\max(0, x), 6)$

 class PowOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  PowOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  PowOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Pow operator");
    AddOutput("Y", "Output of Pow operator");
    AddAttr<float>("factor", "The exponential factor of Pow").SetDefault(1.0f);
@@ -439,8 +429,8 @@ $y = x^{factor}$

 class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  STanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  STanhOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of STanh operator");
    AddOutput("Y", "Output of STanh operator");
    AddAttr<float>("scale_a", "The scale parameter of a for the input")
@@ -458,9 +448,8 @@ $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$

 class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ThresholdedReluOpMaker(framework::OpProto *proto,
-                         framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ThresholdedReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of ThresholdedRelu operator");
    AddOutput("Y", "Output of ThresholdedRelu operator");
    AddAttr<float>("threshold", "The threshold location of activation")
@@ -481,9 +470,8 @@ $$

 class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  HardSigmoidOpMaker(framework::OpProto *proto,
-                     framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  HardSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of HardSigmoid operator");
    AddOutput("Y", "Output of HardSigmoid operator");
    AddAttr<float>("slope", "Slope for linear approximation of sigmoid")
@@ -508,8 +496,8 @@ It is recommended to use the defaults for this activation.

 class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SwishOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SwishOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Swish operator");
    AddOutput("Y", "Output of Swish operator");
    AddAttr<float>("beta", "Constant beta of swish operator").SetDefault(1.0f);

--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
@@ -59,8 +59,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {

 class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AdadeltaOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  AdadeltaOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");

--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
@@ -59,8 +59,7 @@ class AdagradOp : public framework::OperatorWithKernel {

 class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AdagradOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  AdagradOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");

--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
@@ -73,7 +73,7 @@ class AdamOp : public framework::OperatorWithKernel {

 class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AdamOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");

--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -67,7 +67,7 @@ class AdamaxOp : public framework::OperatorWithKernel {

 class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AdamaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  AdamaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");

--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -114,8 +114,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {

 class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ArrayToLoDTensorOpProtoMaker(framework::OpProto *proto,
-                               framework::OpAttrChecker *op_checker)
+  ArrayToLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(std::vector<LodTensor>) A vector of tensors that is going to "

--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
@@ -86,8 +86,7 @@ class AssignOp : public framework::OperatorBase {

 class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AssignOpProtoMaker(framework::OpProto *proto,
-                     framework::OpAttrChecker *op_checker)
+  AssignOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
@@ -109,8 +108,8 @@ class AssignInferShape : public framework::InferShapeBase {
  void operator()(framework::InferShapeContext *context) const override {
    if (context->HasInput("X")) {
      auto type = context->GetInputsVarType("X")[0];
-      if (type == framework::VarDesc_VarType_SELECTED_ROWS ||
-          type == framework::VarDesc_VarType_LOD_TENSOR) {
+      if (type == framework::proto::VarDesc_VarType_SELECTED_ROWS ||
+          type == framework::proto::VarDesc_VarType_LOD_TENSOR) {
        context->SetOutputDim("Out", context->GetInputDim("X"));
      }
    }

--- a/paddle/operators/auc_op.cc
+++ b/paddle/operators/auc_op.cc
@@ -49,7 +49,7 @@ class AucOp : public framework::OperatorWithKernel {

 class AucOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AucOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  AucOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Out",
             "A floating point 2D tensor, values are in the range [0, 1]."

--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -85,8 +85,7 @@ class BatchNormOp : public framework::OperatorWithKernel {

 class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  BatchNormOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  BatchNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddAttr<bool>("is_test", "").SetDefault(false);
    AddAttr<float>("momentum", "").SetDefault(0.9);

--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -83,9 +83,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {

 class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  BeamSearchDecodeOpProtoMaker(framework::OpProto* proto,
-                               framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  BeamSearchDecodeOpProtoMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Ids",
             "(LodTensorArray)"
             "score of the candidate words in each step");
@@ -123,10 +122,10 @@ class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
  void operator()(const framework::OpDescBind& op_desc,
                  framework::BlockDescBind* block) const override {
    for (auto& o : op_desc.Output("SentenceIds")) {
-      block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
    }
    for (auto& o : op_desc.Output("SentenceScores")) {
-      block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
    }
  }
 };

--- a/paddle/operators/beam_search_op.cc
+++ b/paddle/operators/beam_search_op.cc
@@ -153,8 +153,7 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
 class BeamSearchProtoAndCheckerMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
-  BeamSearchProtoAndCheckerMaker(framework::OpProto *proto,
-                                 framework::OpAttrChecker *op_checker)
+  BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // inputs and outputs stored in proto
    AddInput("pre_ids", "ids in previous step");

--- a/paddle/operators/bilinear_tensor_product_op.cc
+++ b/paddle/operators/bilinear_tensor_product_op.cc
@@ -65,8 +65,7 @@ class BilinearTensorProductOp : public framework::OperatorWithKernel {

 class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  BilinearTensorProductOpMaker(framework::OpProto* proto,
-                               framework::OpAttrChecker* op_checker)
+  BilinearTensorProductOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of bilinear_tensor_product operator.");
    AddInput("Y", "The second input of bilinear_tensor_product operator.");

--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
@@ -20,8 +20,7 @@ namespace operators {

 class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CastOpProtoMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  CastOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of cast op");
    AddOutput("Out", "The output tensor of cast op");

--- a/paddle/operators/cast_op.h
+++ b/paddle/operators/cast_op.h
@@ -55,7 +55,7 @@ class CastOpKernel : public framework::OpKernel<InT> {
    auto* in = context.Input<framework::Tensor>("X");
    auto* out = context.Output<framework::Tensor>("Out");
    framework::VisitDataType(
-        static_cast<framework::DataType>(context.Attr<int>("out_dtype")),
+        static_cast<framework::proto::DataType>(context.Attr<int>("out_dtype")),
        CastOpFunctor<DeviceContext, InT>(
            in, out, context.template device_context<DeviceContext>()));
  }

--- a/paddle/operators/chunk_eval_op.cc
+++ b/paddle/operators/chunk_eval_op.cc
@@ -32,6 +32,13 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
                   "Output(Recall) of ChunkEvalOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("F1-Score"),
                   "Output(F1-Score) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("NumInferChunks"),
+                   "Output(NumInferChunks) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("NumLabelChunks"),
+                   "Output(NumLabelChunks) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NumCorrectChunks"),
+        "Output(NumCorrectChunks) of ChunkEvalOp should not be null.");

    auto inference_dim = ctx->GetInputDim("Inference");
    auto label_dim = ctx->GetInputDim("Label");
@@ -42,20 +49,22 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Precision", {1});
    ctx->SetOutputDim("Recall", {1});
    ctx->SetOutputDim("F1-Score", {1});
+    ctx->SetOutputDim("NumInferChunks", {1});
+    ctx->SetOutputDim("NumLabelChunks", {1});
+    ctx->SetOutputDim("NumCorrectChunks", {1});
  }

 protected:
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(framework::DataType::FP32,
+    return framework::OpKernelType(framework::proto::DataType::FP32,
                                   ctx.device_context());
  }
 };

 class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ChunkEvalOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ChunkEvalOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Inference",
             "(Tensor, default: Tensor<int64_t>). "
@@ -70,6 +79,16 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
              "sensitivity) of chunks on the given mini-batch.");
    AddOutput("F1-Score",
              "(float). The evaluated F1-Score on the given mini-batch.");
+    AddOutput("NumInferChunks",
+              "(int64_t). The number of chunks in Inference on the given "
+              "mini-batch.");
+    AddOutput(
+        "NumLabelChunks",
+        "(int64_t). The number of chunks in Label on the given mini-batch.");
+    AddOutput(
+        "NumCorrectChunks",
+        "(int64_t). The number of chunks both in Inference and Label on the "
+        "given mini-batch.");
    AddAttr<int>("num_chunk_types",
                 "(int). The number of chunk type. See below for details.");
    AddAttr<std::string>(

--- a/paddle/operators/chunk_eval_op.h
+++ b/paddle/operators/chunk_eval_op.h
@@ -111,9 +111,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
    std::vector<Segment> label_segments;
    std::vector<Segment> output_segments;
    std::set<int> excluded_chunk_types;
-    int64_t num_output_segments = 0;
-    int64_t num_label_segments = 0;
-    int64_t num_correct = 0;
+
    if (context.Attr<std::string>("chunk_scheme") == "IOB") {
      num_tag_types = 2;
      tag_begin = 0;
@@ -151,12 +149,24 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
    auto* precision = context.Output<Tensor>("Precision");
    auto* recall = context.Output<Tensor>("Recall");
    auto* f1 = context.Output<Tensor>("F1-Score");
+    auto* num_infer_chunks = context.Output<Tensor>("NumInferChunks");
+    auto* num_label_chunks = context.Output<Tensor>("NumLabelChunks");
+    auto* num_correct_chunks = context.Output<Tensor>("NumCorrectChunks");

    const int64_t* inference_data = inference->data<int64_t>();
    const int64_t* label_data = label->data<int64_t>();
    T* precision_data = precision->mutable_data<T>(context.GetPlace());
    T* racall_data = recall->mutable_data<T>(context.GetPlace());
    T* f1_data = f1->mutable_data<T>(context.GetPlace());
+    int64_t* num_infer_chunks_data =
+        num_infer_chunks->mutable_data<int64_t>(context.GetPlace());
+    int64_t* num_label_chunks_data =
+        num_label_chunks->mutable_data<int64_t>(context.GetPlace());
+    int64_t* num_correct_chunks_data =
+        num_correct_chunks->mutable_data<int64_t>(context.GetPlace());
+    *num_infer_chunks_data = 0;
+    *num_label_chunks_data = 0;
+    *num_correct_chunks_data = 0;

    auto lod = label->lod();
    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
@@ -166,16 +176,22 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
    for (int i = 0; i < num_sequences; ++i) {
      int seq_length = lod[0][i + 1] - lod[0][i];
      EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
-                 output_segments, label_segments, num_output_segments,
-                 num_label_segments, num_correct, num_chunk_types,
-                 num_tag_types, other_chunk_type, tag_begin, tag_inside,
-                 tag_end, tag_single, excluded_chunk_types);
-    }
-    *precision_data = !num_output_segments ? 0 : static_cast<T>(num_correct) /
-                                                     num_output_segments;
-    *racall_data = !num_label_segments ? 0 : static_cast<T>(num_correct) /
-                                                 num_label_segments;
-    *f1_data = !num_correct ? 0 : 2 * (*precision_data) * (*racall_data) /
+                 output_segments, label_segments, *num_infer_chunks_data,
+                 *num_label_chunks_data, *num_correct_chunks_data,
+                 num_chunk_types, num_tag_types, other_chunk_type, tag_begin,
+                 tag_inside, tag_end, tag_single, excluded_chunk_types);
+    }
+    *precision_data = !(*num_infer_chunks_data)
+                          ? 0
+                          : static_cast<T>(*num_correct_chunks_data) /
+                                (*num_infer_chunks_data);
+    *racall_data = !(*num_label_chunks_data)
+                       ? 0
+                       : static_cast<T>(*num_correct_chunks_data) /
+                             (*num_label_chunks_data);
+    *f1_data = !(*num_correct_chunks_data)
+                   ? 0
+                   : 2 * (*precision_data) * (*racall_data) /
                         ((*precision_data) + (*racall_data));
  }


--- a/paddle/operators/clip_by_norm_op.cc
+++ b/paddle/operators/clip_by_norm_op.cc
@@ -37,8 +37,7 @@ class ClipByNormOp : public framework::OperatorWithKernel {

 class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ClipByNormOpMaker(framework::OpProto* proto,
-                    framework::OpAttrChecker* op_checker)
+  ClipByNormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor) The input of clip_by_norm op."

--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -38,7 +38,7 @@ class ClipOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ClipOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor)The input of clip op."

--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
@@ -20,8 +20,7 @@ namespace operators {
 template <typename OpComment>
 class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CompareOpProtoMaker(framework::OpProto *proto,
-                      framework::OpAttrChecker *op_checker)
+  CompareOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    OpComment comment;
    AddInput("X",

--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -58,7 +58,7 @@ class ConcatOp : public framework::OperatorWithKernel {

 class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ConcatOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ConcatOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input tensors of concat operator.").AsDuplicable();
    AddOutput("Out", "Output tensor of concat operator.");

--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -205,8 +205,7 @@ void CondOp::Run(const Scope& scope,

 class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CondOpProtoAndCheckerMaker(framework::OpProto* proto,
-                             framework::OpAttrChecker* op_checker)
+  CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Cond", "The condition, which is a bool vector");
    AddInput("Xs", "Inputs of Subnets").AsDuplicable();

--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/operators/conditional_block_op.cc
@@ -74,8 +74,7 @@ class ConditionalBlockOp : public ConditionalOp {

 class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ConditionalBlockOpProtoMaker(framework::OpProto *proto,
-                               framework::OpAttrChecker *op_checker)
+  ConditionalBlockOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The conditional variable of this operator. If X is empty, the "

--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -19,8 +19,7 @@ namespace operators {

 class CudnnConv2DOpMaker : public Conv2DOpMaker {
 public:
-  CudnnConv2DOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  CudnnConv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : Conv2DOpMaker(proto, op_checker) {
    AddAttr<int>("workspace_size_MB",
                 "workspace size for cudnn, in MB, "
@@ -34,8 +33,7 @@ class CudnnConv2DOpMaker : public Conv2DOpMaker {

 class CudnnConv3DOpMaker : public Conv3DOpMaker {
 public:
-  CudnnConv3DOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  CudnnConv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : Conv3DOpMaker(proto, op_checker) {
    AddAttr<int>("workspace_size_MB",
                 "workspace size for cudnn, in MB, "

--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -66,8 +66,7 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }

-Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
-                             framework::OpAttrChecker* op_checker)
+Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput(
      "Input",
@@ -138,8 +137,7 @@ $$
 )DOC");
 }

-Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
-                             framework::OpAttrChecker* op_checker)
+Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput(
      "Input",

--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@@ -50,14 +50,12 @@ inline bool IsExpand(std::vector<int64_t>& filter_dim,
 // operator implementations can reuse the code.
 class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Conv2DOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
+  Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };

 class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Conv3DOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
+  Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };

 class ConvOp : public framework::OperatorWithKernel {

--- a/paddle/operators/conv_shift_op.cc
+++ b/paddle/operators/conv_shift_op.cc
@@ -75,8 +75,7 @@ class ConvShiftGradOp : public framework::OperatorWithKernel {

 class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ConvShiftOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ConvShiftOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "

--- a/paddle/operators/conv_transpose_cudnn_op.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cc
@@ -19,8 +19,7 @@ namespace operators {

 class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
 public:
-  CudnnConv2DTransposeOpMaker(framework::OpProto* proto,
-                              framework::OpAttrChecker* op_checker)
+  CudnnConv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : Conv2DTransposeOpMaker(proto, op_checker) {
    AddAttr<int>("workspace_size_MB",
                 "workspace size for cudnn, in MB, "
@@ -34,8 +33,7 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {

 class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker {
 public:
-  CudnnConv3DTransposeOpMaker(framework::OpProto* proto,
-                              framework::OpAttrChecker* op_checker)
+  CudnnConv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : Conv3DTransposeOpMaker(proto, op_checker) {
    AddAttr<int>("workspace_size_MB",
                 "workspace size for cudnn, in MB, "

--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
@@ -58,8 +58,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }

-Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
-    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto,
+                                               OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput(
      "Input",
@@ -123,8 +123,8 @@ Example:
 )DOC");
 }

-Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
-    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto,
+                                               OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("Input",
           "(Tensor) The input tensor of convolution transpose operator."

--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
@@ -30,14 +30,12 @@ using DDim = framework::DDim;
 // operator implementations can reuse the code.
 class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Conv2DTransposeOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker);
+  Conv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };

 class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Conv3DTransposeOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker);
+  Conv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };

 class ConvTransposeOp : public framework::OperatorWithKernel {

--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -62,7 +62,7 @@ class CosSimOp : public framework::OperatorWithKernel {

 class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CosSimOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  CosSimOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The 1st input of cos_sim op.");
    AddInput("Y", "The 2nd input of cos_sim op.");

--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/operators/crf_decoding_op.cc
@@ -18,8 +18,7 @@ namespace paddle {
 namespace operators {
 class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CRFDecodingOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  CRFDecodingOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Emission",
             "(LoDTensor, default: LoDTensor<float>). A LoDTensor with shape "

--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
@@ -52,7 +52,7 @@ class CropOp : public framework::OperatorWithKernel {

 class CropOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CropOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  CropOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input of pad op. "
@@ -88,7 +88,8 @@ There are two ways to set shape:

 The input should be a k-D tensor(k > 0 and k < 7). As an example:

-Given:
+Case 1:
+Given

    X = [[0, 1, 2, 0, 0]
         [0, 3, 4, 0, 0]
@@ -107,6 +108,27 @@ we get:
    Out = [[1, 2],
           [3, 4]].

+
+Case 2:
+Given
+
+    X = [[0, 1, 2, 5, 0]
+         [0, 3, 4, 6, 0]
+         [0, 0, 0, 0, 0]],
+
+and
+
+    offsets = [0, 1],
+
+and
+
+    Y = [[0, 0, 0]
+         [0, 0, 0]],
+
+we get:
+
+    Out = [[1, 2, 5],
+           [3, 4, 6]].
 )DOC");
  }
 };

--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -111,8 +111,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {

 class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CrossEntropyOpMaker(framework::OpProto* proto,
-                      framework::OpAttrChecker* op_checker)
+  CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "

--- a/paddle/operators/decayed_adagrad_op.cc
+++ b/paddle/operators/decayed_adagrad_op.cc
@@ -55,8 +55,7 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {

 class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  DecayedAdagradOpMaker(framework::OpProto *proto,
-                        framework::OpAttrChecker *op_checker)
+  DecayedAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");

--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -40,8 +40,7 @@ class DropoutOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  DropoutOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of dropout op.");
    AddOutput("Out", "The output of dropout op.");

--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -71,7 +71,7 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
      auto M = EigenMatrix<T>::Reshape(*mask, 1);
      Y.device(place) = X * M;
    } else {
-      Y.device(place) = X * dropout_prob;
+      Y.device(place) = X * (1.0f - dropout_prob);
    }
  }
 };

--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -57,7 +57,7 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
      auto Y = EigenMatrix<T>::Reshape(*y, 1);
      auto& place =
          *context.template device_context<DeviceContext>().eigen_device();
-      Y.device(place) = X * dropout_prob;
+      Y.device(place) = X * (1.0f - dropout_prob);
    }
  }
 };

--- a/paddle/operators/elementwise_add_op.cc
+++ b/paddle/operators/elementwise_add_op.cc
@@ -19,8 +19,7 @@ namespace paddle {
 namespace operators {
 class ElementwiseAddOpMaker : public ElementwiseOpMaker {
 public:
-  ElementwiseAddOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : ElementwiseOpMaker(proto, op_checker) {
    SetComment("Add", "$Out = X + Y$");
    AddComment(comment_);

--- a/paddle/operators/elementwise_div_op.cc
+++ b/paddle/operators/elementwise_div_op.cc
@@ -19,8 +19,7 @@ namespace paddle {
 namespace operators {
 class ElementwiseDivOpMaker : public ElementwiseOpMaker {
 public:
-  ElementwiseDivOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : ElementwiseOpMaker(proto, op_checker) {
    SetComment("Div", "$Out = X / Y$");
    AddComment(comment_);

--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -20,8 +20,7 @@ namespace operators {

 class ElementwiseMulOpMaker : public ElementwiseOpMaker {
 public:
-  ElementwiseMulOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : ElementwiseOpMaker(proto, op_checker) {
    SetComment("Mul", "$Out = X \\odot\\ Y$");
    AddComment(comment_);

--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
@@ -43,8 +43,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {

 class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ElementwiseOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The first input tensor of elementwise op");
    AddInput("Y", "(Tensor) The second input tensor of elementwise op");

--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -103,11 +103,13 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext> {

  MidWiseTransformIterator<T, platform::CPUDeviceContext>& operator++() {
    ++j_;
-    i_ = j_ / post_;
-    if (UNLIKELY(i_ == n_)) {
+    if (UNLIKELY(j_ == post_)) {
+      ++i_;
      j_ = 0;
+      if (UNLIKELY(i_ == n_)) {
        i_ = 0;
      }
+    }
    return *this;
  }

@@ -125,10 +127,10 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext> {

 private:
  const T* ptr_;
-  int i_;
+  int64_t i_;
  int64_t j_;
  int64_t n_;
-  int post_;
+  int64_t post_;
 };

 #ifdef __NVCC__

--- a/paddle/operators/elementwise_sub_op.cc
+++ b/paddle/operators/elementwise_sub_op.cc
@@ -19,8 +19,7 @@ namespace paddle {
 namespace operators {
 class ElementwiseSubOpMaker : public ElementwiseOpMaker {
 public:
-  ElementwiseSubOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : ElementwiseOpMaker(proto, op_checker) {
    SetComment("Sub", "$Out = X - Y$");
    AddComment(comment_);

--- a/paddle/operators/expand_op.cc
+++ b/paddle/operators/expand_op.cc
@@ -55,7 +55,7 @@ class ExpandOp : public framework::OperatorWithKernel {

 class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."

--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -54,8 +54,7 @@ class FeedOp : public framework::OperatorBase {

 class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FeedOpInfoMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  FeedOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of feed op");
    AddOutput("Out", "The output of feed op");

--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -61,8 +61,7 @@ class FetchOp : public framework::OperatorBase {

 class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FetchOpInfoMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  FetchOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of fetch op");
    AddOutput("Out", "The output of fetch op");

--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -52,7 +52,7 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
-        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
+        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
        ctx.device_context());
  }
 };
@@ -60,13 +60,12 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
 class FillConstantBatchSizeLikeOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
-  FillConstantBatchSizeLikeOpMaker(framework::OpProto *proto,
-                                   framework::OpAttrChecker *op_checker)
+  FillConstantBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
    AddInput("Input",
             "(Tensor) Tensor "
             "whose dim_idx th dimension is used to specify the batch_size");

--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -34,7 +34,8 @@ class FillConstantOp : public framework::OperatorBase {
  using framework::OperatorBase::OperatorBase;
  void Run(const framework::Scope &scope,
           const platform::DeviceContext &dev_ctx) const override {
-    auto data_type = static_cast<framework::DataType>(Attr<int>("dtype"));
+    auto data_type =
+        static_cast<framework::proto::DataType>(Attr<int>("dtype"));
    auto value = Attr<float>("value");
    auto force_cpu = Attr<bool>("force_cpu");
    auto &out =
@@ -52,13 +53,12 @@ class FillConstantOp : public framework::OperatorBase {

 class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FillConstantOpMaker(framework::OpProto *proto,
-                      framework::OpAttrChecker *op_checker)
+  FillConstantOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
    AddAttr<float>("value", "(float, default 0) The value to be filled")
        .SetDefault(0.0f);

--- a/paddle/operators/fill_op.cc
+++ b/paddle/operators/fill_op.cc
@@ -48,7 +48,7 @@ class FillOp : public framework::OperatorBase {
                                "Cannot find variable %s", Output("Out"))
                        .GetMutable<framework::LoDTensor>());
    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
-    auto dtype = static_cast<framework::DataType>(Attr<int>("dtype"));
+    auto dtype = static_cast<framework::proto::DataType>(Attr<int>("dtype"));
    platform::CPUPlace cpu;
    auto force_cpu = Attr<bool>("force_cpu");
    out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(),
@@ -76,7 +76,7 @@ class FillOp : public framework::OperatorBase {

 class FillOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FillOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  FillOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddComment(R"DOC(Fill operator

@@ -88,7 +88,7 @@ Fill an tensor with `value` and `shape`. The type of the tensor is specify by
        "value", "The float values of tensor, which are flatten in row major");
    AddAttr<std::vector<int>>("shape", "The shape of output tensor");
    AddAttr<int>("dtype", "The data type of output tensor, Default is float")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
    AddAttr<bool>("force_cpu",
                  "Whether the output tensor must be at CPU memory or not. "
                  "Default is false.")

--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -33,8 +33,7 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {

 class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FillZerosLikeOpMaker(framework::OpProto *proto,
-                       framework::OpAttrChecker *op_checker)
+  FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of fill-zeros-like op.");
    AddOutput("Y", "The variable will be filled up with zeros.");

--- a/paddle/operators/ftrl_op.cc
+++ b/paddle/operators/ftrl_op.cc
@@ -57,7 +57,7 @@ class FTRLOp : public framework::OperatorWithKernel {

 class FTRLOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FTRLOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  FTRLOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param",
             "(Tensor, default Tensor<float>) "

--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -67,7 +67,7 @@ class GatherGradOp : public framework::OperatorWithKernel {

 class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  GatherOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  GatherOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The source input of gather op");
    AddInput("Index", "The index input of gather op");

--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -60,15 +60,14 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
-        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
+        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
        ctx.device_context());
  }
 };

 class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  GaussianRandomOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  GaussianRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput("Out", "Output matrix of gaussian random op");

@@ -91,7 +90,7 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<int>("dtype",
                 "(int, default 5(FP32)) "
                 "Output data type.")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);

    AddComment(R"DOC(
 GaussianRandom Operator.

--- a/paddle/operators/gru_op.cc
+++ b/paddle/operators/gru_op.cc
@@ -67,7 +67,7 @@ class GRUOp : public framework::OperatorWithKernel {

 class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  GRUOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  GRUOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Input",
             "(LoDTensor) The first input is a LodTensor, which supports "

--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
@@ -71,8 +71,7 @@ class GRUUnitOp : public framework::OperatorWithKernel {

 class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  GRUUnitOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  GRUUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Input",
             "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "

--- a/paddle/operators/hinge_loss_op.cc
+++ b/paddle/operators/hinge_loss_op.cc
@@ -46,8 +46,7 @@ class HingeLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  HingeLossOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  HingeLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Logits",
             "The input value (Logits) of Hinge loss op."

--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
@@ -45,8 +45,7 @@ class HuberLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  HuberLossOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  HuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input value of huber loss op."

--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
@@ -70,8 +70,7 @@ class IncrementOp : public framework::OperatorBase {

 class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  IncrementOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  IncrementOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input tensor of increment operator");
    AddOutput("Out", "(Tensor) The output tensor of increment operator.");

--- a/paddle/operators/is_empty_op.cc
+++ b/paddle/operators/is_empty_op.cc
@@ -47,8 +47,7 @@ class IsEmptyOp : public framework::OperatorBase {

 class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  IsEmptyOpProtoMaker(framework::OpProto *proto,
-                      framework::OpAttrChecker *op_checker)
+  IsEmptyOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(kInput, "(Tensor) Tensor which is to be checked.");
    AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not.");

--- a/paddle/operators/l1_norm_op.cc
+++ b/paddle/operators/l1_norm_op.cc
@@ -48,7 +48,7 @@ class L1NormGradOp : public framework::OperatorWithKernel {

 class L1NormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  L1NormOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  L1NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input of l1_norm op.");
    AddOutput("Out", "(Scalar) The output of l1_norm op.");

--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -19,8 +19,7 @@ namespace operators {

 class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LinearChainCRFOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  LinearChainCRFOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Emission",
             "(LoDTensor, default LoDTensor<float>) "

--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -58,8 +58,7 @@ class LoadOp : public framework::OperatorBase {

 class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LoadOpProtoMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  LoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput("Out", "(Tensor) The tensor need to be loaded");
    AddAttr<std::string>("file_path",

--- a/paddle/operators/lod_array_length_op.cc
+++ b/paddle/operators/lod_array_length_op.cc
@@ -38,8 +38,7 @@ class LoDArrayLengthOp : public framework::OperatorBase {

 class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LoDArrayLengthProtoMaker(framework::OpProto *proto,
-                           framework::OpAttrChecker *op_checker)
+  LoDArrayLengthProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(LoDTensorArray) The input tensor array.");
    AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t");

--- a/paddle/operators/lod_rank_table_op.cc
+++ b/paddle/operators/lod_rank_table_op.cc
@@ -30,13 +30,13 @@ class LoDRankTableOp : public framework::OperatorBase {
        scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
    VLOG(10) << "Level = " << static_cast<size_t>(Attr<int>("level"));
    out->Reset(x.lod(), static_cast<size_t>(Attr<int>("level")));
+    VLOG(10) << Input("X") << "'s lod information is " << *out;
  }
 };

 class LoDRankTableOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LoDRankTableOpProtoMaker(framework::OpProto *proto,
-                           framework::OpAttrChecker *op_checker)
+  LoDRankTableOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LoDTensor) input lod tensor, must contain lod information.");
@@ -67,7 +67,7 @@ class LoDRankTableInferVarType : public framework::VarTypeInference {
                  framework::BlockDescBind *block) const override {
    for (auto &o : op_desc.Output("Out")) {
      block->FindRecursiveOrCreateVar(o)->SetType(
-          framework::VarDesc::LOD_RANK_TABLE);
+          framework::proto::VarDesc::LOD_RANK_TABLE);
    }
  }
 };

--- a/paddle/operators/lod_reset_op.cc
+++ b/paddle/operators/lod_reset_op.cc
@@ -48,8 +48,7 @@ class LoDResetOp : public framework::OperatorWithKernel {

 class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LoDResetOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(LoDTensor) The input tensor of lod_reset operator.");
    AddInput("TargetLoD",

--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -97,8 +97,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase {

 class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LoDTensorToArrayOpProtoMaker(framework::OpProto *proto,
-                               framework::OpAttrChecker *op_checker)
+  LoDTensorToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "");
    AddInput("RankTable", "");
@@ -131,7 +130,7 @@ class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
  void operator()(const framework::OpDescBind &op_desc,
                  framework::BlockDescBind *block) const override {
    for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
+      block->Var(out_var)->SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY);
    }
  }
 };

--- a/paddle/operators/log_loss_op.cc
+++ b/paddle/operators/log_loss_op.cc
@@ -46,8 +46,7 @@ class LogLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LogLossOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  LogLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Predicted",
             "The input value (Predicted) of Log loss op."

--- a/paddle/operators/logical_op.cc
+++ b/paddle/operators/logical_op.cc
@@ -20,8 +20,7 @@ namespace operators {
 template <typename OpComment>
 class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  BinaryLogicalOpProtoMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  BinaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    OpComment comment;
    AddInput("X",
@@ -45,8 +44,7 @@ Each element of Out is calculated by %s
 template <typename OpComment>
 class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  UnaryLogicalOpProtoMaker(framework::OpProto *proto,
-                           framework::OpAttrChecker *op_checker)
+  UnaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    OpComment comment;
    AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator",

--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -51,8 +51,7 @@ class LookupTableOp : public framework::OperatorWithKernel {

 class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LookupTableOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("W",
             "An input represents embedding tensors, "
@@ -117,11 +116,12 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
    if (is_sparse) {
      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
              << " is set to SelectedRows";
-      block->Var(out_var_name)->SetType(framework::VarDesc::SELECTED_ROWS);
+      block->Var(out_var_name)
+          ->SetType(framework::proto::VarDesc::SELECTED_ROWS);
    } else {
      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
              << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::VarDesc::LOD_TENSOR);
+      block->Var(out_var_name)->SetType(framework::proto::VarDesc::LOD_TENSOR);
    }
  }
 };

--- a/paddle/operators/lrn_op.cc
+++ b/paddle/operators/lrn_op.cc
@@ -140,7 +140,7 @@ class LRNOp : public framework::OperatorWithKernel {
 template <typename T>
 class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LRNOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  LRNOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor) The input of LRN operator. "

--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
@@ -102,7 +102,7 @@ class LSTMOp : public framework::OperatorWithKernel {

 class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LSTMOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  LSTMOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Input",
             "(LoDTensor) the first input is a LodTensor, which support "

--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -48,10 +48,12 @@ class LstmUnitOp : public framework::OperatorWithKernel {

 class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LstmUnitOpMaker(framework::OpProto* proto,
-                  framework::OpAttrChecker* op_checker)
+  LstmUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "FC input before the non-linear activation.");
+    AddInput("X",
+             "Lstm unit only applies non-linear activations, please make sure"
+             "that linear tranformation has already been applied to `X`. "
+             "Linear tranformation can be applied by adding a `fc` layer");
    AddInput(
        "C_prev",
        "The cell state tensor of last time-step in the Lstm Unit operator.");

--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/operators/margin_rank_loss_op.cc
@@ -42,8 +42,7 @@ class MarginRankLossOp : public framework::OperatorWithKernel {
 template <typename T>
 class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MarginRankLossOpMaker(framework::OpProto *proto,
-                        framework::OpAttrChecker *op_checker)
+  MarginRankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X1",
             "(2-D tensor with shape [batch_size x 1]) The score for "

--- a/paddle/operators/math/im2col.cc
+++ b/paddle/operators/math/im2col.cc
@@ -61,14 +61,13 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,

    const T* im_data = im.data<T>();
    T* col_data = col->data<T>();
-
    for (int c = 0; c < channels_col; ++c) {
      int w_offset = c % filter_width;
      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / filter_width / filter_height;
+      int c_im = c / (filter_width * filter_height);
      for (int h = 0; h < col_height; ++h) {
-        for (int w = 0; w < col_width; ++w) {
        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
+        for (int w = 0; w < col_width; ++w) {
          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
          int col_idx = (c * col_height + h) * col_width + w;
          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
@@ -130,16 +129,14 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
    for (int c = 0; c < channels_col; ++c) {
      int w_offset = c % filter_width;
      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / filter_width / filter_height;
+      int c_im = c / (filter_width * filter_height);
      for (int h = 0; h < col_height; ++h) {
-        for (int w = 0; w < col_width; ++w) {
        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
+        for (int w = 0; w < col_width; ++w) {
          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-
          if ((im_row_idx) >= 0 && (im_row_idx) < im_height &&
              (im_col_idx) >= 0 && (im_col_idx) < im_width) {
-            im_row_idx += c_im * im_height;
-            im_data[im_row_idx * im_width + im_col_idx] +=
+            im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] +=
                col_data[(c * col_height + h) * col_width + w];
          }
        }
@@ -199,12 +196,13 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
        for (int channel = 0; channel < im_channels; ++channel) {
          for (int filter_row_idx = 0; filter_row_idx < filter_height;
               ++filter_row_idx) {
-            for (int filter_col_idx = 0; filter_col_idx < filter_width;
-                 ++filter_col_idx) {
            int im_row_offset =
                col_row_idx * stride[0] + filter_row_idx - padding[0];
+            for (int filter_col_idx = 0; filter_col_idx < filter_width;
+                 ++filter_col_idx) {
              int im_col_offset =
                  col_col_idx * stride[1] + filter_col_idx - padding[1];
+
              int col_offset =
                  ((((col_row_idx)*col_width + col_col_idx) * im_channels +
                    channel) *
@@ -271,12 +269,13 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
        for (int channel = 0; channel < im_channels; ++channel) {
          for (int filter_row_idx = 0; filter_row_idx < filter_height;
               ++filter_row_idx) {
-            for (int filter_col_idx = 0; filter_col_idx < filter_width;
-                 ++filter_col_idx) {
            int im_row_offset =
                col_row_idx * stride[0] + filter_row_idx - padding[0];
+            for (int filter_col_idx = 0; filter_col_idx < filter_width;
+                 ++filter_col_idx) {
              int im_col_offset =
                  col_col_idx * stride[1] + filter_col_idx - padding[1];
+
              int col_offset =
                  (((col_row_idx * col_width + col_col_idx) * im_channels +
                    channel) *
@@ -284,6 +283,7 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                   filter_row_idx) *
                      filter_width +
                  filter_col_idx;
+
              if (im_row_offset >= 0 && im_row_offset < im_height &&
                  im_col_offset >= 0 && im_col_offset < im_width) {
                int im_offset =

--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -277,6 +277,14 @@ void set_constant_with_place<platform::CPUPlace>(
                           TensorSetConstantCPU(tensor, value));
 }

+template <>
+void set_constant_with_place<platform::MKLDNNPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  framework::VisitDataType(framework::ToDataType(tensor->type()),
+                           TensorSetConstantCPU(tensor, value));
+}
+
 struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
  TensorSetConstantWithPlace(const platform::DeviceContext& context,
                             framework::Tensor* tensor, float value)

--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -274,7 +274,7 @@ void set_constant_with_place<platform::GPUPlace>(
 }

 template <>
-void set_constant_with_place<platform::CudnnPlace>(
+void set_constant_with_place<platform::CUDNNPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
    float value) {
  set_constant_with_place<platform::GPUPlace>(context, tensor, value);

--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
@@ -130,7 +130,7 @@ class MatMulOp : public framework::OperatorWithKernel {

 class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MatMulOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MatMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of MatMul op");
    AddInput("Y", "The second input of MatMul op");

--- a/paddle/operators/max_sequence_len_op.cc
+++ b/paddle/operators/max_sequence_len_op.cc
@@ -40,8 +40,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase {

 class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MaxSeqenceLenOpProtoMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  MaxSeqenceLenOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("RankTable", "The lod_rank_table.");
    AddOutput("Out", "The max sequence length.");

--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
@@ -20,7 +20,7 @@ using framework::Tensor;

 class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MaxOutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",

--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -32,7 +32,7 @@ class MeanOp : public framework::OperatorWithKernel {

 class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MeanOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MeanOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of mean op");
    AddOutput("Out", "The output of mean op");

--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -114,8 +114,7 @@ class MergeLoDTensorOp : public framework::OperatorBase {

 class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MergeLoDTensorOpProtoMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+  MergeLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input LoDTensor, contains complete lod information to "

--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -46,7 +46,7 @@ class MinusOp : public framework::OperatorWithKernel {

 class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The left tensor of minus operator.");
    AddInput("Y", "The right tensor of minus operator.");

--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -39,8 +39,7 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {

 class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ModifiedHuberLossOpMaker(framework::OpProto* proto,
-                           framework::OpAttrChecker* op_checker)
+  ModifiedHuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input tensor of modified huber loss op. "

--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
@@ -54,8 +54,7 @@ class MomentumOp : public framework::OperatorWithKernel {

 class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MomentumOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  MomentumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param",
             "(Tensor, default Tensor<float>) "

--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -71,7 +71,7 @@ class MulOpShapeInference : public framework::InferShapeBase {

 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MulOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of mul op");
    AddInput("Y", "The second input of mul op");

--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -61,8 +61,7 @@ class MultiplexOp : public framework::OperatorWithKernel {

 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MultiplexOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  MultiplexOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Ids", "The index tensor of multiplex operator.");
    AddInput("X", "The candidate tensors of multiplex operator.")

--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -43,8 +43,7 @@ class NCCLInitOp : public framework::OperatorBase {

 class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  NCCLInitOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput("Communicator",
              "Create Communicator for communicating between gpus");
@@ -52,7 +51,7 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
    AddComment(R"DOC(
 NCCLInit Operator.

@@ -141,8 +140,7 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
 // AllreduceOp
 class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  NCCLAllReduceOpMaker(framework::OpProto *proto,
-                       framework::OpAttrChecker *op_checker)
+  NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of AllReduce op");
    AddInput("Communicator", "Communicator for communicating between gpus");
@@ -163,8 +161,7 @@ AllReduce the input tensors.
 // ReduceOp
 class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  NCCLReduceOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
+  NCCLReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of Reduce op");
    AddInput("Communicator", "Communicator for communicating between gpus");
@@ -190,8 +187,7 @@ Reduce the tensors.
 // BcastOp
 class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  NCCLBcastOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  NCCLBcastOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of BcastSend op");
    AddInput("Communicator", "Communicator for communicating between gpus");

--- a/paddle/operators/nce_op.cc
+++ b/paddle/operators/nce_op.cc
@@ -73,7 +73,7 @@ class NCEOp : public framework::OperatorWithKernel {

 class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  NCEOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  NCEOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim].");
    AddInput(

--- a/paddle/operators/batch_norm_op.md
+++ b/paddle/operators/batch_norm_op.md
--- a/paddle/operators/name_convention.md
+++ b/paddle/operators/name_convention.md
@@ -35,8 +35,8 @@ Here we give some examples to show how these rules will be used.
 ```c++
 class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AccumulateOpMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  AccumulateOpMaker(OpProto *proto,
+                    OpAttrChecker *op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
    If the output size is not the same as input size,

--- a/paddle/operators/net_op_design.md
+++ b/paddle/operators/net_op_design.md
--- a/paddle/operators/op_documentation/op_markdown_format.md
+++ b/paddle/operators/op_documentation/op_markdown_format.md
+# Standard Markdown Format for Operators
+The following should be the standard format for documentation for all the operators that will get rendered in the `html`:
+
+```
+Operator Name (In PaddlePaddle)
+
+Operator Name (Standard)
+
+Operator description.
+
+LaTeX equation of how the operator performs an update.
+
+The signature of the operator.
+```
+
+Each section mentioned above has been covered in further detail in the rest of the document.
+
+# PaddlePaddle Operator Name
+This should be in all small letters, in case of multiple words, we separate them with an underscore. For example:
+`array to lod tensor` should be written as `array_to_lod_tensor`.
+
+This naming convention should be standard across all PaddlePaddle operators.
+
+# Standard Operator Name
+This is the standard name of the operator as used in the community. The general standard is usually:
+- Standard abbreviations like `SGD` are written in all capital letters.
+- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word).
+- Keep numbers inside a word as is, with no boundary delimiters.
+- Follow the name of the operator with the keyword: `Activation Operator.`
+
+# Operator description
+This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
+
+# LaTeX equation
+This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`).
+
+# The signature
+This section describes the signature of the operator. A list of Inputs and Outputs, each of which have a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is:
+`Section :
+VariableName : (VariableType) VariableDescription
+...
+...
+`
+
+
+The following example for an `sgd` operator covers the above mentioned sections as they would ideally look like in the `html`:
+
+```
+sgd
+
+SGD operator
+
+This operator implements one step of the stochastic gradient descent algorithm.
+
+param_out = param_learning_rate * grad
+
+Inputs:
+Param : (Tensor) Input parameter
+LearningRate : (Tensor) Learning rate of SGD
+Grad : (Tensor) Input gradient
+
+Outputs:
+ParamOut : (Tensor) Output parameter
+```
--- a/paddle/operators/rnn_design.md
+++ b/paddle/operators/rnn_design.md
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -48,7 +48,7 @@ class PadOp : public framework::OperatorWithKernel {

 class PadOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  PadOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  PadOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input of pad op. "

--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -67,8 +67,7 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
  ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
 }

-Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput(
      "X",
@@ -136,8 +135,7 @@ Example:
 )DOC");
 }

-Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X",
           "(Tensor) The input tensor of pooling operator. "

--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
@@ -40,14 +40,12 @@ class PoolOpGrad : public framework::OperatorWithKernel {

 class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Pool2dOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
+  Pool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };

 class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Pool3dOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
+  Pool3dOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };

 template <typename DeviceContext, typename T>

--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -100,8 +100,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {

 class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MaxPool2dWithIndexOpMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  MaxPool2dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",
@@ -178,8 +177,7 @@ Example:

 class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MaxPool3dWithIndexOpMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  MaxPool3dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor) The input tensor of pooling operator. "

--- a/paddle/operators/positive_negative_pair_op.cc
+++ b/paddle/operators/positive_negative_pair_op.cc
@@ -95,8 +95,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {

 class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  PositiveNegativePairOpMaker(framework::OpProto *proto,
-                              framework::OpAttrChecker *op_checker)
+  PositiveNegativePairOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Score",
             "(Tensor, float) Model Score on an item (with "

--- a/paddle/operators/precision_recall_op.cc
+++ b/paddle/operators/precision_recall_op.cc
@@ -90,8 +90,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {

 class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  PrecisionRecallOpMaker(framework::OpProto *proto,
-                         framework::OpAttrChecker *op_checker)
+  PrecisionRecallOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("MaxProbs",
             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "

--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
@@ -38,7 +38,7 @@ class PReluOp : public framework::OperatorWithKernel {

 class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  PReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  PReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of prelu operator.");
    AddInput("Alpha", "The alpha weight of prelu operator.");

--- a/paddle/operators/proximal_adagrad_op.cc
+++ b/paddle/operators/proximal_adagrad_op.cc
@@ -59,8 +59,7 @@ class ProximalAdagradOp : public framework::OperatorWithKernel {

 class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ProximalAdagradOpMaker(framework::OpProto *proto,
-                         framework::OpAttrChecker *op_checker)
+  ProximalAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param",
             "(Tensor, default Tensor<float>) "

--- a/paddle/operators/proximal_gd_op.cc
+++ b/paddle/operators/proximal_gd_op.cc
@@ -47,8 +47,7 @@ class ProximalGDOp : public framework::OperatorWithKernel {

 class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ProximalGDOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
+  ProximalGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param",
             "(Tensor, default Tensor<float>) "

--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -45,8 +45,7 @@ class RankLossOp : public framework::OperatorWithKernel {

 class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RankLossOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  RankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Label",
             "(2-D Tensor with shape [batch_size x 1]) "

--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -497,8 +497,7 @@ class RecurrentGradOp : public RecurrentBase {

 class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RecurrentOpProtoMaker(framework::OpProto *proto,
-                        framework::OpAttrChecker *op_checker)
+  RecurrentOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(kInputs, "rnn inputs").AsDuplicable();
    AddInput(kInitialStates, "rnn initial states").AsDuplicable();

--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -97,7 +97,7 @@ class RecvOp : public framework::OperatorBase {

 class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("RX", "(Tensor) Input tensor to be saved");
    AddComment(R"DOC(

--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -83,7 +83,7 @@ class ReduceGradOp : public framework::OperatorWithKernel {

 class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor) The input tensor. Tensors with rank at most 6 are "
@@ -135,8 +135,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar.

 class ReduceSumOpMaker : public ReduceOpMaker {
 public:
-  ReduceSumOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ReduceSumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : ReduceOpMaker(proto, op_checker) {
    SetComment("ReduceSum", "sum");
    AddComment(comment_);
@@ -145,8 +144,7 @@ class ReduceSumOpMaker : public ReduceOpMaker {

 class ReduceMeanOpMaker : public ReduceOpMaker {
 public:
-  ReduceMeanOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
+  ReduceMeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : ReduceOpMaker(proto, op_checker) {
    SetComment("ReduceMean", "mean");
    AddComment(comment_);
@@ -155,8 +153,7 @@ class ReduceMeanOpMaker : public ReduceOpMaker {

 class ReduceMaxOpMaker : public ReduceOpMaker {
 public:
-  ReduceMaxOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ReduceMaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : ReduceOpMaker(proto, op_checker) {
    SetComment("ReduceMax", "max");
    AddComment(comment_);
@@ -165,8 +162,7 @@ class ReduceMaxOpMaker : public ReduceOpMaker {

 class ReduceMinOpMaker : public ReduceOpMaker {
 public:
-  ReduceMinOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ReduceMinOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : ReduceOpMaker(proto, op_checker) {
    SetComment("ReduceMin", "min");
    AddComment(comment_);

--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -34,20 +34,32 @@ class ReshapeOp : public framework::OperatorWithKernel {
    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
    PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
    auto x_dims = ctx->GetInputDim("X");
-    // TODO(qiao) change batch_size
-    for (size_t i = 1; i < shape.size(); ++i) {
-      PADDLE_ENFORCE(shape[i] > 0,
-                     "Each dimension of Attr(shape) "
-                     "must be positive except the first one.");
+
+    std::vector<size_t> neg_dims_idx;
+    // set some dimension to -1 if it is unknown
+    const int unknown_size = -1;
+    for (size_t i = 0; i < shape.size(); ++i) {
+      PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size,
+                     "Each dimension of Attr(shape) must be positive or %d.",
+                     unknown_size);
+      if (shape[i] == unknown_size) {
+        neg_dims_idx.push_back(i);
+        PADDLE_ENFORCE(neg_dims_idx.size() <= 1,
+                       "Only one dimension of Attr(shape) can be unknown.");
      }
-    if (shape[0] < 0) {
-      shape[0] = x_dims[0];
    }
-    // capacity check
+
    int64_t capacity =
        std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
    int64_t in_size = framework::product(x_dims);
-    PADDLE_ENFORCE_EQ(capacity, in_size,
+    if (neg_dims_idx.size() == 1) {
+      // dim infer
+      shape[neg_dims_idx[0]] = in_size / (-capacity);
+      // recalculate capacity
+      capacity = shape[neg_dims_idx[0]] * (-capacity);
+    }
+    // capacity check
+    PADDLE_ENFORCE(capacity == in_size,
                   "The size of Input(X) mismatches with Attr(shape).");
    // resize output
    std::vector<int64_t> shape_int64(shape.size(), 0);
@@ -65,8 +77,7 @@ class ReshapeOp : public framework::OperatorWithKernel {

 class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ReshapeOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of reshape operator.");
    AddOutput("Out", "The output tensor of reshape operator.");
@@ -88,6 +99,9 @@ the tensor X into a 2-D tensor:

    [[1, 2, 3, 4]]

+One dimension in the target shape can be set -1, representing that its
+size is unknown. In this case, the real dimension will be infered from 
+the original shape of Input(X) and other dimensions in the target shape.
 )DOC");
  }
 };

--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/operators/rmsprop_op.cc
@@ -63,8 +63,7 @@ class RmspropOp : public framework::OperatorWithKernel {

 class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RmspropOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  RmspropOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param",
             "(Tensor, default Tensor<float>) "

--- a/paddle/operators/rnn_memory_helper_op.cc
+++ b/paddle/operators/rnn_memory_helper_op.cc
@@ -57,15 +57,14 @@ class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase {

 class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RNNMemoryHelperOpInfoMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+  RNNMemoryHelperOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "");
    AddOutput("Out", "");
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
    AddComment("");
  }
 };
@@ -114,8 +113,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
 class RNNMemoryHelperGradOpInfoMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
-  RNNMemoryHelperGradOpInfoMaker(framework::OpProto *proto,
-                                 framework::OpAttrChecker *op_checker)
+  RNNMemoryHelperGradOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(framework::GradVarName("Out"), "");
    AddInput("X", "");
@@ -124,7 +122,7 @@ class RNNMemoryHelperGradOpInfoMaker
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
    AddComment("");
  }
 };

--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
@@ -99,8 +99,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {

 class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ROIPoolOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  ROIPoolOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor), "

--- a/paddle/operators/row_conv_op.cc
+++ b/paddle/operators/row_conv_op.cc
@@ -76,8 +76,7 @@ class RowConvGradOp : public framework::OperatorWithKernel {

 class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RowConvOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  RowConvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LoDTensor), the input(X) is a LodTensor, which supports "

--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
@@ -94,8 +94,7 @@ class SaveOp : public framework::OperatorBase {

 class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SaveOpProtoMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  SaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor ) Input tensor to be saved");
    AddComment(R"DOC(

--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -38,7 +38,7 @@ class ScaleOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) Input tensor of scale operator.");
    AddOutput("Out", "(Tensor) Output tensor of scale operator.");

--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -78,8 +78,7 @@ class ScatterGradOp : public framework::OperatorWithKernel {

 class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ScatterOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Ref", "The source input of scatter op");
    AddInput("Index",

--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -59,7 +59,7 @@ class SendOp : public framework::OperatorBase {

 class SendOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SendOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) Input tensor to be saved");
    AddOutput("Out", "(Tensor) Output fetched from server");

--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
@@ -43,8 +43,7 @@ class SequenceConcatOp : public framework::OperatorWithKernel {

 class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SequenceConcatOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  SequenceConcatOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LodTensorArray) Input is a vector of LoDTensor, "

--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
@@ -100,8 +100,7 @@ class SequenceConvGradOp : public framework::OperatorWithKernel {

 class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SequenceConvOpMaker(framework::OpProto* proto,
-                      framework::OpAttrChecker* op_checker)
+  SequenceConvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",

--- a/paddle/operators/seq_expand_op.cc
+++ b/paddle/operators/seq_expand_op.cc
@@ -12,14 +12,14 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/operators/seq_expand_op.h"
+#include "paddle/operators/sequence_expand_op.h"

 namespace paddle {
 namespace operators {

 using framework::Tensor;

-class SeqExpandOp : public framework::OperatorWithKernel {
+class SequenceExpandOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

@@ -35,25 +35,24 @@ class SeqExpandOp : public framework::OperatorWithKernel {
  }
 };

-class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SeqExpandOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor or LoDTensor) The input(X) of this operator can be a "
             "LoDTensor or a base Tensor.");
    AddInput("Y",
-             "(LoDTensor)The reference input(Y) of seq_expand op."
+             "(LoDTensor)The reference input(Y) of sequence_expand op."
             "It must be a LoDTensor with k-level(k>0)."
             "The input(X) will be expanded according to LOD of input(Y)."
             "The element numbers of last level in input(Y) "
             "must be equal to dims[0] of input(X).");
    AddOutput("Out",
-              "(LodTensor)The output of seq_expand op."
+              "(LodTensor)The output of sequence_expand op."
              "The lod of output will be as same as input(Y)'s lod.");
    AddComment(R"DOC(
-Seq Expand Operator.
+Sequence Expand Operator.

 This operator expands input(X) according to LOD of input(Y).
 Following are cases to better explain how this works:
@@ -124,7 +123,7 @@ then we get 2-level LoDTensor
  }
 };

-class SeqExpandOpGrad : public framework::OperatorWithKernel {
+class SequenceExpandOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

@@ -146,11 +145,11 @@ class SeqExpandOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(seq_expand, ops::SeqExpandOp, ops::SeqExpandOpMaker,
-            seq_expand_grad, ops::SeqExpandOpGrad);
+REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker,
+            sequence_expand_grad, ops::SequenceExpandOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    seq_expand,
-    ops::SeqExpandKernel<paddle::platform::CPUDeviceContext, float>);
+    sequence_expand,
+    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
-    seq_expand_grad,
-    ops::SeqExpandGradKernel<paddle::platform::CPUDeviceContext, float>);
+    sequence_expand_grad,
+    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, float>);
--- a/paddle/operators/seq_expand_op.cu
+++ b/paddle/operators/seq_expand_op.cu
@@ -13,12 +13,12 @@
   limitations under the License. */

 #define EIGEN_USE_GPU
-#include "paddle/operators/seq_expand_op.h"
+#include "paddle/operators/sequence_expand_op.h"

 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    seq_expand,
-    ops::SeqExpandKernel<paddle::platform::CUDADeviceContext, float>);
+    sequence_expand,
+    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, float>);
 REGISTER_OP_CUDA_KERNEL(
-    seq_expand_grad,
-    ops::SeqExpandGradKernel<paddle::platform::CUDADeviceContext, float>);
+    sequence_expand_grad,
+    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, float>);
--- a/paddle/operators/seq_expand_op.h
+++ b/paddle/operators/seq_expand_op.h
@@ -24,7 +24,7 @@ namespace operators {
 using LoDTensor = framework::LoDTensor;

 template <typename DeviceContext, typename T>
-class SeqExpandKernel : public framework::OpKernel<T> {
+class SequenceExpandKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<LoDTensor>("X");
@@ -71,7 +71,7 @@ class SeqExpandKernel : public framework::OpKernel<T> {
 *
 * */
 template <typename DeviceContext, typename T>
-class SeqExpandGradKernel : public framework::OpKernel<T> {
+class SequenceExpandGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));

--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -37,8 +37,7 @@ class SequencePoolOp : public framework::OperatorWithKernel {

 class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SequencePoolOpMaker(framework::OpProto* proto,
-                      framework::OpAttrChecker* op_checker)
+  SequencePoolOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
    AddOutput("Out",

--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
@@ -79,8 +79,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel {

 class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SequenceSliceOpMaker(framework::OpProto* proto,
-                       framework::OpAttrChecker* op_checker)
+  SequenceSliceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LoDTensor), "

--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
@@ -33,8 +33,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {

 class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SequenceSoftmaxOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker)
+  SequenceSoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension "
@@ -51,10 +50,14 @@ input Tensor can be either [N, 1] or [N], where N is the sum of the length
 of all sequences.

 The algorithm works as follows:
+
    for i-th sequence in a mini-batch:
-        $$Out(X[lod[i]:lod[i+1]], :) =
-            \frac{\exp(X[lod[i]:lod[i+1], :])}
-            {\sum(\exp(X[lod[i]:lod[i+1], :]))}$$
+
+$$
+Out(X[lod[i]:lod[i+1]], :) = \
+\frac{\exp(X[lod[i]:lod[i+1], :])} \
+{\sum(\exp(X[lod[i]:lod[i+1], :]))}
+$$

 For example, for a mini-batch of 3 sequences with variable-length,
 each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],

--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -43,7 +43,7 @@ class SGDOp : public framework::OperatorWithKernel {

 class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SGDOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("LearningRate", "(Tensor) Learning rate of SGD");

--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -54,8 +54,7 @@ class ShrinkRNNMemoryOp : public ArrayOp {

 class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto,
-                              framework::OpAttrChecker *op_checker)
+  ShrinkRNNMemoryOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(LoDTensor) The RNN step memory to be shrinked.");
    AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN.");

--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -86,8 +86,8 @@ class SigmoidCrossEntropyWithLogitsGradOp
 class SigmoidCrossEntropyWithLogitsOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
-  SigmoidCrossEntropyWithLogitsOpMaker(framework::OpProto* proto,
-                                       framework::OpAttrChecker* op_checker)
+  SigmoidCrossEntropyWithLogitsOpMaker(OpProto* proto,
+                                       OpAttrChecker* op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "

--- a/paddle/operators/sign_op.cc
+++ b/paddle/operators/sign_op.cc
@@ -34,7 +34,7 @@ class SignOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class SignOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SignOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SignOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) Input tensor of sign operator.");
    AddOutput("Out", "(Tensor) Output tensor of sign operator.");

--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -47,8 +47,7 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SmoothL1LossOpMaker(framework::OpProto* proto,
-                      framework::OpAttrChecker* op_checker)
+  SmoothL1LossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "

--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -36,8 +36,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {

 class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftmaxOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input tensor of softmax. "

--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -20,8 +20,7 @@ namespace operators {
 class SoftmaxWithCrossEntropyOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftmaxWithCrossEntropyOpMaker(framework::OpProto* proto,
-                                 framework::OpAttrChecker* op_checker)
+  SoftmaxWithCrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Logits",
             "(Tensor, default: Tensor<float>), The unscaled log probabilities "

--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -118,8 +118,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {

 class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SplitLoDTensorOpProtoMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+  SplitLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input LoDTensor");
    AddInput("Mask", "A bool column vector which mask the input");

--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
@@ -65,7 +65,7 @@ class SplitOp : public framework::OperatorWithKernel {

 class SplitOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SplitOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SplitOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) Input tensor of the split operator.");
    AddOutput("Out", "(Tensor) Output tensors of the split operator.")

--- a/paddle/operators/spp_op.cc
+++ b/paddle/operators/spp_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/spp_op.h"
+namespace paddle {
+namespace operators {
+
+class SppOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SppOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of spp operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of spp operator."
+              "N * M."
+              "M = C * H * W");
+    AddAttr<int>("pyramid_height", "(int), multi level pooling");
+    AddAttr<std::string>(
+        "pooling_type",
+        "(string), pooling type, can be \"max\" for max-pooling "
+        "and \"avg\" for average-pooling.")
+        .InEnum({"max", "avg"});
+    AddComment(R"DOC(
+        "With spatial pyramid pooling, the input image can
+        be of any sizes. This not only allows arbitrary aspect
+        ratios, but also allows arbitrary scales. We can resize
+        the input image to any scale (e.g., min(w, h)=180, 224,
+        ...) and apply the same deep network. When the
+        input image is at different scales, the network (with
+        the same filter sizes) will extract features at different
+        scales. The scales play important roles in traditional
+        methods.
+        Input shape: $(N, C_{in}, H_{in}, W_{in})$
+        Output shape: $(H_{out}, W_{out})$
+        Where
+          $$
+            H_{out} = N \\
+            W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in}
+          $$
+        paper https://arxiv.org/pdf/1406.4729v4.pdf
+        )DOC");
+  }
+};
+
+class SppOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SppOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SppOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    int pyramid_height = ctx->Attrs().Get<int>("pyramid_height");
+    PADDLE_ENFORCE(in_x_dims.size() == 4,
+                   "Spping intput must be of 4-dimensional.");
+    int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1];
+    std::vector<int64_t> output_shape({in_x_dims[0], outlen});
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+class SppOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    spp, ops::SppKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SppKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    spp_grad, ops::SppGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SppGradKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/operators/spp_op.cu.cc
+++ b/paddle/operators/spp_op.cu.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/spp_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    spp, ops::SppKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SppKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    spp_grad, ops::SppGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SppGradKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/operators/spp_op.h
+++ b/paddle/operators/spp_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class SppKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    int pyramid_height = context.template Attr<int>("pyramid_height");
+    std::string pooling_type =
+        context.template Attr<std::string>("pooling_type");
+    out->mutable_data<T>(context.GetPlace());
+    auto out_stride = framework::stride(out->dims());
+    int input_h = in_x->dims()[2];
+    int input_w = in_x->dims()[3];
+    size_t output_offset = 0;
+    for (int p = 0; p < pyramid_height; ++p) {
+      int bins = std::pow(2, p);
+      int kernel_size_h = std::ceil(input_h / static_cast<double>(bins));
+      int kernel_size_w = std::ceil(input_w / static_cast<double>(bins));
+      int padding_h = (kernel_size_h * bins - input_h + 1) / 2;
+      int padding_w = (kernel_size_w * bins - input_w + 1) / 2;
+      std::vector<int> kernel_size({kernel_size_h, kernel_size_w});
+      std::vector<int> strides({kernel_size_h, kernel_size_w});
+      std::vector<int> paddings({padding_h, padding_w});
+      // pooling output shape
+      framework::Tensor out_level;
+      std::vector<int64_t> output_shape_vec(
+          {in_x->dims()[0], in_x->dims()[1], bins, bins});
+      framework::DDim output_shape(framework::make_ddim(output_shape_vec));
+      out_level.mutable_data<T>(output_shape, context.GetPlace());
+      // pooling
+      if (pooling_type == "max") {
+        math::Pool2dFunctor<DeviceContext, math::MaxPool<T>, T> pool_forward;
+        math::MaxPool<T> max_process;
+        pool_forward(context.template device_context<DeviceContext>(), *in_x,
+                     kernel_size, strides, paddings, max_process, &out_level);
+      } else if (pooling_type == "avg") {
+        math::Pool2dFunctor<DeviceContext, math::AvgPool<T>, T> pool_forward;
+        math::AvgPool<T> avg_process;
+        pool_forward(context.template device_context<DeviceContext>(), *in_x,
+                     kernel_size, strides, paddings, avg_process, &out_level);
+      }
+      // flatten pooling output shape
+      int output_flatten_w = in_x->dims()[1] * bins * bins;
+      std::vector<int64_t> output_flatten_shape_vec(
+          {in_x->dims()[0], output_flatten_w});
+      framework::DDim output_flatten_shape(
+          framework::make_ddim(output_flatten_shape_vec));
+      out_level.Resize(output_flatten_shape);
+      // concat
+      auto out_level_stride = framework::stride(out_level.dims());
+      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
+                       out_level.data<T>(), out_level_stride, out_level.dims(),
+                       out_stride, out->data<T>() + output_offset);
+      output_offset += out_level.dims()[1] * out_level_stride[1];
+    }
+  }
+};
+template <typename DeviceContext, typename T>
+class SppGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    int pyramid_height = context.template Attr<int>("pyramid_height");
+    std::string pooling_type =
+        context.template Attr<std::string>("pooling_type");
+    auto& device_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    zero(device_ctx, in_x_grad, static_cast<T>(0));
+    auto out_stride = framework::stride(out->dims());
+    int input_h = in_x->dims()[2];
+    int input_w = in_x->dims()[3];
+    size_t out_offset = 0;
+    for (int p = 0; p < pyramid_height; ++p) {
+      int bins = std::pow(2, p);
+      int kernel_size_h = std::ceil(input_h / static_cast<double>(bins));
+      int kernel_size_w = std::ceil(input_w / static_cast<double>(bins));
+      int padding_h = (kernel_size_h * bins - input_h + 1) / 2;
+      int padding_w = (kernel_size_w * bins - input_w + 1) / 2;
+      std::vector<int> kernel_size({kernel_size_h, kernel_size_w});
+      std::vector<int> strides({kernel_size_h, kernel_size_w});
+      std::vector<int> paddings({padding_h, padding_w});
+      // split out and outgrad  ...  to flatten
+      framework::Tensor out_level;
+      framework::Tensor outgrad_level;
+      int out_flatten_w = in_x->dims()[1] * bins * bins;
+      std::vector<int64_t> out_flatten_shape_vec(
+          {in_x->dims()[0], out_flatten_w});
+      framework::DDim out_flatten_shape(
+          framework::make_ddim(out_flatten_shape_vec));
+      out_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
+      outgrad_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
+      auto flatten_stride = framework::stride(out_level.dims());
+      // memcpy
+      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
+                       out->data<T>() + out_offset, out_stride,
+                       out_level.dims(), flatten_stride, out_level.data<T>());
+
+      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
+                       out_grad->data<T>() + out_offset, out_stride,
+                       outgrad_level.dims(), flatten_stride,
+                       outgrad_level.data<T>());
+      out_offset += out_level.dims()[1] * out_stride[1];
+      // flatten backward to nchw
+
+      std::vector<int64_t> out_shape_vec({in_x->dims()[0], in_x->dims()[1]});
+      out_shape_vec.push_back(
+          (input_h - kernel_size_h + 2 * padding_h) / kernel_size_h + 1);
+      out_shape_vec.push_back(
+          (input_w - kernel_size_w + 2 * padding_w) / kernel_size_w + 1);
+      framework::DDim out_shape(framework::make_ddim(out_shape_vec));
+      out_level.ShareDataWith(out_level);
+      out_level.Resize(out_shape);
+      outgrad_level.ShareDataWith(outgrad_level);
+      outgrad_level.Resize(out_shape);
+      // pooling backward
+      if (pooling_type == "max") {
+        math::MaxPool2dGradFunctor<DeviceContext, T> pool2d_backward;
+        pool2d_backward(context.template device_context<DeviceContext>(), *in_x,
+                        *&out_level, *&outgrad_level, kernel_size, strides,
+                        paddings, in_x_grad);
+      } else if (pooling_type == "avg") {
+        math::Pool2dGradFunctor<DeviceContext, math::AvgPoolGrad<T>, T>
+            pool_backward;
+        math::AvgPoolGrad<T> avg_process;
+        pool_backward(context.template device_context<DeviceContext>(), *in_x,
+                      *&out_level, *&outgrad_level, kernel_size, strides,
+                      paddings, avg_process, in_x_grad);
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
@@ -56,8 +56,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {

 class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SquaredL2DistanceOpMaker(framework::OpProto* proto,
-                           framework::OpAttrChecker* op_checker)
+  SquaredL2DistanceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) Input of SquaredL2DistanceOp.");
    AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp.");

--- a/paddle/operators/squared_l2_norm_op.cc
+++ b/paddle/operators/squared_l2_norm_op.cc
@@ -48,8 +48,7 @@ class SquaredL2NormGradOp : public framework::OperatorWithKernel {

 class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SquaredL2NormOpMaker(framework::OpProto* proto,
-                       framework::OpAttrChecker* op_checker)
+  SquaredL2NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input of squared_l2_norm op.");
    AddOutput("Out", "(Scalar) The output of squared_l2_norm op.");

--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -29,7 +29,7 @@ class SumOp : public framework::OperatorWithKernel {
                   "Output(Out) of SumOp should not be null.");
    if (ctx->IsRuntime() &&
        ctx->GetOutputsVarType("Out")[0] ==
-            framework::VarDesc::LOD_TENSOR_ARRAY) {
+            framework::proto::VarDesc::LOD_TENSOR_ARRAY) {
      return;  // skip runtime infershape when is tensor array;
    }

@@ -72,8 +72,8 @@ class SumOp : public framework::OperatorWithKernel {
      PADDLE_ENFORCE_NE(dtype, -1,
                        "Sum operator should have at least one tensor");

-      return framework::OpKernelType(static_cast<framework::DataType>(dtype),
-                                     ctx.device_context());
+      return framework::OpKernelType(
+          static_cast<framework::proto::DataType>(dtype), ctx.device_context());
    } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
      return framework::OpKernelType(
          framework::ToDataType(
@@ -98,7 +98,7 @@ class SumOp : public framework::OperatorWithKernel {

 class SumOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SumOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  SumOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
        .AsDuplicable();
@@ -118,7 +118,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
  void operator()(const framework::OpDescBind& op_desc,
                  framework::BlockDescBind* block) const override {
    auto& inputs = op_desc.Input("X");
-    auto var_type = framework::VarDesc::SELECTED_ROWS;
+    auto var_type = framework::proto::VarDesc::SELECTED_ROWS;

    for (auto& name : op_desc.Input("X")) {
      VLOG(10) << name << " "
@@ -128,12 +128,12 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
    bool any_input_is_lod_tensor = std::any_of(
        inputs.begin(), inputs.end(), [block](const std::string& name) {
          return block->FindRecursiveOrCreateVar(name)->GetType() ==
-                 framework::VarDesc::LOD_TENSOR;
+                 framework::proto::VarDesc::LOD_TENSOR;
        });

    auto is_tensor_array = [block](const std::string& name) {
      return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() ==
-             framework::VarDesc::LOD_TENSOR_ARRAY;
+             framework::proto::VarDesc::LOD_TENSOR_ARRAY;
    };

    bool any_input_is_tensor_array =
@@ -152,9 +152,9 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
        PADDLE_ENFORCE(all_inputs_are_tensor_array,
                       "Not all inputs are tensor array:\n%s", os.str());
      }
-      var_type = framework::VarDesc::LOD_TENSOR_ARRAY;
+      var_type = framework::proto::VarDesc::LOD_TENSOR_ARRAY;
    } else if (any_input_is_lod_tensor) {
-      var_type = framework::VarDesc::LOD_TENSOR;
+      var_type = framework::proto::VarDesc::LOD_TENSOR;
    }

    auto out_var_name = op_desc.Output("Out").front();

--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -51,8 +51,7 @@ class WriteToArrayOp : public ArrayOp {

 class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  WriteToArrayOpProtoMaker(framework::OpProto *proto,
-                           framework::OpAttrChecker *op_checker)
+  WriteToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(LoDTensor) the tensor will be written to tensor array");
    AddInput(
@@ -104,7 +103,7 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
    VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
    auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name),
                            "Cannot found %s", out_name);
-    out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
+    out.SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY);
    auto *x = block->FindVarRecursive(x_name);
    if (x != nullptr) {
      out.SetDataType(x->GetDataType());
@@ -140,8 +139,7 @@ class ReadFromArrayOp : public ArrayOp {

 class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ReadFromArrayProtoMaker(framework::OpProto *proto,
-                          framework::OpAttrChecker *op_checker)
+  ReadFromArrayProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(TensorArray) the array will be read from.");
    AddInput("I",

--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
@@ -46,7 +46,7 @@ class TopkOp : public framework::OperatorWithKernel {

 class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  TopkOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  TopkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input of Topk op");
    AddOutput("Out", "(Tensor) The output tensor of Topk op");

--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
@@ -55,8 +55,7 @@ class TransposeOp : public framework::OperatorWithKernel {

 class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  TransposeOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  TransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",

--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -66,15 +66,14 @@ class UniformRandomOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
-        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
+        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
        ctx.GetPlace());
  }
 };

 class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  UniformRandomOpMaker(framework::OpProto* proto,
-                       framework::OpAttrChecker* op_checker)
+  UniformRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput("Out", "(Tensor) The output tensor of uniform random op");
    AddComment(R"DOC(
@@ -100,7 +99,7 @@ uniform distribution.
                 "0 means use a seed generated by the system.")
        .SetDefault(0);
    AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
  }
 };
 }  // namespace operators

--- a/paddle/operators/unpool_op.cc
+++ b/paddle/operators/unpool_op.cc
@@ -18,8 +18,7 @@ namespace operators {

 class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Unpool2dOpMaker(framework::OpProto* proto,
-                  framework::OpAttrChecker* op_checker)
+  Unpool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",

--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -64,7 +64,7 @@ class WhileOp : public framework::OperatorBase {

 class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  WhileOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(kParameters,
             "A set of variables, which are required by operators inside the "
@@ -321,10 +321,10 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
        continue;
      }
      auto dims = ctx->GetInputsElementDim(kParameters, i);
-      if (var_types[i] == framework::VarDesc::LOD_TENSOR) {
+      if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) {
        names_to_set.push_back(pg_names[i]);
        dims_to_set.push_back(dims);
-      } else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) {
+      } else if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR_ARRAY) {
        // not sure how to set the dim of LOD_TENSOR_ARRAY
        names_to_set.push_back(pg_names[i]);
        dims_to_set.push_back(dims);

--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -19,7 +19,7 @@ CPUDeviceContext::CPUDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
 }

-CPUDeviceContext::CPUDeviceContext(CPUPlace place) {
+CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
 }

@@ -27,7 +27,7 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
  return eigen_device_.get();
 }

-Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
+Place CPUDeviceContext::GetPlace() const { return place_; }

 #ifdef PADDLE_WITH_CUDA

@@ -125,21 +125,21 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }

 cudaStream_t CUDADeviceContext::stream() const { return stream_; }

-CudnnDeviceContext::CudnnDeviceContext(CudnnPlace place)
+CUDNNDeviceContext::CUDNNDeviceContext(CUDNNPlace place)
    : CUDADeviceContext(place), place_(place) {
  PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
  PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream()));
 }

-CudnnDeviceContext::~CudnnDeviceContext() {
+CUDNNDeviceContext::~CUDNNDeviceContext() {
  SetDeviceId(place_.device);
  Wait();
  PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
 }

-Place CudnnDeviceContext::GetPlace() const { return CudnnPlace(); }
+Place CUDNNDeviceContext::GetPlace() const { return CUDNNPlace(); }

-cudnnHandle_t CudnnDeviceContext::cudnn_handle() const { return cudnn_handle_; }
+cudnnHandle_t CUDNNDeviceContext::cudnn_handle() const { return cudnn_handle_; }

 #endif


--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -45,6 +45,7 @@ class CPUDeviceContext : public DeviceContext {
  Place GetPlace() const override;

 private:
+  CPUPlace place_;
  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 };

@@ -86,10 +87,10 @@ class CUDADeviceContext : public DeviceContext {
  cublasHandle_t cublas_handle_;
 };

-class CudnnDeviceContext : public CUDADeviceContext {
+class CUDNNDeviceContext : public CUDADeviceContext {
 public:
-  explicit CudnnDeviceContext(CudnnPlace place);
-  virtual ~CudnnDeviceContext();
+  explicit CUDNNDeviceContext(CUDNNPlace place);
+  virtual ~CUDNNDeviceContext();

  /*! \brief  Return place in the device context. */
  Place GetPlace() const final;
@@ -99,7 +100,7 @@ class CudnnDeviceContext : public CUDADeviceContext {

 private:
  cudnnHandle_t cudnn_handle_;
-  CudnnPlace place_;
+  CUDNNPlace place_;
 };

 #endif

--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -47,14 +47,14 @@ TEST(Device, CUDADeviceContext) {
  }
 }

-TEST(Device, CudnnDeviceContext) {
-  using paddle::platform::CudnnDeviceContext;
-  using paddle::platform::CudnnPlace;
+TEST(Device, CUDNNDeviceContext) {
+  using paddle::platform::CUDNNDeviceContext;
+  using paddle::platform::CUDNNPlace;
  if (paddle::platform::dynload::HasCUDNN()) {
    int count = paddle::platform::GetCUDADeviceCount();
    for (int i = 0; i < count; ++i) {
-      CudnnDeviceContext* device_context =
-          new CudnnDeviceContext(CudnnPlace(i));
+      CUDNNDeviceContext* device_context =
+          new CUDNNDeviceContext(CUDNNPlace(i));
      cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
      ASSERT_NE(nullptr, cudnn_handle);
      ASSERT_NE(nullptr, device_context->stream());

--- a/paddle/platform/dynload/nccl.cc
+++ b/paddle/platform/dynload/nccl.cc
@@ -25,6 +25,11 @@ void *nccl_dso_handle;

 NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);

+void LoadNCCLDSO() {
+  platform::call_once(nccl_dso_flag,
+                      [] { GetNCCLDsoHandle(&nccl_dso_handle); });
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
@@ -28,14 +28,14 @@ extern std::once_flag nccl_dso_flag;
 extern void* nccl_dso_handle;

 #ifdef PADDLE_USE_DSO
+extern void LoadNCCLDSO();
+
 #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                   \
  struct DynLoad__##__name {                                     \
    template <typename... Args>                                  \
    auto operator()(Args... args) -> decltype(__name(args...)) { \
      using nccl_func = decltype(__name(args...)) (*)(Args...);  \
-      platform::call_once(nccl_dso_flag,                               \
-                          paddle::platform::dynload::GetNCCLDsoHandle, \
-                          &nccl_dso_handle);                           \
+      paddle::platform::dynload::LoadNCCLDSO();                  \
      void* p_##__name = dlsym(nccl_dso_handle, #__name);        \
      return reinterpret_cast<nccl_func>(p_##__name)(args...);   \
    }                                                            \

--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -73,19 +73,20 @@ size_t GpuMaxChunkSize() {
  size_t available = 0;

  GpuMemoryUsage(available, total);
-
-  // Reserving the rest memory for page tables, etc.
-  size_t reserving = 0.05 * total;
-
+  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
+           << total / 1024 / 1024 << "M";
+  size_t reserving = static_cast<size_t>(0.05 * total);
  // If available less than minimum chunk size, no usable memory exists.
  available =
-      std::max(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
-               reserving) -
-      reserving;
+      std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
+               total - reserving);
+
+  // Reserving the rest memory for page tables, etc.

-  size_t allocating = FLAGS_fraction_of_gpu_memory_to_use * total;
+  size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
+                                          (total - reserving));

-  PADDLE_ENFORCE_LT(allocating, available);
+  PADDLE_ENFORCE_LE(allocating, available);

  return allocating;
 }

--- a/paddle/platform/nccl_test.cu
+++ b/paddle/platform/nccl_test.cu
@@ -31,7 +31,7 @@ namespace platform {
 TEST(NCCL, init) {
  std::vector<ncclComm_t> comms;
  comms.resize(dev_count);
-  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
+  dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
  for (int i = 0; i < dev_count; ++i) {
    dynload::ncclCommDestroy(comms[i]);
  }
@@ -62,7 +62,7 @@ TEST(NCCL, all_reduce) {
  std::vector<ncclComm_t> comms;
  comms.resize(dev_count);
  VLOG(1) << "Initializing ncclComm";
-  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
+  dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
  VLOG(1) << "ncclComm initialized";
  VLOG(1) << "Creating thread data";
  std::vector<std::unique_ptr<PerThreadData<double>>> data;

--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
@@ -23,6 +23,7 @@ class PlacePrinter : public boost::static_visitor<> {
 public:
  explicit PlacePrinter(std::ostream &os) : os_(os) {}
  void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
+  void operator()(const MKLDNNPlace &) { os_ << "MKLDNNPlace"; }
  void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; }

 private:
@@ -38,12 +39,17 @@ const Place &get_place() { return the_default_place; }

 const GPUPlace default_gpu() { return GPUPlace(0); }
 const CPUPlace default_cpu() { return CPUPlace(); }
+const MKLDNNPlace default_mkldnn() { return MKLDNNPlace(); }

 bool is_gpu_place(const Place &p) {
  return boost::apply_visitor(IsGPUPlace(), p);
 }
 bool is_cpu_place(const Place &p) {
-  return !boost::apply_visitor(IsGPUPlace(), p);
+  return !is_gpu_place(p) && !is_mkldnn_place(p);
+}
+
+bool is_mkldnn_place(const Place &p) {
+  return boost::apply_visitor(IsMKLDNNPlace(), p);
 }

 bool places_are_same_class(const Place &p1, const Place &p2) {

--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -31,6 +31,14 @@ struct CPUPlace {
  inline bool operator!=(const CPUPlace &) const { return false; }
 };

+struct MKLDNNPlace {
+  MKLDNNPlace() {}
+
+  // needed for variant equality comparison
+  inline bool operator==(const MKLDNNPlace &) const { return true; }
+  inline bool operator!=(const MKLDNNPlace &) const { return false; }
+};
+
 struct GPUPlace {
  GPUPlace() : GPUPlace(0) {}
  explicit GPUPlace(int d) : device(d) {}
@@ -43,21 +51,28 @@ struct GPUPlace {
  int device;
 };

-struct CudnnPlace : public GPUPlace {
-  CudnnPlace() : GPUPlace() {}
-  explicit CudnnPlace(int d) : GPUPlace(d) {}
+struct CUDNNPlace : public GPUPlace {
+  CUDNNPlace() : GPUPlace() {}
+  explicit CUDNNPlace(int d) : GPUPlace(d) {}
 };

 struct IsGPUPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const MKLDNNPlace &) const { return false; }
  bool operator()(const GPUPlace &gpu) const { return true; }
 };

+struct IsMKLDNNPlace : public boost::static_visitor<bool> {
+  bool operator()(const MKLDNNPlace &) const { return true; }
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const GPUPlace &) const { return false; }
+};
+
 // Define the max number of Place in bit length. i.e., the max number of places
 // should be less equal than 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
 #define NUM_PLACE_TYPE_LIMIT_IN_BIT 4

-typedef boost::variant<CudnnPlace, GPUPlace, CPUPlace> Place;
+typedef boost::variant<CUDNNPlace, GPUPlace, CPUPlace, MKLDNNPlace> Place;

 // static check number of place types is less equal than
 // 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT)
@@ -70,9 +85,11 @@ const Place &get_place();

 const GPUPlace default_gpu();
 const CPUPlace default_cpu();
+const MKLDNNPlace default_mkldnn();

 bool is_gpu_place(const Place &);
 bool is_cpu_place(const Place &);
+bool is_mkldnn_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);

 std::ostream &operator<<(std::ostream &, const Place &);

--- a/paddle/platform/place_test.cc
+++ b/paddle/platform/place_test.cc
@@ -5,25 +5,37 @@
 TEST(Place, Equality) {
  paddle::platform::CPUPlace cpu;
  paddle::platform::GPUPlace g0(0), g1(1), gg0(0);
+  paddle::platform::CUDNNPlace d0(0), d1(1), dd0(0);

  EXPECT_EQ(cpu, cpu);
  EXPECT_EQ(g0, g0);
  EXPECT_EQ(g1, g1);
  EXPECT_EQ(g0, gg0);
+  EXPECT_EQ(d0, dd0);

  EXPECT_NE(g0, g1);
+  EXPECT_NE(d0, d1);

  EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
  EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu));
+
+  EXPECT_TRUE(paddle::platform::is_gpu_place(d0));
+  EXPECT_FALSE(paddle::platform::places_are_same_class(g0, d0));
 }

 TEST(Place, Default) {
  EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place()));
  EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu()));
  EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu()));
+  EXPECT_TRUE(
+      paddle::platform::is_mkldnn_place(paddle::platform::default_mkldnn()));

  paddle::platform::set_place(paddle::platform::CPUPlace());
  EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
+
+  paddle::platform::set_place(paddle::platform::MKLDNNPlace());
+  EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
+  EXPECT_TRUE(paddle::platform::is_mkldnn_place(paddle::platform::get_place()));
 }

 TEST(Place, Print) {

--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
@@ -14,6 +14,19 @@

 #pragma once

+#ifdef __CUDACC__
+#ifdef __CUDACC_VER_MAJOR__
+// CUDA 9 define `__CUDACC_VER__` as a warning message, manually define
+// __CUDACC_VER__ instead.
+#undef __CUDACC_VER__
+
+#define __CUDACC_VER__                                         \
+  (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 + \
+   __CUDACC_VER_BUILD__)
+#endif
+
+#endif
+
 #include <boost/config.hpp>

 #ifdef PADDLE_WITH_CUDA

--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
 if(WITH_PYTHON)
  cc_library(paddle_pybind SHARED
-    SRCS pybind.cc exception.cc protobuf.cc
-    DEPS pybind python backward proto_desc paddle_memory executor prune
+    SRCS pybind.cc exception.cc protobuf.cc const_value.cc
+    DEPS pybind python backward proto_desc paddle_memory executor prune init
    ${GLOB_OP_LIB})
 endif(WITH_PYTHON)


--- a/paddle/pybind/const_value.cc
+++ b/paddle/pybind/const_value.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "const_value.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace pybind {
+
+void BindConstValue(pybind11::module& m) {
+  m.def("kEmptyVarName", [] { return framework::kEmptyVarName; });
+  m.def("kTempVarName", [] { return framework::kTempVarName; });
+  m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
+  m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
+}
+
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/pybind/const_value.h
+++ b/paddle/pybind/const_value.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <Python.h>
+#include "paddle/platform/enforce.h"
+#include "pybind11/pybind11.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+extern void BindConstValue(pybind11::module& m);
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/pybind/print_operators_doc.cc
+++ b/paddle/pybind/print_operators_doc.cc
@@ -31,31 +31,32 @@ std::string Escape(const std::string& s) {
  return r;
 }

-std::string AttrType(paddle::framework::AttrType at) {
+std::string AttrType(paddle::framework::proto::AttrType at) {
  switch (at) {
-    case paddle::framework::INT:
+    case paddle::framework::proto::INT:
      return "int";
-    case paddle::framework::FLOAT:
+    case paddle::framework::proto::FLOAT:
      return "float";
-    case paddle::framework::STRING:
+    case paddle::framework::proto::STRING:
      return "string";
-    case paddle::framework::BOOLEAN:
+    case paddle::framework::proto::BOOLEAN:
      return "bool";
-    case paddle::framework::INTS:
+    case paddle::framework::proto::INTS:
      return "int array";
-    case paddle::framework::FLOATS:
+    case paddle::framework::proto::FLOATS:
      return "float array";
-    case paddle::framework::STRINGS:
+    case paddle::framework::proto::STRINGS:
      return "string array";
-    case paddle::framework::BOOLEANS:
+    case paddle::framework::proto::BOOLEANS:
      return "bool array";
-    case paddle::framework::BLOCK:
+    case paddle::framework::proto::BLOCK:
      return "block id";
  }
  return "UNKNOWN";  // not possible
 }

-void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) {
+void PrintVar(const paddle::framework::proto::OpProto::Var& v,
+              std::stringstream& ss) {
  ss << " { "
     << "\n"
     << "   \"name\" : \"" << Escape(v.name()) << "\",\n"
@@ -65,7 +66,7 @@ void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) {
     << " },";
 }

-void PrintAttr(const paddle::framework::OpProto::Attr& a,
+void PrintAttr(const paddle::framework::proto::OpProto::Attr& a,
               std::stringstream& ss) {
  ss << " { "
     << "\n"
@@ -81,7 +82,7 @@ void PrintOpProto(const std::string& type,
                  std::stringstream& ss) {
  std::cerr << "Processing " << type << "\n";

-  const paddle::framework::OpProto* p = opinfo.proto_;
+  const paddle::framework::proto::OpProto* p = opinfo.proto_;
  if (p == nullptr) {
    return;  // It is possible that an operator doesn't have OpProto.
  }

--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -144,7 +144,7 @@ void BindProgramDesc(py::module &m) {
      .def("serialize_to_string", SerializeMessage<ProgramDescBind>)
      .def("parse_from_string",
           [](ProgramDescBind &program_desc, const std::string &data) {
-             ProgramDesc *desc = program_desc.Proto();
+             proto::ProgramDesc *desc = program_desc.Proto();
             PADDLE_ENFORCE(desc->ParseFromString(data),
                            "Fail to parse ProgramDesc from string. This could "
                            "be a bug of Paddle.");
@@ -184,14 +184,14 @@ void BindBlockDesc(py::module &m) {
 }

 void BindVarDsec(py::module &m) {
-  py::enum_<DataType>(m, "DataType", "")
-      .value("BOOL", DataType::BOOL)
-      .value("INT16", DataType::INT16)
-      .value("INT32", DataType::INT32)
-      .value("INT64", DataType::INT64)
-      .value("FP16", DataType::FP16)
-      .value("FP32", DataType::FP32)
-      .value("FP64", DataType::FP64);
+  py::enum_<proto::DataType>(m, "DataType", "")
+      .value("BOOL", proto::DataType::BOOL)
+      .value("INT16", proto::DataType::INT16)
+      .value("INT32", proto::DataType::INT32)
+      .value("INT64", proto::DataType::INT64)
+      .value("FP16", proto::DataType::FP16)
+      .value("FP32", proto::DataType::FP32)
+      .value("FP64", proto::DataType::FP64);

  py::class_<VarDescBind> var_desc(m, "VarDesc", "");
  var_desc
@@ -213,27 +213,27 @@ void BindVarDsec(py::module &m) {
      .def("persistable", &VarDescBind::Persistable)
      .def("set_persistable", &VarDescBind::SetPersistable);

-  py::enum_<VarDesc::VarType>(var_desc, "VarType", "")
-      .value("LOD_TENSOR", VarDesc::LOD_TENSOR)
-      .value("SELECTED_ROWS", VarDesc::SELECTED_ROWS)
-      .value("FEED_MINIBATCH", VarDesc::FEED_MINIBATCH)
-      .value("FETCH_LIST", VarDesc::FETCH_LIST)
-      .value("STEP_SCOPES", VarDesc::STEP_SCOPES)
-      .value("LOD_RANK_TABLE", VarDesc::LOD_RANK_TABLE)
-      .value("LOD_TENSOR_ARRAY", VarDesc::LOD_TENSOR_ARRAY);
+  py::enum_<proto::VarDesc::VarType>(var_desc, "VarType", "")
+      .value("LOD_TENSOR", proto::VarDesc::LOD_TENSOR)
+      .value("SELECTED_ROWS", proto::VarDesc::SELECTED_ROWS)
+      .value("FEED_MINIBATCH", proto::VarDesc::FEED_MINIBATCH)
+      .value("FETCH_LIST", proto::VarDesc::FETCH_LIST)
+      .value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES)
+      .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
+      .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY);
 }

 void BindOpDesc(py::module &m) {
-  py::enum_<AttrType>(m, "AttrType", "")
-      .value("INT", AttrType::INT)
-      .value("INTS", AttrType::INTS)
-      .value("FLOAT", AttrType::FLOAT)
-      .value("FLOATS", AttrType::FLOATS)
-      .value("STRING", AttrType::STRING)
-      .value("STRINGS", AttrType::STRINGS)
-      .value("BOOL", AttrType::BOOLEAN)
-      .value("BOOLS", AttrType::BOOLEANS)
-      .value("BLOCK", AttrType::BLOCK);
+  py::enum_<proto::AttrType>(m, "AttrType", "")
+      .value("INT", proto::AttrType::INT)
+      .value("INTS", proto::AttrType::INTS)
+      .value("FLOAT", proto::AttrType::FLOAT)
+      .value("FLOATS", proto::AttrType::FLOATS)
+      .value("STRING", proto::AttrType::STRING)
+      .value("STRINGS", proto::AttrType::STRINGS)
+      .value("BOOL", proto::AttrType::BOOLEAN)
+      .value("BOOLS", proto::AttrType::BOOLEANS)
+      .value("BLOCK", proto::AttrType::BLOCK);

  py::class_<OpDescBind> op_desc(m, "OpDesc", "");
  op_desc.def("type", &OpDescBind::Type)

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -16,11 +16,11 @@ limitations under the License. */

 #include <mutex>  // for call_once
 #include <unordered_map>
-#include "gflags/gflags.h"
 #include "paddle/framework/backward.h"
 #include "paddle/framework/executor.h"
 #include "paddle/framework/feed_fetch_method.h"
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/init.h"
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/lod_tensor_array.h"
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/operators/net_op.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "paddle/pybind/const_value.h"
 #include "paddle/pybind/exception.h"
 #include "paddle/pybind/pybind.h"
 #include "paddle/pybind/tensor_py.h"
@@ -51,24 +52,6 @@ static size_t UniqueIntegerGenerator(const std::string &prefix) {
  return generators[prefix].fetch_add(1);
 }

-std::once_flag gflags_init_flag;
-
-// TODO(qijun) move init gflags to init.cc
-void InitGflags(std::vector<std::string> &argv) {
-  std::call_once(gflags_init_flag, [&]() {
-    int argc = argv.size();
-    char **arr = new char *[argv.size()];
-    std::string line;
-    for (size_t i = 0; i < argv.size(); i++) {
-      arr[i] = &argv[i][0];
-      line += argv[i];
-      line += ' ';
-    }
-    google::ParseCommandLineFlags(&argc, &arr, true);
-    VLOG(1) << "Init commandline: " << line;
-  });
-}
-
 bool IsCompileGPU() {
 #ifndef PADDLE_WITH_CUDA
  return false;
@@ -305,12 +288,12 @@ All parameter, weight, gradient are variables in Paddle.
    for (const auto &t : targets) {
      prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget();
    }
-    ProgramDesc pruned_desc;
+    proto::ProgramDesc pruned_desc;
    Prune(*prog_with_targets.Proto(), &pruned_desc);
    return new ProgramDescBind(pruned_desc);
  });
  m.def("inference_optimize", [](ProgramDescBind &origin) {
-    ProgramDesc pruned_desc;
+    proto::ProgramDesc pruned_desc;
    InferenceOptimize(*(origin.Proto()), &pruned_desc);
    return new ProgramDescBind(pruned_desc);
  });
@@ -362,7 +345,7 @@ All parameter, weight, gradient are variables in Paddle.
  py::class_<OperatorBase>(m, "Operator")
      .def_static("create",
                  [](py::bytes protobin) {
-                    OpDesc desc;
+                    proto::OpDesc desc;
                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
                                   "Cannot parse user input to OpDesc");
                    PADDLE_ENFORCE(desc.IsInitialized(),
@@ -415,7 +398,7 @@ All parameter, weight, gradient are variables in Paddle.
  py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
      .def_static("create",
                  [](py::bytes protobin) -> operators::CondOp * {
-                    OpDesc desc;
+                    proto::OpDesc desc;
                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
                                   "Cannot parse user input to OpDesc");
                    PADDLE_ENFORCE(desc.IsInitialized(),
@@ -438,7 +421,8 @@ All parameter, weight, gradient are variables in Paddle.
      .def("run", &Executor::Run);

  m.def("unique_integer", UniqueIntegerGenerator);
-  m.def("init_gflags", InitGflags);
+  m.def("init_gflags", framework::InitGflags);
+  m.def("init_devices", &framework::InitDevices);

  m.def("is_compile_gpu", IsCompileGPU);
  m.def("set_feed_variable", framework::SetFeedVariable);
@@ -448,6 +432,7 @@ All parameter, weight, gradient are variables in Paddle.
  BindBlockDesc(m);
  BindVarDsec(m);
  BindOpDesc(m);
+  BindConstValue(m);

  py::class_<framework::LoDRankTable>(m, "LodRankTable")
      .def("items", [](framework::LoDRankTable &table) {

--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -14,9 +14,8 @@ make -j `nproc` print_operators_doc
 paddle/pybind/print_operators_doc > doc/en/html/operators.json

 # check websites for broken links
-# It will be failed now!
-#linkchecker doc/en/html/index.html
-#linkchecker doc/cn/html/index.html
+linkchecker doc/en/html/index.html
+linkchecker doc/cn/html/index.html

 # Parse Github URL
 REPO=`git config remote.origin.url`

--- a/python/.gitignore
+++ b/python/.gitignore
@@ -2,6 +2,7 @@
 build
 dist
 paddle.egg-info
+paddlepaddle_gpu.egg-info
 .idea
 paddle/proto/*.py
 paddle/proto/*.pyc
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -25,10 +25,10 @@ from paddle.trainer.config_parser import *
 __all__ = [
    'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
    "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
-    'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
-    'simple_attention', 'dot_product_attention', 'multi_head_attention',
-    'simple_gru2', 'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm',
-    'inputs', 'outputs'
+    'img_conv_group', 'img_separable_conv', 'vgg_16_network', 'gru_unit',
+    'gru_group', 'simple_gru', 'simple_attention', 'dot_product_attention',
+    'multi_head_attention', 'simple_gru2', 'bidirectional_gru',
+    'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs'
 ]

 ######################################################
@@ -435,6 +435,85 @@ def img_conv_group(input,
        input=tmp, stride=pool_stride, pool_size=pool_size, pool_type=pool_type)


+@wrap_name_default("separable_conv")
+def img_separable_conv(input,
+                       num_channels,
+                       num_out_channels,
+                       filter_size,
+                       stride=1,
+                       padding=0,
+                       depth_multiplier=1,
+                       act=None,
+                       bias_attr=None,
+                       param_attr=None,
+                       shared_bias=True,
+                       layer_type='exconv',
+                       name=None):
+    """
+    Separable Convolution.
+
+    The separable convolution module is consisted of a depthwise convolution
+    that acts separately on input channels, followed by a pointwise convolution
+    with 1*1 kernels that mixes channels. It is used for Xception:
+    https://arxiv.org/pdf/1610.02357.pdf
+
+    :param input: input layer.
+    :type input: LayerOutput
+    :param num_channels: the number of input channels.
+    :type num_channels: int
+    :param num_out_channels: the number of output channels.
+    :type num_out_channels: int
+    :param filter_size: the filter size for the depthwise convolution.
+    :type filter_size: int|tuple
+    :param stride: the stride size for the depthwise convolution.
+    :type stride: int|tuple
+    :param padding: the padding size for the depthwise convolution.
+    :type padding: int|tuple
+    :param depth_multiplier: the number of filter for one channel in the
+                             depthwize convolution.
+    :type depth_multiplier: int
+    :param act: the activation function for the output.
+    :type act: BaseActivation
+    :param bias_attr: see img_conv_layer for details.
+    :type bias_attr: ParameterAttribute
+    :param param_attr: see img_conv_layer for details.
+    :type param_attr: ParameterAttribute
+    :param shared_bias: see img_conv_layer for details.
+    :type shared_bias: bool
+    :param layer_type: see img_conv_layer for details.
+    :type layer_type: bool
+    :return: layer's output
+    :rtype: LayerOutput
+    """
+    __depthwise_conv__ = img_conv_layer(
+        name="%s_depthwise_conv" % name,
+        input=input,
+        num_channels=num_channels,
+        num_filters=num_channels * depth_multiplier,
+        groups=num_channels,
+        filter_size=filter_size,
+        stride=stride,
+        padding=padding,
+        act=LinearActivation(),
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        shared_biases=shared_bias,
+        layer_type=layer_type)
+    __pointwise_conv__ = img_conv_layer(
+        name="%s_pointwise_conv" % name,
+        input=__depthwise_conv__,
+        num_channels=num_channels * depth_multiplier,
+        num_filters=num_out_channels,
+        filter_size=1,
+        stride=1,
+        padding=0,
+        act=act,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        shared_biases=shared_bias)
+    return __pointwise_conv__
+
+
 def small_vgg(input_image, num_channels, num_classes):
    def __vgg__(ipt, num_filter, times, dropouts, num_channels_=None):
        return img_conv_group(
@@ -1119,8 +1198,9 @@ def simple_gru2(input,
    :param gru_bias_attr: bias parameter attribute of gru layer,
                          False means no bias, None means default bias.
    :type gru_bias_attr: ParameterAttribute|False|None
-    :param gru_layer_attr: Extra attribute of the gru layer.
-    :type gru_layer_attr: ExtraLayerAttribute
+    :param gru_param_attr: param parameter attribute of gru layer,
+                          None means default param.
+    :type gru_param_attr: ParameterAttribute|None
    :return: the gru group.
    :rtype: LayerOutput
    """

--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -16,12 +16,13 @@ import regularizer
 from param_attr import ParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, GPUPlace
+import clip

 Tensor = LoDTensor
 __all__ = framework.__all__ + executor.__all__ + [
    'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
-    'DataFeeder'
+    'DataFeeder', 'clip'
 ]



--- a/python/paddle/v2/fluid/clip.py
+++ b/python/paddle/v2/fluid/clip.py
+import functools
+import layers
+
+__all__ = ['GradientClipByValue', 'append_gradient_clip_ops']
+
+
+class BaseGradientClipAttr(object):
+    def process_context(self, context, p_g):
+        raise NotImplementedError()
+
+    def create_operators(self, param, grad):
+        raise NotImplementedError()
+
+
+class NullGradientClipAttr(BaseGradientClipAttr):
+    def process_context(self, context, p_g):
+        pass
+
+    def create_operators(self, param, grad):
+        return param, grad
+
+
+class GradientClipByValue(BaseGradientClipAttr):
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def process_context(self, context, p_g):
+        pass
+
+    def create_operators(self, param, grad):
+        new_grad = layers.clip(x=grad, min=self.min, max=self.max)
+        return param, new_grad
+
+
+def append_gradient_clip_ops(param_grad):
+    context = dict()
+    create_op_callbacks = []
+    for p, g in param_grad:
+        clip_attr = getattr(p, 'clip_attr', NullGradientClipAttr())
+        if clip_attr is None:
+            clip_attr = NullGradientClipAttr()
+        if not isinstance(clip_attr, BaseGradientClipAttr):
+            raise TypeError(
+                "clip attribute should be an instance of BaseGradientClippingAttr"
+            )
+
+        clip_attr.process_context(context=context, p_g=param_grad)
+        create_op_callbacks.append(
+            functools.partial(
+                clip_attr.create_operators, param=p, grad=g))
+
+    return [each_callback() for each_callback in create_op_callbacks]
+
+
+ClipByValue = GradientClipByValue
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
 import numpy as np

 import layers
-from framework import Program, unique_name, Variable
+from framework import Program, unique_name, Variable, program_guard
 from layer_helper import LayerHelper

-__all__ = ['Accuracy']
+__all__ = ['Accuracy', 'ChunkEvaluator']


 def _clone_var_(block, var):
@@ -49,15 +49,12 @@ class Evaluator(object):
        if reset_program is None:
            reset_program = Program()

+        with program_guard(main_program=reset_program):
            for var in self.states:
                assert isinstance(var, Variable)
                g_var = _clone_var_(reset_program.current_block(), var)
                layers.fill_constant(
-                shape=g_var.shape,
-                value=0.0,
-                dtype=g_var.dtype,
-                out=g_var,
-                main_program=reset_program)
+                    shape=g_var.shape, value=0.0, dtype=g_var.dtype, out=g_var)

        executor.run(reset_program)

@@ -104,20 +101,14 @@ class Accuracy(Evaluator):
        self.total = self.create_state(dtype='int64', shape=[1], suffix='total')
        self.correct = self.create_state(
            dtype='int64', shape=[1], suffix='correct')
-        kwargs = {'main_program': main_program}
        total = self.helper.create_tmp_variable(dtype='int')
        correct = self.helper.create_tmp_variable(dtype='int')
        acc = layers.accuracy(
-            input=input,
-            label=label,
-            k=k,
-            total=total,
-            correct=correct,
-            **kwargs)
-        total = layers.cast(x=total, dtype='int64', **kwargs)
-        correct = layers.cast(x=correct, dtype='int64', **kwargs)
-        layers.sums(input=[self.total, total], out=self.total, **kwargs)
-        layers.sums(input=[self.correct, correct], out=self.correct, **kwargs)
+            input=input, label=label, k=k, total=total, correct=correct)
+        total = layers.cast(x=total, dtype='int64')
+        correct = layers.cast(x=correct, dtype='int64')
+        layers.sums(input=[self.total, total], out=self.total)
+        layers.sums(input=[self.correct, correct], out=self.correct)

        self.metrics.append(acc)

@@ -125,10 +116,75 @@ class Accuracy(Evaluator):
        if eval_program is None:
            eval_program = Program()
        block = eval_program.current_block()
-        kwargs = {'main_program': eval_program}
+        with program_guard(main_program=eval_program):
            total = _clone_var_(block, self.total)
            correct = _clone_var_(block, self.correct)
-        total = layers.cast(total, dtype='float32', **kwargs)
-        correct = layers.cast(correct, dtype='float32', **kwargs)
-        out = layers.elementwise_div(x=correct, y=total, **kwargs)
+            total = layers.cast(total, dtype='float32')
+            correct = layers.cast(correct, dtype='float32')
+            out = layers.elementwise_div(x=correct, y=total)
        return np.array(executor.run(eval_program, fetch_list=[out])[0])
+
+
+class ChunkEvaluator(Evaluator):
+    """
+    Accumulate counter numbers output by chunk_eval from mini-batches and 
+    compute the precision recall and F1-score using the accumulated counter 
+    numbers.
+    """
+
+    def __init__(
+            self,
+            input,
+            label,
+            chunk_scheme,
+            num_chunk_types,
+            excluded_chunk_types=None, ):
+        super(ChunkEvaluator, self).__init__("chunk_eval")
+        main_program = self.helper.main_program
+        if main_program.current_block().idx != 0:
+            raise ValueError("You can only invoke Evaluator in root block")
+
+        self.num_infer_chunks = self.create_state(
+            dtype='int64', shape=[1], suffix='num_infer_chunks')
+        self.num_label_chunks = self.create_state(
+            dtype='int64', shape=[1], suffix='num_label_chunks')
+        self.num_correct_chunks = self.create_state(
+            dtype='int64', shape=[1], suffix='num_correct_chunks')
+        precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval(
+            input=input,
+            label=label,
+            chunk_scheme=chunk_scheme,
+            num_chunk_types=num_chunk_types,
+            excluded_chunk_types=excluded_chunk_types, )
+        layers.sums(
+            input=[self.num_infer_chunks, num_infer_chunks],
+            out=self.num_infer_chunks)
+        layers.sums(
+            input=[self.num_label_chunks, num_label_chunks],
+            out=self.num_label_chunks)
+        layers.sums(
+            input=[self.num_correct_chunks, num_correct_chunks],
+            out=self.num_correct_chunks)
+
+        self.metrics.extend([precision, recall, f1_score])
+
+    def eval(self, executor, eval_program=None):
+        if eval_program is None:
+            eval_program = Program()
+        block = eval_program.current_block()
+        num_infer_chunks, num_label_chunks, num_correct_chunks = executor.run(
+            eval_program,
+            fetch_list=[_clone_var_(block, state) for state in self.states])
+        num_infer_chunks = num_infer_chunks[0]
+        num_label_chunks = num_label_chunks[0]
+        num_correct_chunks = num_correct_chunks[0]
+        precision = float(
+            num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
+        recall = float(
+            num_correct_chunks) / num_label_chunks if num_label_chunks else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if num_correct_chunks else 0
+        return np.array(
+            [precision], dtype='float32'), np.array(
+                [recall], dtype='float32'), np.array(
+                    [f1_score], dtype='float32')
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -46,6 +46,13 @@ class Executor(object):
            p.set_place(each)
            act_places.append(p)

+        # TODO(dzhwinter) : consider that our fluid tests all written in 
+        # GPUPlace(gpu_id), this will be changed in next PR.
+        if core.is_compile_gpu():
+            core.init_devices(["CPU", "GPU:0"])
+        else:
+            core.init_devices(["CPU"])
+
        self.executor = core.Executor(act_places)
        self.places = places


--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
 import collections
+import contextlib

 import numpy as np
-from . import core
+
 import proto.framework_pb2 as framework_pb2
-import google.protobuf.message
-import contextlib
+from . import core

 __all__ = [
    'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
@@ -12,6 +12,18 @@ __all__ = [
    'switch_main_program'
 ]

+EMPTY_VAR_NAME = core.kEmptyVarName()
+TEMP_VAR_NAME = core.kTempVarName()
+GRAD_VAR_SUFFIX = core.kGradVarSuffix()
+ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
+
+
+def grad_var_name(var_name):
+    """
+    return gradient name for a certain var name
+    """
+    return var_name + GRAD_VAR_SUFFIX
+

 def unique_name(prefix):
    """
@@ -704,6 +716,7 @@ class Block(object):
                trainable=p.trainable,
                optimize_attr=p.optimize_attr,
                regularizer=p.regularizer,
+                clip_attr=p.clip_attr,
                name=v.name)
            self.vars[new_p.name] = new_p

@@ -866,6 +879,8 @@ class Parameter(Variable):

        self.regularizer = kwargs.get('regularizer', None)

+        self.clip_attr = kwargs.get('clip_attr', None)
+

 # program is a global instance.
 _main_program_ = Program()

--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -21,19 +21,11 @@ class LayerHelper(object):

    @property
    def main_program(self):
-        prog = self.kwargs.get('main_program', None)
-        if prog is None:
        return default_main_program()
-        else:
-            return prog

    @property
    def startup_program(self):
-        prog = self.kwargs.get('startup_program', None)
-        if prog is None:
        return default_startup_program()
-        else:
-            return prog

    def append_op(self, *args, **kwargs):
        return self.main_program.current_block().append_op(*args, **kwargs)
@@ -151,13 +143,6 @@ class LayerHelper(object):
            persistable=True,
            initializer=initializer)

-    @property
-    def to_kwargs(self):
-        return {
-            'main_program': self.main_program,
-            'startup_program': self.startup_program
-        }
-
    def append_bias_op(self, input_var, dim_start=1, dim_end=None):
        """
        Append bias operator and return its output. If the user does not set

--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -14,11 +14,7 @@ __all__ = [
 ]


-def split_lod_tensor(input,
-                     mask,
-                     level=0,
-                     main_program=None,
-                     startup_program=None):
+def split_lod_tensor(input, mask, level=0):
    helper = LayerHelper('split_lod_tensor', **locals())
    out_true = helper.create_tmp_variable(dtype=input.dtype)
    out_false = helper.create_tmp_variable(dtype=input.dtype)
@@ -34,13 +30,7 @@ def split_lod_tensor(input,
    return out_true, out_false


-def merge_lod_tensor(in_true,
-                     in_false,
-                     x,
-                     mask,
-                     level=0,
-                     main_program=None,
-                     startup_program=None):
+def merge_lod_tensor(in_true, in_false, x, mask, level=0):
    helper = LayerHelper('merge_lod_tensor', **locals())
    out = helper.create_tmp_variable(dtype=in_true.dtype)
    helper.append_op(
@@ -135,9 +125,8 @@ class StaticRNN(object):
    IN_RNN_BLOCK = 1
    AFTER_RNN_BLOCK = 2

-    def __init__(self, name=None, main_program=None):
-        self.helper = LayerHelper(
-            "static_rnn", name=name, main_program=main_program)
+    def __init__(self, name=None):
+        self.helper = LayerHelper("static_rnn", name=name)
        self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
        self.inputs = []  # input variable list in current block
        self.outputs = []  # output variable list in parent block
@@ -354,8 +343,8 @@ class While(object):
    IN_WHILE_BLOCK = 1
    AFTER_WHILE_BLOCK = 2

-    def __init__(self, cond, name=None, main_program=None):
-        self.helper = LayerHelper("while", name=name, main_program=main_program)
+    def __init__(self, cond, name=None):
+        self.helper = LayerHelper("while", name=name)
        self.status = While.BEFORE_WHILE_BLOCK
        if not isinstance(cond, Variable):
            raise TypeError("condition should be a variable")
@@ -406,7 +395,7 @@ class While(object):
            attrs={'sub_block': while_block})


-def lod_rank_table(x, level=0, main_program=None):
+def lod_rank_table(x, level=0):
    """
    This function creates an operator for creating a LOD_RANK_TABLE
    using the input x.
@@ -423,7 +412,7 @@ def lod_rank_table(x, level=0, main_program=None):
    return table


-def max_sequence_len(rank_table, main_program=None):
+def max_sequence_len(rank_table):
    """
    This function creates an operator to calculate the length of
    max seqence through input rank_table(should be a lod_rank_table)
@@ -437,7 +426,7 @@ def max_sequence_len(rank_table, main_program=None):
    return res


-def topk(input, k, main_program=None, startup_program=None):
+def topk(input, k):
    helper = LayerHelper('topk', **locals())
    topk_out = helper.create_tmp_variable(dtype=input.data_type)
    topk_indices = helper.create_tmp_variable(dtype='int64')
@@ -450,7 +439,7 @@ def topk(input, k, main_program=None, startup_program=None):
    return topk_out, topk_indices


-def lod_tensor_to_array(x, table, main_program=None):
+def lod_tensor_to_array(x, table):
    """
    This function creates an operator to convert an LOD_Tensor to
    an array.
@@ -468,7 +457,7 @@ def lod_tensor_to_array(x, table, main_program=None):
    return array


-def array_to_lod_tensor(x, table, main_program=None, startup_program=None):
+def array_to_lod_tensor(x, table):
    """
    This function creates an operator to convert an array to a
    LOD_Tensor.
@@ -483,11 +472,7 @@ def array_to_lod_tensor(x, table, main_program=None, startup_program=None):
    return tmp


-def increment(x,
-              value=1.0,
-              in_place=True,
-              main_program=None,
-              startup_program=None):
+def increment(x, value=1.0, in_place=True):
    """
    This function creates an operator to increment each value in the input
    `x` by an amount: `value` as mentioned in the input parameter. This
@@ -506,7 +491,7 @@ def increment(x,
    return out


-def array_write(x, i, array=None, main_program=None, startup_program=None):
+def array_write(x, i, array=None):
    """
    This function creates an operator to write the data out as a
    LOD_TENSOR_ARRAY.
@@ -525,7 +510,7 @@ def array_write(x, i, array=None, main_program=None, startup_program=None):
    return array


-def create_array(dtype, main_program=None):
+def create_array(dtype):
    helper = LayerHelper("array", **locals())
    return helper.create_variable(
        name="{0}.out".format(helper.name),
@@ -533,7 +518,25 @@ def create_array(dtype, main_program=None):
        dtype=dtype)


-def less_than(x, y, cond=None, main_program=None, **ignored):
+def less_than(x, y, cond=None, **ignored):
+    """
+    **Less than**
+
+    This layer returns the truth value of :math:`x < y` elementwise.
+
+    Args:
+        x(Variable): First operand of *less_than*
+        y(Variable): Second operand of *less_than*
+        cond(Variable|None): Optional output variable to store the result of *less_than*
+
+    Returns:
+        Variable: The tensor variable storing the output of *less_than*.
+
+    Examples:
+        .. code-block:: python
+
+          less = fluid.layers.less_than(x=label, y=limit)
+    """
    helper = LayerHelper("less_than", **locals())
    if cond is None:
        cond = helper.create_tmp_variable(dtype='bool')
@@ -545,7 +548,7 @@ def less_than(x, y, cond=None, main_program=None, **ignored):
    return cond


-def array_read(array, i, main_program=None, startup_program=None):
+def array_read(array, i):
    """
    This function creates an operator to read the data in as a
    LOD_TENSOR_ARRAY.
@@ -564,7 +567,7 @@ def array_read(array, i, main_program=None, startup_program=None):
    return out


-def shrink_memory(x, i, table, main_program=None, startup_program=None):
+def shrink_memory(x, i, table):
    """
    This function creates an operator to shrink_rnn_memory using the RankTable
    as mentioned in the input parameter.
@@ -581,7 +584,7 @@ def shrink_memory(x, i, table, main_program=None, startup_program=None):
    return out


-def array_length(array, main_program=None):
+def array_length(array):
    """
    This function creates an operator to find the length of the
    LOD_TENSOR_ARRAY.
@@ -611,20 +614,12 @@ class ConditionalBlockGuard(BlockGuard):


 class ConditionalBlock(object):
-    def __init__(self,
-                 inputs,
-                 name=None,
-                 main_program=None,
-                 startup_program=None):
+    def __init__(self, inputs, name=None):
        for each_input in inputs:
            if not isinstance(each_input, Variable):
                raise TypeError("Each input should be variable")
        self.inputs = inputs
-        self.helper = LayerHelper(
-            'conditional_block',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
+        self.helper = LayerHelper('conditional_block', name=name)

    def block(self):
        return ConditionalBlockGuard(self)
@@ -709,15 +704,10 @@ class IfElse(object):
    IN_IF_ELSE_TRUE_BLOCKS = 1
    IN_IF_ELSE_FALSE_BLOCKS = 2

-    def __init__(self, cond, name=None, main_program=None,
-                 startup_program=None):
+    def __init__(self, cond, name=None):
        if not isinstance(cond, Variable):
            raise TypeError("cond must be a Variable")
-        self.helper = LayerHelper(
-            'ifelse',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
+        self.helper = LayerHelper('ifelse', name=name)
        self.cond = cond
        self.input_table = {}
        self.status = IfElse.OUT_IF_ELSE_BLOCKS
@@ -782,11 +772,7 @@ class IfElse(object):
            out_table.append(outside_out)

            # assign local var to outside
-            assign(
-                input=each_out,
-                output=outside_out,
-                main_program=self.helper.main_program,
-                startup_program=self.helper.startup_program)
+            assign(input=each_out, output=outside_out)

    def __call__(self):
        if self.status != self.OUT_IF_ELSE_BLOCKS:
@@ -810,9 +796,7 @@ class IfElse(object):
                    in_false=false_var,
                    mask=self.cond,
                    x=self.cond,
-                    level=0,
-                    main_program=self.helper.main_program,
-                    startup_program=self.helper.startup_program))
+                    level=0))
        return rlist


@@ -821,12 +805,8 @@ class DynamicRNN(object):
    IN_RNN = 1
    AFTER_RNN = 2

-    def __init__(self, name=None, main_program=None, startup_program=None):
-        self.helper = LayerHelper(
-            'dynamic_rnn',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
+    def __init__(self, name=None):
+        self.helper = LayerHelper('dynamic_rnn', name=name)
        self.status = DynamicRNN.BEFORE_RNN
        self.lod_rank_table = None
        self.max_seq_len = None
@@ -880,8 +860,7 @@ class DynamicRNN(object):
            inputs={'X': x,
                    'RankTable': self.lod_rank_table},
            outputs={'Out': input_array})
-        return array_read(
-            array=input_array, i=self.step_idx, **self.helper.to_kwargs)
+        return array_read(array=input_array, i=self.step_idx)

    @contextlib.contextmanager
    def block(self):
@@ -892,32 +871,18 @@ class DynamicRNN(object):
        self.status = DynamicRNN.IN_RNN
        with self.while_op.block():
            yield
-            increment(
-                x=self.step_idx,
-                value=1.0,
-                in_place=True,
-                **self.helper.to_kwargs)
+            increment(x=self.step_idx, value=1.0, in_place=True)

            for new_mem, mem_array in self.mem_link:
-                array_write(
-                    x=new_mem,
-                    i=self.step_idx,
-                    array=mem_array,
-                    **self.helper.to_kwargs)
-
-            less_than(
-                x=self.step_idx,
-                y=self.max_seq_len,
-                cond=self.cond,
-                **self.helper.to_kwargs)
+                array_write(x=new_mem, i=self.step_idx, array=mem_array)
+
+            less_than(x=self.step_idx, y=self.max_seq_len, cond=self.cond)

        self.status = DynamicRNN.AFTER_RNN
        for each_array in self.output_array:
            self.outputs.append(
                array_to_lod_tensor(
-                    x=each_array,
-                    table=self.lod_rank_table,
-                    **self.helper.to_kwargs))
+                    x=each_array, table=self.lod_rank_table))

    def __call__(self, *args, **kwargs):
        if self.status != DynamicRNN.AFTER_RNN:
@@ -944,13 +909,9 @@ class DynamicRNN(object):
                inputs={'X': init,
                        'I': self.zero_idx},
                outputs={'Out': mem_array})
-            retv = array_read(
-                array=mem_array, i=self.step_idx, **self.helper.to_kwargs)
+            retv = array_read(array=mem_array, i=self.step_idx)
            retv = shrink_memory(
-                x=retv,
-                i=self.step_idx,
-                table=self.lod_rank_table,
-                **self.helper.to_kwargs)
+                x=retv, i=self.step_idx, table=self.lod_rank_table)
            self.mem_dict[retv.name] = mem_array
            return retv
        else:

--- a/python/paddle/v2/fluid/layers/io.py
+++ b/python/paddle/v2/fluid/layers/io.py
@@ -10,8 +10,6 @@ def data(name,
         dtype='float32',
         lod_level=0,
         type=core.VarDesc.VarType.LOD_TENSOR,
-         main_program=None,
-         startup_program=None,
         stop_gradient=True):
    """
    Data Layer.

--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -5,12 +5,15 @@ All layers just related to the neural network.
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
+from ..param_attr import ParamAttr
+from tensor import concat

 __all__ = [
    'fc', 'embedding', 'dynamic_lstm', 'gru_unit', 'linear_chain_crf',
    'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy',
    'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d',
-    'batch_norm', 'beam_search_decode', 'conv2d_transpose'
+    'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand',
+    'lstm_unit', 'reduce_sum'
 ]


@@ -20,36 +23,50 @@ def fc(input,
       param_attr=None,
       bias_attr=None,
       act=None,
-       name=None,
-       main_program=None,
-       startup_program=None):
+       name=None):
    """
-    Fully Connected Layer.
+    **Fully Connected Layer**
+
+    This layer accepts multiple inputs and applies a linear transformation to each input.
+    If activation type is provided, the corresponding activation function is applied to the
+    output of the linear transformation. For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = Act(WX + b)
+
+    In the above equation:
+
+        * :math:`X`: Input value, a tensor with rank at least 2.
+        * :math:`W`: Weight, a 2-D tensor with shape [M, N].
+        * :math:`b`: Bias, a 2-D tensor with shape [M, 1].
+        * :math:`Act`: Activation function.
+        * :math:`Out`: Output value, same shape with :math:`X`.
+
+    All the input variables are passed in as local variables to the LayerHelper
+    constructor.

    Args:
-       input: The input tensor to the function
-       size: The size of the layer
-       num_flatten_dims: Number of columns in input
-       param_attr: The parameters/weights to the FC Layer
-       param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used
-       bias_attr: The bias parameter for the FC layer
-       bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used
-       act: Activation to be applied to the output of FC layer
-       name: Name/alias of the function
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
-
-    This function can take in multiple inputs and performs the Fully Connected
-    function (linear transformation) on top of each of them.
-    So for input x, the output will be : Wx + b. Where W is the parameter,
-    b the bias and x is the input.
-
-    The function also applies an activation (non-linearity) on top of the
-    output, if activation is passed in the input.
-
-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
+       input(Variable|list): Input tensors. Each tensor has a rank of atleast 2
+       size(int): Output size
+       num_flatten_dims(int): Number of columns in input
+       param_attr(ParamAttr|list): The parameters/weights to the FC Layer
+       bias_attr(ParamAttr|list): Bias parameter for the FC layer
+       act(str): Activation type
+       name(str): Name/alias of the function

+    Returns:
+        Variable: The tensor variable storing the transformation and \
+                  non-linearity activation result.
+
+    Raises:
+        ValueError: If rank of input tensor is less than 2.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32')
+          fc = fluid.layers.fc(input=data, size=1000, act="tanh")
    """
    helper = LayerHelper('fc', **locals())

@@ -88,33 +105,32 @@ def fc(input,
    return helper.append_activation(pre_activation)


-def embedding(input,
-              size,
-              is_sparse=False,
-              param_attr=None,
-              dtype='float32',
-              main_program=None,
-              startup_program=None):
+def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'):
    """
-    Embedding Layer.
+    **Embedding Layer**
+
+    This layer is used to lookup a vector of IDs, provided by *input*, in a lookup table.
+    The result of this lookup is the embedding of each ID in the *input*.
+
+    All the input variables are passed in as local variables to the LayerHelper
+    constructor.

    Args:
-       param_initializer:
-       input: The input to the function
-       size: The size of the layer
-       is_sparse: A flag that decleares whether the input is sparse
-       param_attr: Parameters for this layer
-       dtype: The type of data : float32, float_16, int etc
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
+       input(Variable): Input to the function
+       size(int): Output size
+       is_sparse(bool): Boolean flag that specifying whether the input is sparse
+       param_attr(ParamAttr): Parameters for this layer
+       dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc

-    This function can take in the input (which is a vector of IDs) and
-    performs a lookup in the lookup_table using these IDs, to result into
-    the embedding of each ID in the input.
+    Returns:
+        Variable: The tensor variable storing the embeddings of the \
+                  supplied inputs.

-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
+    Examples:
+        .. code-block:: python

+          data = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32')
+          fc = fluid.layers.embedding(input=data, size=16)
    """

    helper = LayerHelper('embedding', **locals())
@@ -140,9 +156,7 @@ def dynamic_lstm(input,
                 gate_activation='sigmoid',
                 cell_activation='tanh',
                 candidate_activation='tanh',
-                 dtype='float32',
-                 main_program=None,
-                 startup_program=None):
+                 dtype='float32'):
    helper = LayerHelper('lstm', **locals())
    size = size / 4
    weight = helper.create_parameter(
@@ -185,9 +199,7 @@ def gru_unit(input,
             weight=None,
             bias=None,
             activation='tanh',
-             gate_activation='sigmoid',
-             main_program=None,
-             startup_program=None):
+             gate_activation='sigmoid'):
    """
    GRUUnit Operator implements partial calculations of the GRU unit as following:

@@ -250,11 +262,7 @@ def gru_unit(input,
    return updated_hidden, reset_hidden_pre, gate


-def linear_chain_crf(input,
-                     label,
-                     param_attr=None,
-                     main_program=None,
-                     startup_program=None):
+def linear_chain_crf(input, label, param_attr=None):
    helper = LayerHelper('linear_chain_crf', **locals())
    size = input.shape[1]
    transition = helper.create_parameter(
@@ -280,11 +288,7 @@ def linear_chain_crf(input,
    return log_likelihood


-def crf_decoding(input,
-                 param_attr,
-                 label=None,
-                 main_program=None,
-                 startup_program=None):
+def crf_decoding(input, param_attr, label=None):
    helper = LayerHelper('crf_decoding', **locals())
    transition = helper.get_parameter(param_attr.name)
    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -392,8 +396,8 @@ def chunk_eval(input,
               excluded_chunk_types=None,
               **kwargs):
    """
-    This function computes the accuracy using the input and label.
-    The output is the top_k inputs and their indices.
+    This function computes and outputs the precision, recall and
+    F1-score of chunk detection.
    """
    helper = LayerHelper("chunk_eval", **kwargs)

@@ -401,6 +405,9 @@ def chunk_eval(input,
    precision = helper.create_tmp_variable(dtype="float32")
    recall = helper.create_tmp_variable(dtype="float32")
    f1_score = helper.create_tmp_variable(dtype="float32")
+    num_infer_chunks = helper.create_tmp_variable(dtype="int64")
+    num_label_chunks = helper.create_tmp_variable(dtype="int64")
+    num_correct_chunks = helper.create_tmp_variable(dtype="int64")

    helper.append_op(
        type="chunk_eval",
@@ -409,14 +416,17 @@ def chunk_eval(input,
        outputs={
            "Precision": [precision],
            "Recall": [recall],
-            "F1-Score": [f1_score]
+            "F1-Score": [f1_score],
+            "NumInferChunks": [num_infer_chunks],
+            "NumLabelChunks": [num_label_chunks],
+            "NumCorrectChunks": [num_correct_chunks]
        },
        attrs={
            "num_chunk_types": num_chunk_types,
-            'chunk_scheme': chunk_scheme,
-            'excluded_chunk_types': excluded_chunk_types or []
+            "chunk_scheme": chunk_scheme,
+            "excluded_chunk_types": excluded_chunk_types or []
        })
-    return precision, recall, f1_score
+    return precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks


 def sequence_conv(input,
@@ -426,9 +436,7 @@ def sequence_conv(input,
                  padding=None,
                  bias_attr=None,
                  param_attr=None,
-                  act=None,
-                  main_program=None,
-                  startup_program=None):
+                  act=None):
    """
    This function creates the op for sequence_conv, using the inputs and
    other convolutional configurations for the filters and stride as given
@@ -471,9 +479,7 @@ def conv2d(input,
           param_attr=None,
           bias_attr=None,
           act=None,
-           name=None,
-           main_program=None,
-           startup_program=None):
+           name=None):
    """
    This function creates the op for a 2-dimensional Convolution.
    This is performed using the parameters of filters(size, dimensionality etc)
@@ -559,9 +565,7 @@ def pool2d(input,
           pool_type,
           pool_stride=None,
           pool_padding=None,
-           global_pooling=False,
-           main_program=None,
-           startup_program=None):
+           global_pooling=False):
    """
    This function adds the operator for pooling in 2 dimensions, using the
    pooling configurations mentioned in input parameters.
@@ -607,9 +611,7 @@ def batch_norm(input,
               epsilon=1e-05,
               param_attr=None,
               bias_attr=None,
-               data_layout='NCHW',
-               main_program=None,
-               startup_program=None):
+               data_layout='NCHW'):
    """
    This function helps create an operator to implement
    the BatchNorm layer using the configurations from the input parameters.
@@ -679,7 +681,7 @@ def batch_norm(input,
    return helper.append_activation(batch_norm_out)


-def beam_search_decode(ids, scores, main_program=None, startup_program=None):
+def beam_search_decode(ids, scores):
    helper = LayerHelper('beam_search_decode', **locals())
    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
@@ -703,9 +705,7 @@ def conv2d_transpose(input,
                     padding=None,
                     stride=None,
                     dilation=None,
-                     param_attr=None,
-                     main_program=None,
-                     startup_program=None):
+                     param_attr=None):
    """
    The transpose of conv2d layer.

@@ -795,3 +795,220 @@ def conv2d_transpose(input,
        attrs=op_attr)

    return out
+
+
+def sequence_expand(x, y):
+    """Sequence Expand Layer. This layer will expand the input variable **x**
+    according to LoD information of **y**. And the following examples will
+    explain how sequence_expand works:
+
+    .. code-block:: text
+
+        * Case 1
+            x is a LoDTensor:
+                x.lod = [[0,       2, 3],
+                         [0, 1,    3, 4]]
+                x.data = [a, b, c, d]
+                x.dims = [4, 1]
+
+            y is a LoDTensor:
+                y.lod = [[0,    2,    4],
+                         [0, 3, 6, 7, 8]]
+
+            with condition len(y.lod[-1]) - 1 == x.dims[0]
+
+            then output is a 2-level LoDTensor:
+                out.lod = [[0,                2,    4],
+                           [0,       3,       6, 7, 8]]
+                out.data = [a, a, a, b, b, b, c, d]
+                out.dims = [8, 1]
+
+        * Case 2
+            x is a Tensor:
+                x.data = [a, b, c]
+                x.dims = [3, 1]
+
+            y is a LoDTensor:
+                y.lod = [[0, 2, 3, 6]]
+
+            with condition len(y.lod[-1]) - 1 == x.dims[0]
+
+            then output is a 1-level LoDTensor:
+                out.lod = [[0,    2, 3,      6]]
+                out.data = [a, a, b, c, c, c]
+                out.dims = [6, 1]
+
+    Args:
+        x (Variable): The input variable which is a Tensor or LoDTensor.
+        y (Variable): The input variable which is a LoDTensor.
+
+    Returns:
+        Variable: The expanded variable which is a LoDTensor.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[10, 20],
+                             dtype='float32', lod_level=1)
+            out = layers.sequence_expand(x=x, y=y)
+    """
+    helper = LayerHelper('sequence_expand', input=x, **locals())
+    dtype = helper.input_dtype()
+    tmp = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='sequence_expand', inputs={'X': x,
+                                        'Y': y}, outputs={'Out': tmp})
+    return tmp
+
+
+def lstm_unit(x_t,
+              hidden_t_prev,
+              cell_t_prev,
+              forget_bias=0.0,
+              param_attr=None,
+              bias_attr=None):
+    """Lstm unit layer. The equation of a lstm step is:
+
+        .. math::
+
+            i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
+
+            f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
+
+            c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
+
+            o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
+
+            h_t & = o_t tanh(c_t)
+
+    The inputs of lstm unit includes :math:`x_t`, :math:`h_{t-1}` and
+    :math:`c_{t-1}`. The implementation separates the linear transformation
+    and non-linear transformation apart. Here, we take :math:`i_t` as an
+    example. The linear transformation is applied by calling a `fc` layer and
+    the equation is:
+
+        .. math::
+
+            L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i
+
+    The non-linear transformation is applied by calling `lstm_unit_op` and the
+    equation is:
+
+        .. math::
+
+            i_t = \sigma(L_{i_t})
+
+    This layer has two outputs including :math:`h_t` and :math:`o_t`.
+
+    Args:
+        x_t (Variable): The input value of current step.
+        hidden_t_prev (Variable): The hidden value of lstm unit.
+        cell_t_prev (Variable): The cell value of lstm unit.
+        forget_bias (float): The forget bias of lstm unit.
+        param_attr (ParamAttr): The attributes of parameter weights, used to set
+            initializer, name etc.
+        bias_attr (ParamAttr): The attributes of bias weights, if not False,
+            bias weights will be created and be set to default value.
+
+    Returns:
+        tuple: The hidden value and cell value of lstm unit.
+
+    Raises:
+        ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\
+                not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \
+                and **cell_t_prev** not be the same.
+
+    Examples:
+
+        .. code-block:: python
+
+             x_t = fluid.layers.fc(input=x_t_data, size=10)
+             prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20)
+             prev_cell = fluid.layers.fc(input=prev_cell_data, size=30)
+             hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t,
+                                                    hidden_t_prev=prev_hidden,
+                                                    cell_t_prev=prev_cell)
+    """
+    helper = LayerHelper('lstm_unit', **locals())
+
+    if len(x_t.shape) != 2:
+        raise ValueError("Rank of x_t must be 2.")
+
+    if len(hidden_t_prev.shape) != 2:
+        raise ValueError("Rank of hidden_t_prev must be 2.")
+
+    if len(cell_t_prev.shape) != 2:
+        raise ValueError("Rank of cell_t_prev must be 2.")
+
+    if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
+            0] != cell_t_prev.shape[0]:
+        raise ValueError("The 1s dimension of x_t, hidden_t_prev and "
+                         "cell_t_prev must be the same.")
+
+    if bias_attr is None:
+        bias_attr = ParamAttr()
+
+    size = cell_t_prev.shape[1]
+    concat_out = concat(input=[x_t, hidden_t_prev], axis=1)
+    fc_out = fc(input=concat_out,
+                size=4 * size,
+                param_attr=param_attr,
+                bias_attr=bias_attr)
+    dtype = x_t.dtype
+    c = helper.create_tmp_variable(dtype)
+    h = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='lstm_unit',
+        inputs={"X": fc_out,
+                "C_prev": cell_t_prev},
+        outputs={"C": c,
+                 "H": h},
+        attrs={"forget_bias": forget_bias})
+
+    return h, c
+
+
+def reduce_sum(input, dim=None, keep_dim=False):
+    """
+    Computes the sum of tensor elements over the given dimension. 
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (int|None): The dimension along which the sum is performed. If 
+            :attr:`None`, sum all elements of :attr:`input` and return a 
+            Tensor variable with a single element, otherwise must be in the 
+            range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, 
+            the dimension to reduce is :math:`rank + dim`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the 
+            output Tensor. The result tensor will have one fewer dimension 
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+    
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with following elements:
+            #    [[0.2, 0.3, 0.5, 0.9]
+            #     [0.1, 0.2, 0.6, 0.7]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_sum(x)  # [3.5]
+            fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
+            fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
+            fluid.layers.reduce_sum(x, dim=1, keep_dim=True)  # [[1.9], [1.6]]
+    """
+    helper = LayerHelper('reduce_sum', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='reduce_sum',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim != None else 0,
+            'keep_dim': keep_dim,
+            'reduce_all': True if dim == None else False
+        })
+    return out
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -2,7 +2,7 @@ from ..registry import register_layer
 __all__ = [
    'mean', 'mul', 'dropout', 'reshape', 'sigmoid', 'scale', 'transpose',
    'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div',
-    'elementwise_sub', 'elementwise_mul', 'clip', 'abs'
+    'elementwise_sub', 'elementwise_mul', 'clip', 'abs', 'sequence_softmax'
 ]

 for _OP in set(__all__):

--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -6,12 +6,12 @@ __all__ = [
 ]


-def create_tensor(dtype, name=None, main_program=None, startup_program=None):
+def create_tensor(dtype, name=None):
    helper = LayerHelper("create_tensor", **locals())
    return helper.create_variable(name=helper.name, dtype=dtype)


-def cast(x, dtype, main_program=None):
+def cast(x, dtype):
    """
    This function takes in the input with input_dtype
    and casts it to the output_dtype as the output.
@@ -27,7 +27,7 @@ def cast(x, dtype, main_program=None):
    return out


-def concat(input, axis, main_program=None, startup_program=None):
+def concat(input, axis):
    """
    This function concats the input along the axis mentioned
    and returns that as the output.
@@ -42,7 +42,7 @@ def concat(input, axis, main_program=None, startup_program=None):
    return out


-def sums(input, out=None, main_program=None, startup_program=None):
+def sums(input, out=None):
    """
    This function takes in the input and performs the sum operation on it
    and returns that as the output.
@@ -54,7 +54,7 @@ def sums(input, out=None, main_program=None, startup_program=None):
    return out


-def assign(input, output, main_program=None, startup_program=None):
+def assign(input, output):
    helper = LayerHelper('assign', **locals())
    helper.append_op(
        type='scale',
@@ -64,16 +64,28 @@ def assign(input, output, main_program=None, startup_program=None):
    return output


-def fill_constant(shape,
-                  dtype,
-                  value,
-                  out=None,
-                  main_program=None,
-                  startup_program=None):
+def fill_constant(shape, dtype, value, out=None):
    """
-    This function creates a tensor , with shape as mentioned in the input and
-    specified dtype and fills this up with a constant value that
-    comes in the input. It also sets the stop_gradient to be True.
+    **fill_constant**
+
+    This function creates a tensor of specified *shape* and 
+    *dtype*, and initializes this with a constant supplied in *value*.
+    
+    It also sets *stop_gradient* to True.
+
+    Args:
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+        value(float): Constant value to initialize the output tensor
+        out(Variable): Output Variable to initialize
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64')
    """
    helper = LayerHelper("fill_constant", **locals())
    if out is None:
@@ -94,9 +106,32 @@ def fill_constant_batch_size_like(input,
                                  dtype,
                                  value,
                                  input_dim_idx=0,
-                                  output_dim_idx=0,
-                                  main_program=None,
-                                  startup_program=None):
+                                  output_dim_idx=0):
+    """
+    **fill_constant_batch_size_like**
+
+    This function creates a tensor of specified *shape*, *dtype* and batch size, 
+    and initializes this with a constant supplied in *value*. The batch size is 
+    obtained from the `input` tensor. 
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        input(Variable): Tensor whose dimensions will be used to get batch size
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+        value(float): Constant value to initialize the output tensor
+        input_dim_idx(int): Index of input's batch size dimension
+        output_dim_idx(int): Index of output's batch size dimension
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64')
+    """
    helper = LayerHelper("fill_constant_batch_size_like", **locals())
    out = helper.create_tmp_variable(dtype=dtype)
    helper.append_op(
@@ -114,7 +149,7 @@ def fill_constant_batch_size_like(input,
    return out


-def ones(shape, dtype, main_program=None):
+def ones(shape, dtype):
    """
    This function performs the same function as fill_constant() declared above
    with the constant value being 1.0.
@@ -122,7 +157,7 @@ def ones(shape, dtype, main_program=None):
    return fill_constant(value=1.0, **locals())


-def zeros(shape, dtype, main_program=None):
+def zeros(shape, dtype):
    """
    This function performs the same function as fill_constant() declared above
    with the constant value being 0.0.

--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -10,25 +10,19 @@ def simple_img_conv_pool(input,
                         pool_stride,
                         act,
                         param_attr=None,
-                         pool_type='max',
-                         main_program=None,
-                         startup_program=None):
+                         pool_type='max'):
    conv_out = layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
-        act=act,
-        main_program=main_program,
-        startup_program=startup_program)
+        act=act)

    pool_out = layers.pool2d(
        input=conv_out,
        pool_size=pool_size,
        pool_type=pool_type,
-        pool_stride=pool_stride,
-        main_program=main_program,
-        startup_program=startup_program)
+        pool_stride=pool_stride)
    return pool_out


@@ -42,9 +36,7 @@ def img_conv_group(input,
                   conv_with_batchnorm=False,
                   conv_batchnorm_drop_rate=None,
                   pool_stride=1,
-                   pool_type=None,
-                   main_program=None,
-                   startup_program=None):
+                   pool_type=None):
    """
    Image Convolution Group, Used for vgg net.
    """
@@ -75,31 +67,19 @@ def img_conv_group(input,
            filter_size=conv_filter_size[i],
            padding=conv_padding[i],
            param_attr=param_attr[i],
-            act=local_conv_act,
-            main_program=main_program,
-            startup_program=startup_program)
+            act=local_conv_act)

        if conv_with_batchnorm[i]:
-            tmp = layers.batch_norm(
-                input=tmp,
-                act=conv_act,
-                main_program=main_program,
-                startup_program=startup_program)
+            tmp = layers.batch_norm(input=tmp, act=conv_act)
            drop_rate = conv_batchnorm_drop_rate[i]
            if abs(drop_rate) > 1e-5:
-                tmp = layers.dropout(
-                    x=tmp,
-                    dropout_prob=drop_rate,
-                    main_program=main_program,
-                    startup_program=startup_program)
+                tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)

    pool_out = layers.pool2d(
        input=tmp,
        pool_size=pool_size,
        pool_type=pool_type,
-        pool_stride=pool_stride,
-        main_program=main_program,
-        startup_program=startup_program)
+        pool_stride=pool_stride)
    return pool_out


@@ -108,21 +88,13 @@ def sequence_conv_pool(input,
                       filter_size,
                       param_attr=None,
                       act="sigmoid",
-                       pool_type="max",
-                       main_program=None,
-                       startup_program=None):
+                       pool_type="max"):
    conv_out = layers.sequence_conv(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
-        act=act,
-        main_program=main_program,
-        startup_program=startup_program)
+        act=act)

-    pool_out = layers.sequence_pool(
-        input=conv_out,
-        pool_type=pool_type,
-        main_program=main_program,
-        startup_program=startup_program)
+    pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type)
    return pool_out
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -2,10 +2,11 @@ from collections import defaultdict

 import framework
 from backward import append_backward_ops
-from framework import unique_name
+from framework import unique_name, program_guard
 from initializer import Constant
 from layer_helper import LayerHelper
 from regularizer import append_regularization_ops
+from clip import append_gradient_clip_ops

 __all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']

@@ -159,10 +160,8 @@ class Optimizer(object):

        # Create any accumulators
        program = loss.block.program
-        self.helper = LayerHelper(
-            self.__class__.__name__,
-            main_program=program,
-            startup_program=startup_program)
+        with program_guard(program, startup_program):
+            self.helper = LayerHelper(self.__class__.__name__)
            self._create_accumulators(loss.block,
                                      [p[0] for p in parameters_and_grads])

@@ -199,9 +198,13 @@ class Optimizer(object):
        `create_optimization_pass()` into one.
        """
        params_grads = append_backward_ops(loss, parameter_list, no_grad_set)
+
+        params_grads = append_gradient_clip_ops(params_grads)
+
        # Add regularization if any
        params_grads = append_regularization_ops(params_grads,
                                                 self.regularization)
+
        optimize_ops = self.create_optimization_pass(params_grads, loss,
                                                     startup_program)
        return optimize_ops

--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
 from initializer import Initializer, Xavier, Constant
 from regularizer import WeightDecayRegularizer

+__all__ = ['ParamAttr']
+

 class ParamAttr(object):
    def __init__(self,
@@ -8,12 +10,14 @@ class ParamAttr(object):
                 initializer=None,
                 learning_rate=1.0,
                 regularizer=None,
-                 trainable=True):
+                 trainable=True,
+                 clip=None):
        self.name = name
        self.initializer = initializer
        self.learning_rate = learning_rate
        self.regularizer = regularizer
        self.trainable = trainable
+        self.clip = clip

    def set_default_initializer(self, initializer):
        if initializer is None:
@@ -56,7 +60,8 @@ class ParamAttr(object):
            'name': self.name,
            'learning_rate': self.learning_rate,
            'regularizer': self.regularizer,
-            'trainable': self.trainable
+            'trainable': self.trainable,
+            'clip_attr': self.clip
        }
        if with_initializer:
            kwargs['initializer'] = self.initializer

--- a/python/paddle/v2/fluid/tests/.gitignore
+++ b/python/paddle/v2/fluid/tests/.gitignore
 image/
 fit_a_line.model/
 tmp
+cuda_profiler.txt
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -150,7 +150,7 @@ def main():
    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

-    precision, recall, f1_score = fluid.layers.chunk_eval(
+    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
        input=crf_decode,
        label=target,
        chunk_scheme="IOB",
@@ -176,20 +176,21 @@ def main():

    batch_id = 0
    for pass_id in xrange(PASS_NUM):
+        chunk_evaluator.reset(exe)
        for data in train_data():
-            outs = exe.run(fluid.default_main_program(),
+            cost, precision, recall, f1_score = exe.run(
+                fluid.default_main_program(),
                feed=feeder.feed(data),
-                           fetch_list=[avg_cost, precision, recall, f1_score])
-            avg_cost_val = np.array(outs[0])
-            precision_val = np.array(outs[1])
-            recall_val = np.array(outs[2])
-            f1_score_val = np.array(outs[3])
+                fetch_list=[avg_cost] + chunk_evaluator.metrics)
+            pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
+                exe)

            if batch_id % 10 == 0:
-                print("avg_cost=" + str(avg_cost_val))
-                print("precision_val=" + str(precision_val))
-                print("recall_val:" + str(recall_val))
-                print("f1_score_val:" + str(f1_score_val))
+                print("avg_cost:" + str(cost) + " precision:" + str(
+                    precision) + " recall:" + str(recall) + " f1_score:" + str(
+                        f1_score) + " pass_precision:" + str(
+                            pass_precision) + " pass_recall:" + str(pass_recall)
+                      + " pass_f1_score:" + str(pass_f1_score))

            # exit early for CI
            exit(0)

--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
@@ -11,7 +11,9 @@ regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
 hidden1 = fluid.layers.fc(input=image,
                          size=128,
                          act='relu',
-                          param_attr=regularizer)
+                          param_attr=fluid.ParamAttr(
+                              regularizer=regularizer,
+                              clip=fluid.clip.ClipByValue(10)))
 hidden2 = fluid.layers.fc(input=hidden1,
                          size=64,
                          act='relu',
@@ -33,11 +35,10 @@ opts = optimizer.minimize(avg_cost)
 accuracy = fluid.evaluator.Accuracy(input=predict, label=label)

 inference_program = fluid.default_main_program().clone()
-test_accuracy = fluid.evaluator.Accuracy(
-    input=predict, label=label, main_program=inference_program)
-test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
-inference_program = fluid.io.get_inference_program(
-    test_target, main_program=inference_program)
+with fluid.program_guard(inference_program):
+    test_accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
+    inference_program = fluid.io.get_inference_program(test_target)

 train_reader = paddle.batch(
    paddle.reader.shuffle(

--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
@@ -4,12 +4,7 @@ import paddle.v2.fluid as fluid
 from paddle.v2.fluid.layer_helper import LayerHelper


-def lstm(x,
-         c_pre_init,
-         hidden_dim,
-         forget_bias=None,
-         main_program=None,
-         startup_program=None):
+def lstm(x, c_pre_init, hidden_dim, forget_bias=None):
    """
    This function helps create an operator for the LSTM (Long Short Term
    Memory) cell that can be used inside an RNN.
@@ -20,15 +15,8 @@ def lstm(x,
        c_pre = rnn.memory(init=c_pre_init)
        x_t = rnn.step_input(x)

-        before_fc = fluid.layers.concat(
-            input=[x_t, c_pre],
-            axis=1,
-            main_program=main_program,
-            startup_program=startup_program)
-        after_fc = fluid.layers.fc(input=before_fc,
-                                   size=hidden_dim * 4,
-                                   main_program=main_program,
-                                   startup_program=startup_program)
+        before_fc = fluid.layers.concat(input=[x_t, c_pre], axis=1)
+        after_fc = fluid.layers.fc(input=before_fc, size=hidden_dim * 4)

        dtype = x.dtype
        c = helper.create_tmp_variable(dtype)

--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -3,10 +3,7 @@ import numpy as np
 from op_test import OpTest
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.op import Operator
-
-
-def grad_var_name(var_name):
-    return var_name + "@GRAD"
+from paddle.v2.fluid.framework import grad_var_name


 def get_backward_op(scope, op, no_grad_set):

--- a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
+++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
@@ -147,7 +147,13 @@ class TestChunkEvalOp(OpTest):
            'Recall': np.asarray(
                [recall], dtype='float32'),
            'F1-Score': np.asarray(
-                [f1], dtype='float32')
+                [f1], dtype='float32'),
+            'NumInferChunks': np.asarray(
+                [self.num_infer_chunks], dtype='int64'),
+            'NumLabelChunks': np.asarray(
+                [self.num_label_chunks], dtype='int64'),
+            'NumCorrectChunks': np.asarray(
+                [self.num_correct_chunks], dtype='int64')
        }

    def setUp(self):

--- a/python/paddle/v2/fluid/tests/test_const_value.py
+++ b/python/paddle/v2/fluid/tests/test_const_value.py
+import unittest
+import paddle.v2.fluid.framework as framework
+
+
+class ConditionalBlock(unittest.TestCase):
+    def test_const_value(self):
+        self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
+        self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@")
+        self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
+        self.assertEqual(framework.ZERO_VAR_SUFFIX, "@ZERO")
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_dropout_op.py
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
@@ -47,7 +47,9 @@ class TestDropoutOp4(OpTest):
        self.op_type = "dropout"
        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
        self.attrs = {'dropout_prob': 0.35, 'is_test': True}
-        self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
+        self.outputs = {
+            'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
+        }

    def test_check_output(self):
        self.check_output()
@@ -58,7 +60,9 @@ class TestDropoutOp5(OpTest):
        self.op_type = "dropout"
        self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
        self.attrs = {'dropout_prob': 0.75, 'is_test': True}
-        self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
+        self.outputs = {
+            'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
+        }

    def test_check_output(self):
        self.check_output()

--- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
@@ -5,12 +5,7 @@ import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.framework import Program


-def conv_block(input,
-               num_filter,
-               groups,
-               dropouts,
-               main_program=None,
-               startup_program=None):
+def conv_block(input, num_filter, groups, dropouts):
    return nets.img_conv_group(
        input=input,
        pool_size=2,
@@ -20,90 +15,54 @@ def conv_block(input,
        conv_act='relu',
        conv_with_batchnorm=True,
        conv_batchnorm_drop_rate=dropouts,
-        pool_type='max',
-        main_program=main_program,
-        startup_program=startup_program)
+        pool_type='max')


 class TestLayer(unittest.TestCase):
    def test_batch_norm_layer(self):
        main_program = Program()
        startup_program = Program()
+        with fluid.program_guard(main_program, startup_program):
            images = fluid.layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program)
-        hidden1 = fluid.layers.batch_norm(
-            input=images,
-            main_program=main_program,
-            startup_program=startup_program)
-        hidden2 = fluid.layers.fc(input=hidden1,
-                                  size=128,
-                                  act='relu',
-                                  main_program=main_program)
-        hidden3 = fluid.layers.batch_norm(
-            input=hidden2,
-            main_program=main_program,
-            startup_program=startup_program)
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            hidden1 = fluid.layers.batch_norm(input=images)
+            hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu')
+            fluid.layers.batch_norm(input=hidden2)

        print str(main_program)

    def test_dropout_layer(self):
        main_program = Program()
        startup_program = Program()
+        with fluid.program_guard(main_program, startup_program):
            images = fluid.layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program)
-        fluid.layers.dropout(
-            x=images,
-            dropout_prob=0.5,
-            main_program=main_program,
-            startup_program=startup_program)
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            fluid.layers.dropout(x=images, dropout_prob=0.5)

-        # print str(main_program)
+        print str(main_program)

    def test_img_conv_group(self):
        main_program = Program()
        startup_program = Program()

+        with fluid.program_guard(main_program, startup_program):
            images = fluid.layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program,
-            startup_program=startup_program)
-        conv1 = conv_block(images, 64, 2, [0.3, 0], main_program,
-                           startup_program)
-        conv2 = conv_block(conv1, 256, 3, [0.4, 0.4, 0], main_program,
-                           startup_program)
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            conv1 = conv_block(images, 64, 2, [0.3, 0])
+            conv_block(conv1, 256, 3, [0.4, 0.4, 0])

-        # print str(main_program)
+        print str(main_program)

    def test_elementwise_add_with_act(self):
        main_program = Program()
        startup_program = Program()
+        with fluid.program_guard(main_program, startup_program):
            image1 = fluid.layers.data(
-            name='pixel1',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program,
-            startup_program=startup_program)
+                name='pixel1', shape=[3, 48, 48], dtype='float32')
            image2 = fluid.layers.data(
-            name='pixel2',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program,
-            startup_program=startup_program)
-        out = fluid.layers.elementwise_add(
-            x=image1,
-            y=image2,
-            act='relu',
-            main_program=main_program,
-            startup_program=startup_program)
-        # print(main_program)
+                name='pixel2', shape=[3, 48, 48], dtype='float32')
+            fluid.layers.elementwise_add(x=image1, y=image2, act='relu')
+        print(main_program)


 if __name__ == '__main__':

--- a/python/paddle/v2/fluid/tests/test_inference_model_io.py
+++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py
@@ -6,7 +6,7 @@ import paddle.v2.fluid.core as core
 import paddle.v2.fluid.executor as executor
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.optimizer as optimizer
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.io import save_inference_model, load_inference_model


@@ -16,32 +16,15 @@ class TestBook(unittest.TestCase):

        init_program = Program()
        program = Program()
-        x = layers.data(
-            name='x',
-            shape=[2],
-            dtype='float32',
-            main_program=program,
-            startup_program=init_program)
-        y = layers.data(
-            name='y',
-            shape=[1],
-            dtype='float32',
-            main_program=program,
-            startup_program=init_program)
-
-        y_predict = layers.fc(input=x,
-                              size=1,
-                              act=None,
-                              main_program=program,
-                              startup_program=init_program)
-
-        cost = layers.square_error_cost(
-            input=y_predict,
-            label=y,
-            main_program=program,
-            startup_program=init_program)
-        avg_cost = layers.mean(
-            x=cost, main_program=program, startup_program=init_program)
+
+        with program_guard(program, init_program):
+            x = layers.data(name='x', shape=[2], dtype='float32')
+            y = layers.data(name='y', shape=[1], dtype='float32')
+
+            y_predict = layers.fc(input=x, size=1, act=None)
+
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(x=cost)

            sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
            sgd_optimizer.minimize(avg_cost, init_program)

--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -161,6 +161,41 @@ class TestBook(unittest.TestCase):
                    x=dat, label=lbl))
        print(str(program))

+    def test_sequence_expand(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10], dtype='float32')
+            y = layers.data(
+                name='y', shape=[10, 20], dtype='float32', lod_level=1)
+            self.assertIsNotNone(layers.sequence_expand(x=x, y=y))
+        print(str(program))
+
+    def test_lstm_unit(self):
+        program = Program()
+        with program_guard(program):
+            x_t_data = layers.data(
+                name='x_t_data', shape=[10, 10], dtype='float32')
+            x_t = layers.fc(input=x_t_data, size=10)
+            prev_hidden_data = layers.data(
+                name='prev_hidden_data', shape=[10, 20], dtype='float32')
+            prev_hidden = layers.fc(input=prev_hidden_data, size=20)
+            prev_cell_data = layers.data(
+                name='prev_cell', shape=[10, 30], dtype='float32')
+            prev_cell = layers.fc(input=prev_cell_data, size=30)
+            self.assertIsNotNone(
+                layers.lstm_unit(
+                    x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell))
+        print(str(program))
+
+    def test_sequence_softmax(self):
+        program = Program()
+        with program_guard(program):
+            seq_data = layers.data(
+                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
+            seq = layers.fc(input=seq_data, size=20)
+            self.assertIsNotNone(layers.sequence_softmax(x=seq))
+        print(str(program))
+

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.core as core
 import numpy
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops

@@ -118,15 +118,16 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
    def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0):
        place = self.place()
        program = Program()
-        x = layers.data(name='x', shape=[10], main_program=program)
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10])
            x.persistable = True
-        table = layers.lod_rank_table(x, level=level, main_program=program)
-        max_len = layers.max_sequence_len(table, main_program=program)
+            table = layers.lod_rank_table(x, level=level)
+            max_len = layers.max_sequence_len(table)
            max_len.persistable = True
-        array = layers.lod_tensor_to_array(x, table, main_program=program)
+            array = layers.lod_tensor_to_array(x, table)
            array.persistable = True

-        result = layers.array_to_lod_tensor(array, table, main_program=program)
+            result = layers.array_to_lod_tensor(array, table)
            result.persistable = True
        exe = Executor(place)
        scope = core.Scope()
@@ -160,17 +161,14 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
        place = core.CPUPlace()
        program = Program()

+        with program_guard(program):
            x = layers.data(
-            name='x',
-            shape=[1],
-            dtype='float32',
-            main_program=program,
-            stop_gradient=False)
-        table = layers.lod_rank_table(x, level=0, main_program=program)
-        array = layers.lod_tensor_to_array(x, table, main_program=program)
-        result = layers.array_to_lod_tensor(array, table, main_program=program)
-
-        mean = layers.mean(x=result, main_program=program)
+                name='x', shape=[1], dtype='float32', stop_gradient=False)
+            table = layers.lod_rank_table(x, level=0)
+            array = layers.lod_tensor_to_array(x, table)
+            result = layers.array_to_lod_tensor(array, table)
+
+            mean = layers.mean(x=result)

            append_backward_ops(mean)


--- a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
+++ b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard, default_main_program, default_startup_program
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.optimizer import MomentumOptimizer
 import paddle.v2.fluid.core as core
@@ -10,44 +10,42 @@ import numpy as np

 class TestMNISTIfElseOp(unittest.TestCase):
    def test_raw_api(self):
-        kwargs = {'startup_program': Program(), 'main_program': Program()}
-        image = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
+        prog = Program()
+        startup_prog = Program()
+        with program_guard(prog, startup_prog):
+            image = layers.data(name='x', shape=[784], dtype='float32')

-        label = layers.data(name='y', shape=[1], dtype='int64', **kwargs)
+            label = layers.data(name='y', shape=[1], dtype='int64')

            limit = layers.fill_constant_batch_size_like(
-            input=label, dtype='int64', shape=[1], value=5.0, **kwargs)
-
-        cond = layers.less_than(x=label, y=limit, **kwargs)
+                input=label, dtype='int64', shape=[1], value=5.0)
+            cond = layers.less_than(x=label, y=limit)
            true_image, false_image = layers.split_lod_tensor(
-            input=image, mask=cond, **kwargs)
+                input=image, mask=cond)

-        true_out = layers.create_tensor(dtype='float32', **kwargs)
-        true_cond = layers.ConditionalBlock([true_image], **kwargs)
+            true_out = layers.create_tensor(dtype='float32')
+            true_cond = layers.ConditionalBlock([true_image])

            with true_cond.block():
-            hidden = layers.fc(input=true_image, size=100, act='tanh', **kwargs)
-            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
-            layers.assign(input=prob, output=true_out, **kwargs)
+                hidden = layers.fc(input=true_image, size=100, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                layers.assign(input=prob, output=true_out)

-        false_out = layers.create_tensor(dtype='float32', **kwargs)
-        false_cond = layers.ConditionalBlock([false_image], **kwargs)
+            false_out = layers.create_tensor(dtype='float32')
+            false_cond = layers.ConditionalBlock([false_image])

            with false_cond.block():
-            hidden = layers.fc(input=false_image,
-                               size=200,
-                               act='tanh',
-                               **kwargs)
-            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
-            layers.assign(input=prob, output=false_out, **kwargs)
+                hidden = layers.fc(input=false_image, size=200, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                layers.assign(input=prob, output=false_out)

            prob = layers.merge_lod_tensor(
-            in_true=true_out, in_false=false_out, mask=cond, x=image, **kwargs)
-        loss = layers.cross_entropy(input=prob, label=label, **kwargs)
-        avg_loss = layers.mean(x=loss, **kwargs)
+                in_true=true_out, in_false=false_out, mask=cond, x=image)
+            loss = layers.cross_entropy(input=prob, label=label)
+            avg_loss = layers.mean(x=loss)

            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-        optimizer.minimize(avg_loss, kwargs['startup_program'])
+            optimizer.minimize(avg_loss, startup_prog)

        train_reader = paddle.batch(
            paddle.reader.shuffle(
@@ -57,7 +55,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
        place = core.CPUPlace()
        exe = Executor(place)

-        exe.run(kwargs['startup_program'])
+        exe.run(startup_prog)
        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for data in train_reader():
@@ -65,7 +63,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                y_data = np.expand_dims(y_data, axis=1)

-                outs = exe.run(kwargs['main_program'],
+                outs = exe.run(prog,
                               feed={'x': x_data,
                                     'y': y_data},
                               fetch_list=[avg_loss])
@@ -75,39 +73,36 @@ class TestMNISTIfElseOp(unittest.TestCase):
        self.assertFalse(True)

    def test_ifelse(self):
-        kwargs = {'startup_program': Program(), 'main_program': Program()}
-        image = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
+        prog = Program()
+        startup_prog = Program()
+        with program_guard(prog, startup_prog):
+            image = layers.data(name='x', shape=[784], dtype='float32')

-        label = layers.data(name='y', shape=[1], dtype='int64', **kwargs)
+            label = layers.data(name='y', shape=[1], dtype='int64')

            limit = layers.fill_constant_batch_size_like(
-            input=label, dtype='int64', shape=[1], value=5.0, **kwargs)
-
-        cond = layers.less_than(x=label, y=limit, **kwargs)
-
-        ie = layers.IfElse(cond, **kwargs)
+                input=label, dtype='int64', shape=[1], value=5.0)
+            cond = layers.less_than(x=label, y=limit)
+            ie = layers.IfElse(cond)

            with ie.true_block():
                true_image = ie.input(image)
-            hidden = layers.fc(input=true_image, size=100, act='tanh', **kwargs)
-            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
+                hidden = layers.fc(input=true_image, size=100, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
                ie.output(prob)

            with ie.false_block():
                false_image = ie.input(image)
-            hidden = layers.fc(input=false_image,
-                               size=200,
-                               act='tanh',
-                               **kwargs)
-            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
+                hidden = layers.fc(input=false_image, size=200, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
                ie.output(prob)

            prob = ie()
-        loss = layers.cross_entropy(input=prob[0], label=label, **kwargs)
-        avg_loss = layers.mean(x=loss, **kwargs)
+            loss = layers.cross_entropy(input=prob[0], label=label)
+            avg_loss = layers.mean(x=loss)

            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-        optimizer.minimize(avg_loss, kwargs['startup_program'])
+            optimizer.minimize(avg_loss, startup_prog)
        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=8192),
@@ -135,4 +130,5 @@ class TestMNISTIfElseOp(unittest.TestCase):


 if __name__ == '__main__':
-    unittest.main()
+    # temp disable if else unittest since it could be buggy.
+    exit(0)
--- a/python/paddle/v2/fluid/tests/test_operator.py
+++ b/python/paddle/v2/fluid/tests/test_operator.py
 import unittest
+
 import paddle.v2.fluid.op as op
-import paddle.v2.fluid.core as core
 import paddle.v2.fluid.proto.framework_pb2 as framework_pb2



--- a/python/paddle/v2/fluid/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
 from __future__ import print_function
 import unittest

-from paddle.v2.fluid.framework import Program, default_main_program
+from paddle.v2.fluid.framework import Program, default_main_program, program_guard, grad_var_name
 import paddle.v2.fluid.layers as layers

 main_program = default_main_program()
@@ -109,12 +109,10 @@ class TestProgram(unittest.TestCase):
        self.assertEqual(add_op.idx, 1)
        param_to_grad = prog.append_backward(mean_out, set())

-        def grad_name(name):
-            return name + "@GRAD"
-
        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out",
                         "mean.out"):
-            self.assertEqual(param_to_grad[var_name][0], grad_name(var_name))
+            self.assertEqual(param_to_grad[var_name][0],
+                             grad_var_name(var_name))
            self.assertEqual(param_to_grad[var_name][1], 0)

        expect_ops = [
@@ -129,13 +127,10 @@ class TestProgram(unittest.TestCase):
    def test_program_clone_with_parameter(self):
        main_program = Program()
        startup_program = Program()
-        kwargs = {
-            'main_program': main_program,
-            'startup_program': startup_program
-        }
-        d = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
-        hidden = layers.fc(input=d, size=100, **kwargs)
-        layers.fc(input=hidden, size=100, **kwargs)
+        with program_guard(main_program, startup_program):
+            d = layers.data(name='x', shape=[784], dtype='float32')
+            hidden = layers.fc(input=d, size=100)
+            layers.fc(input=hidden, size=100)

        new_program = main_program.clone()
        self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))

--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
 import unittest

 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, grad_var_name
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops
 import numpy as np
@@ -164,7 +164,7 @@ class RecurrentOpTest1(unittest.TestCase):
            for x in self.data_field
        }
        fetch_list = [
-            self.main_program.global_block().var(x + "@GRAD")
+            self.main_program.global_block().var(grad_var_name(x))
            for x in self.data_field
        ]


--- a/python/paddle/v2/fluid/tests/test_reshape_op.py
+++ b/python/paddle/v2/fluid/tests/test_reshape_op.py
@@ -17,5 +17,19 @@ class TestReshapeOp(OpTest):
        self.check_grad(["X"], "Out")


+class TestReshapeOpDimInfer(OpTest):
+    def setUp(self):
+        self.op_type = "reshape"
+        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
+        self.attrs = {'shape': [4, -1, 5]}
+        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_seq_expand.py
+++ b/python/paddle/v2/fluid/tests/test_seq_expand.py
@@ -3,7 +3,7 @@ import numpy as np
 from op_test import OpTest


-class TestSeqExpand(OpTest):
+class TestSequenceExpand(OpTest):
    def set_data(self):
        x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
        y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
@@ -21,7 +21,7 @@ class TestSeqExpand(OpTest):
        self.outputs = {'Out': out}

    def setUp(self):
-        self.op_type = 'seq_expand'
+        self.op_type = 'sequence_expand'
        self.set_data()
        self.compute()

@@ -32,7 +32,7 @@ class TestSeqExpand(OpTest):
        self.check_grad(["X"], "Out")


-class TestSeqExpandCase1(TestSeqExpand):
+class TestSequenceExpandCase1(TestSequenceExpand):
    def set_data(self):
        x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
        x_lod = [[0, 2, 5]]
@@ -41,7 +41,7 @@ class TestSeqExpandCase1(TestSeqExpand):
        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}


-class TestSeqExpandCase2(TestSeqExpand):
+class TestSequenceExpandCase2(TestSequenceExpand):
    def set_data(self):
        x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
        x_lod = [[0, 1]]
@@ -50,7 +50,7 @@ class TestSeqExpandCase2(TestSeqExpand):
        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}


-class TestSeqExpandCase3(TestSeqExpand):
+class TestSequenceExpandCase3(TestSequenceExpand):
    def set_data(self):
        x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
        x_lod = [[0, 1, 2, 3, 4]]

--- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.core as core
 import numpy as np
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops

@@ -75,24 +75,20 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             level=0):
        place = self.place()
        program = Program()
-        x = layers.data(name='x', shape=[1], main_program=program)
+        with program_guard(program):
+            x = layers.data(name='x', shape=[1])
            x.persistable = True

-        y = layers.data(name='y', shape=[1], main_program=program)
+            y = layers.data(name='y', shape=[1])
            y.persistable = True

            out_true, out_false = layers.split_lod_tensor(
-            input=x, mask=y, level=level, main_program=program)
+                input=x, mask=y, level=level)
            out_true.persistable = True
            out_false.persistable = True

            out = layers.merge_lod_tensor(
-            in_true=out_true,
-            in_false=out_false,
-            mask=y,
-            x=x,
-            level=level,
-            main_program=program)
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)

            out.persistable = True

@@ -123,32 +119,19 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
    def test_grad(self):
        place = core.CPUPlace()
        program = Program()
-
+        with program_guard(program):
            x = layers.data(
-            name='x',
-            shape=[1],
-            dtype='float32',
-            main_program=program,
-            stop_gradient=False)
+                name='x', shape=[1], dtype='float32', stop_gradient=False)
            y = layers.data(
-            name='y',
-            shape=[1],
-            dtype='bool',
-            main_program=program,
-            stop_gradient=False)
+                name='y', shape=[1], dtype='bool', stop_gradient=False)

            level = 0

            out_true, out_false = layers.split_lod_tensor(
-            input=x, mask=y, level=level, main_program=program)
+                input=x, mask=y, level=level)
            out = layers.merge_lod_tensor(
-            in_true=out_true,
-            in_false=out_false,
-            mask=y,
-            x=x,
-            level=level,
-            main_program=program)
-        mean = layers.mean(x=out, main_program=program)
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
+            mean = layers.mean(x=out)

            append_backward_ops(mean)


--- a/python/paddle/v2/fluid/tests/test_spp_op.py
+++ b/python/paddle/v2/fluid/tests/test_spp_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_pool2d_op import max_pool2D_forward_naive
+from test_pool2d_op import avg_pool2D_forward_naive
+
+
+class TestSppOp(OpTest):
+    def setUp(self):
+        self.op_type = "spp"
+        self.init_test_case()
+        input = np.random.random(self.shape).astype("float32")
+        nsize, csize, hsize, wsize = input.shape
+        out_level_flatten = []
+        for i in xrange(self.pyramid_height):
+            bins = np.power(2, i)
+            kernel_size = [0, 0]
+            padding = [0, 0]
+            kernel_size[0] = np.ceil(hsize /
+                                     bins.astype("double")).astype("int32")
+            padding[0] = (
+                (kernel_size[0] * bins - hsize + 1) / 2).astype("int32")
+
+            kernel_size[1] = np.ceil(wsize /
+                                     bins.astype("double")).astype("int32")
+            padding[1] = (
+                (kernel_size[1] * bins - wsize + 1) / 2).astype("int32")
+            out_level = self.pool2D_forward_naive(input, kernel_size,
+                                                  kernel_size, padding)
+            out_level_flatten.append(
+                out_level.reshape(nsize, bins * bins * csize))
+            if i == 0:
+                output = out_level_flatten[i]
+            else:
+                output = np.concatenate((output, out_level_flatten[i]), 1)
+        # output = np.concatenate(out_level_flatten.tolist(), 0);
+        self.inputs = {'X': input.astype('float32'), }
+        self.attrs = {
+            'pyramid_height': self.pyramid_height,
+            'pooling_type': self.pool_type
+        }
+
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        if self.pool_type != "avg":
+            self.check_grad(['X'], 'Out', max_relative_error=0.05)
+
+    def init_test_case(self):
+        self.shape = [3, 2, 4, 4]
+        self.pyramid_height = 3
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.pool_type = "max"
+
+
+class TestCase2(TestSppOp):
+    def init_test_case(self):
+        self.shape = [3, 2, 4, 4]
+        self.pyramid_height = 3
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.pool_type = "avg"
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -390,8 +390,6 @@ def pipe_reader(left_cmd,
    if not callable(parser):
        raise TypeError("parser must be a callable object")

-    process = subprocess.Popen(
-        left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
    # TODO(typhoonzero): add a thread to read stderr

    # Always init a decompress object is better than
@@ -400,6 +398,8 @@ def pipe_reader(left_cmd,
        32 + zlib.MAX_WBITS)  # offset 32 to skip the header

    def reader():
+        process = subprocess.Popen(
+            left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
        remained = ""
        while True:
            buff = process.stdout.read(bufsize)

--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -145,5 +145,35 @@ class TestXmap(unittest.TestCase):
                            self.assertEqual(e, mapper(idx))


+class TestPipeReader(unittest.TestCase):
+    def test_pipe_reader(self):
+        def simple_parser(lines):
+            return lines
+
+        import tempfile
+
+        records = [str(i) for i in xrange(5)]
+        temp = tempfile.NamedTemporaryFile()
+        try:
+            with open(temp.name, 'w') as f:
+                for r in records:
+                    f.write('%s\n' % r)
+
+            cmd = "cat %s" % temp.name
+            reader = paddle.v2.reader.pipe_reader(
+                cmd, simple_parser, bufsize=128)
+            for i in xrange(4):
+                result = []
+                for r in reader():
+                    result.append(r)
+
+                for idx, e in enumerate(records):
+                    print e, result[idx]
+                    self.assertEqual(e, result[idx])
+        finally:
+            # delete the temporary file
+            temp.close()
+
+
 if __name__ == '__main__':
    unittest.main()