diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 8fdc382f0c1c453a01dba884a3dad216e1c3092c..b21fc43904d9aafe9f7d019dfbe5b1c0d3f9e2d6 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -1,17 +1,12 @@
# Find the CBlas and lapack libraries
#
-# It will search MKL, atlas, OpenBlas, reference-cblas in order.
+# It will search MKLML, atlas, OpenBlas, reference-cblas in order.
#
# If any cblas implementation is found, the following variables will be set.
-# CBLAS_PROVIDER # one of MKL, ATLAS, OPENBLAS, REFERENCE
+# CBLAS_PROVIDER # one of MKLML, ATLAS, OPENBLAS, REFERENCE
# CBLAS_INC_DIR # the include directory for cblas.
# CBLAS_LIBS # a list of libraries that should be linked by paddle.
# # Each library should be a full path to an object file.
-#
-# User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
-# during cmake. If none of them set, it will try to find cblas implementation in
-# system paths.
-#
set(CBLAS_FOUND OFF)
@@ -30,44 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
return()
endif()
-## Then find MKL.
-set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
-set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
-
-set(MKL_INCLUDE_SEARCH_PATHS
- ${MKL_ROOT}/include
- ${INTEL_MKL_ROOT}/include)
-set(MKL_LIB_SEARCH_PATHS
- ${MKL_ROOT}/lib
- ${MKL_ROOT}/lib/intel64
- ${INTEL_MKL_ROOT}/lib
- ${INTEL_MKL_ROOT}/lib/intel64)
-
-find_path(MKL_INC_DIR mkl.h PATHS
- ${MKL_INCLUDE_SEARCH_PATHS})
-find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
- ${MKL_INCLUDE_SEARCH_PATHS})
-find_library(MKL_CORE_LIB NAMES mkl_core PATHS
- ${MKL_LIB_SEARCH_PATHS})
-find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
- ${MKL_LIB_SEARCH_PATHS})
-find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
- ${MKL_LIB_SEARCH_PATHS})
-
-if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
- set(CBLAS_FOUND ON)
- set(CBLAS_PROVIDER MKL)
- set(CBLAS_INC_DIR ${MKL_INC_DIR} ${MKL_LAPACK_INC_DIR})
- set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB})
-
- add_definitions(-DPADDLE_USE_MKL)
- add_definitions(-DLAPACK_FOUND)
-
- message(STATUS "Found MKL (include: ${MKL_INC_DIR}, library: ${CBLAS_LIBRARIES})")
- message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
- return()
-endif()
-
## Then find atlas.
set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
set(ATLAS_INCLUDE_SEARCH_PATHS
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 9686df00219001769d074ee815d9cc8db0258496..5a06825beb73e85d8a55b7b578b187bee2c4340c 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -46,16 +46,20 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
ENDIF()
+SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
+SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
ExternalProject_Add(
${MKLDNN_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
- GIT_TAG "v0.10"
+ GIT_TAG "v0.11"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT}
+ CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
+ CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-DMKLROOT:PATH=${MKLDNN_MKLROOT}
)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 74f3279831357c21038df133df0f5a432a6dfd20..20dbc32a738d982df2d3f035206279c82c8de264 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -27,8 +27,8 @@ ENDIF()
INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
-SET(MKLML_VER "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
+SET(MKLML_VER "mklml_lnx_2018.0.1.20171007")
+SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 3f86e456cfbe55fe47e5b18e755e34829ebe9930..05d83ad58ef8485d36829e7aeede79f625cfdc43 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -86,7 +86,7 @@ IF(NOT ${CBLAS_FOUND})
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
)
-
+ SET(CBLAS_PROVIDER OPENBLAS)
IF(WITH_C_API)
INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
# Because libopenblas.a is a symbolic link of another library, thus need to
@@ -115,7 +115,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-IF(${CBLAS_PROVIDER} MATCHES MKL)
+IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
ADD_LIBRARY(cblas SHARED ${dummyfile})
ELSE()
ADD_LIBRARY(cblas STATIC ${dummyfile})
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c311783aa3187678c31c27ddbbd074790ca444f3..b9c1dde97bc444d793d67ff622fd6b13c6435a9a 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -93,7 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
if(NOT APPLE AND NOT ANDROID)
find_package(Threads REQUIRED)
link_libraries(${CMAKE_THREAD_LIBS_INIT})
- set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt")
+ set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
endif(NOT APPLE AND NOT ANDROID)
function(merge_static_libs TARGET_NAME)
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index d4e9d53e5c0955912a594fe8cd9cd41a4080a2d2..203506d7ab84e5a5be2232b077eac2d433a99766 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -82,6 +82,11 @@ maxout
.. autoclass:: paddle.v2.layer.maxout
:noindex:
+roi_pool
+--------
+.. autoclass:: paddle.v2.layer.roi_pool
+ :noindex:
+
Norm Layer
==========
diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst
index fef87c4fbdb452771ecdb361c6eeae5b32bcee14..b56c7332cc284649c7e04328e51a7faa78593a39 100644
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@@ -2,112 +2,9 @@
Data Reader Interface and DataSets
==================================
+.. toctree::
+ :maxdepth: 1
-DataTypes
-=========
-
-.. automodule:: paddle.v2.data_type
- :members:
- :noindex:
-
-DataFeeder
-==========
-
-.. automodule:: paddle.v2.data_feeder
- :members:
- :noindex:
-
-Reader
-======
-
-.. automodule:: paddle.v2.reader
- :members:
- :noindex:
-
-.. automodule:: paddle.v2.reader.creator
- :members:
- :noindex:
-
-minibatch
-=========
-
-.. automodule:: paddle.v2.minibatch
- :members:
- :noindex:
-
-Dataset
-=======
-
-.. automodule:: paddle.v2.dataset
- :members:
- :noindex:
-
-mnist
-+++++
-
-.. automodule:: paddle.v2.dataset.mnist
- :members:
- :noindex:
-
-cifar
-+++++
-
-.. automodule:: paddle.v2.dataset.cifar
- :members:
- :noindex:
-
-conll05
-+++++++
-
-.. automodule:: paddle.v2.dataset.conll05
- :members: get_dict,get_embedding,test
- :noindex:
-
-imdb
-++++
-
-.. automodule:: paddle.v2.dataset.imdb
- :members:
- :noindex:
-
-imikolov
-++++++++
-
-.. automodule:: paddle.v2.dataset.imikolov
- :members:
- :noindex:
-
-movielens
-+++++++++
-
-.. automodule:: paddle.v2.dataset.movielens
- :members:
- :noindex:
-
-.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
- :noindex:
-
-.. autoclass:: paddle.v2.dataset.movielens.UserInfo
- :noindex:
-
-sentiment
-+++++++++
-
-.. automodule:: paddle.v2.dataset.sentiment
- :members:
- :noindex:
-
-uci_housing
-+++++++++++
-
-.. automodule:: paddle.v2.dataset.uci_housing
- :members:
- :noindex:
-
-wmt14
-+++++
-
-.. automodule:: paddle.v2.dataset.wmt14
- :members:
- :noindex:
-
+ data/data_reader.rst
+ data/image.rst
+ data/dataset.rst
diff --git a/doc/api/v2/data/data_reader.rst b/doc/api/v2/data/data_reader.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2ccfec9c284877a7576e9751526b169a4ac78d8e
--- /dev/null
+++ b/doc/api/v2/data/data_reader.rst
@@ -0,0 +1,36 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+.. automodule:: paddle.v2.data_type
+ :members:
+ :noindex:
+
+DataFeeder
+==========
+
+.. automodule:: paddle.v2.data_feeder
+ :members:
+ :noindex:
+
+Reader
+======
+
+.. automodule:: paddle.v2.reader
+ :members:
+ :noindex:
+
+.. automodule:: paddle.v2.reader.creator
+ :members:
+ :noindex:
+
+minibatch
+=========
+
+.. automodule:: paddle.v2.minibatch
+ :members:
+ :noindex:
diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6a8ecc5bb1d855e0ded3719943ab3adb810de365
--- /dev/null
+++ b/doc/api/v2/data/dataset.rst
@@ -0,0 +1,75 @@
+Dataset
+=======
+
+.. automodule:: paddle.v2.dataset
+ :members:
+ :noindex:
+
+mnist
++++++
+
+.. automodule:: paddle.v2.dataset.mnist
+ :members:
+ :noindex:
+
+cifar
++++++
+
+.. automodule:: paddle.v2.dataset.cifar
+ :members:
+ :noindex:
+
+conll05
++++++++
+
+.. automodule:: paddle.v2.dataset.conll05
+ :members: get_dict,get_embedding,test
+ :noindex:
+
+imdb
+++++
+
+.. automodule:: paddle.v2.dataset.imdb
+ :members:
+ :noindex:
+
+imikolov
+++++++++
+
+.. automodule:: paddle.v2.dataset.imikolov
+ :members:
+ :noindex:
+
+movielens
++++++++++
+
+.. automodule:: paddle.v2.dataset.movielens
+ :members:
+ :noindex:
+
+.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
+ :noindex:
+
+.. autoclass:: paddle.v2.dataset.movielens.UserInfo
+ :noindex:
+
+sentiment
++++++++++
+
+.. automodule:: paddle.v2.dataset.sentiment
+ :members:
+ :noindex:
+
+uci_housing
++++++++++++
+
+.. automodule:: paddle.v2.dataset.uci_housing
+ :members:
+ :noindex:
+
+wmt14
++++++
+
+.. automodule:: paddle.v2.dataset.wmt14
+ :members:
+ :noindex:
diff --git a/doc/api/v2/data/image.rst b/doc/api/v2/data/image.rst
new file mode 100644
index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49
--- /dev/null
+++ b/doc/api/v2/data/image.rst
@@ -0,0 +1,5 @@
+Image Interface
+===============
+
+.. automodule:: paddle.v2.image
+ :members:
diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD
index fe8da907d9d45a2164031430ac5b7a3d5523967a..16236763a73770f3fe5eadf67645765d0456f875 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -15,6 +15,7 @@
- [CMake](#cmake)
- [Layers](#layers)
- [Activations](#activations)
+ - [Weights](#weights)
- [Unit Tests](#unit-tests)
- [Protobuf Messages](#protobuf-messages)
- [Python API](#python-api)
@@ -45,17 +46,23 @@ Figure 1. PaddlePaddle on IA.
### Layers
All MKL-DNN related C++ layers will be placed, following PaddlePaddle's directory structure, in
-`paddle/gserver/layers`, and every file name will start with *Mkldnn*.
+`paddle/gserver/layers`, and every file name will start with *MKLDNN*.
-All MKL-DNN layers will inherit from a parent class called `MkldnnLayer`, which itself inherits from PaddlePaddle's base class `Layer`.
+All MKL-DNN layers will inherit from a parent class called `MKLDNNLayer`, which itself inherits from PaddlePaddle's base class `Layer`.
+
+`MKLDNNLayer` provides the necessary interfaces and functions and implements the basic logic of `forward` and `backward`. Some functions are defined as pure virtual functions, so a subclass only needs to implement them.
### Activations
-Since activation functions are independent of the layer concept in PaddlePaddle, a `MkldnnActivation.h` file will be added under the `paddle/gserver/activations` directory to define some interfaces for MKL-DNN, while the implementations remain in `ActivationFunction.cpp`.
+Since activation functions are independent of the layer concept in PaddlePaddle, `MKLDNNActivation.h` and `MKLDNNActivation.cpp` will be added under the `paddle/gserver/activations` directory to define and use the MKL-DNN interfaces.
-### Unit Tests
-`test_Mkldnn.cpp` and `MkldnnTester.*` will be added under the `paddle/gserver/test` directory for testing MKL-DNN.
+### Weights
+Since some layers contain parameters, we will try to let the MKL-DNN parameters share memory with the `parameter` in PaddlePaddle.
+Meanwhile, since the parameter layout that MKL-DNN uses during training may differ from PaddlePaddle's default `nchw`, we will convert this layout at the beginning and the end of training, so that the finally saved parameter format stays consistent with PaddlePaddle.
-For the activation tests, we plan to add new test types directly to PaddlePaddle's existing test files.
+### Unit Tests
+`test_MKLDNN.cpp` and `MKLDNNTester.*` will be added under the `paddle/gserver/test` directory for testing MKL-DNN.
+The tests consist of unit tests for each layer (or activation) and overall tests on simple networks.
+Each test compares the result computed on CPU by PaddlePaddle with the MKL-DNN result; a test passes when the difference is below a fairly small threshold.
### Protobuf Messages
Necessary options may be added to `proto/ModelConfig.proto` according to the needs of specific layers.
@@ -82,7 +89,7 @@ if use_mkldnn
An `mkldnn` folder will be added under the `v1_api_demo` directory, containing some demo scripts for testing MKL-DNN.
### Benchmarking
-We will consider adding some logic to `benchmark/paddle/image/run.sh` to add tests that use MKL-DNN.
+`benchmark/paddle/image/run_mkldnn.sh` will be added to benchmark the performance after enabling MKL-DNN.
### Others
1. When MKL-DNN is used, CPU buffers will be aligned to 64 bytes.
@@ -94,14 +101,16 @@ if use_mkldnn
We have summarized some points that need special attention:
-1. Use **deviceId_**. To add as few variables or functions as possible to the parent class Layer, we decided to reuse the existing `deviceId_` variable to distinguish layer attributes, defining `-2` as the device ID specific to `MkldnnLayer`.
+1. Use **deviceId_**. To add as few variables or functions as possible to the parent class Layer, we decided to reuse the existing `deviceId_` variable to distinguish layer attributes, defining `-2` as the device ID specific to `MKLDNNLayer`.
2. Override the **init** function of the parent class Layer and set `deviceId_` to `-2`, indicating that this layer runs in the MKL-DNN environment.
-3. Create `MkldnnMatrix` to manage the memory functions, interfaces, and format information that MKL-DNN will use.
-4. Create `MkldnnBase` to define classes and functions other than those related to layers and memory, including `MkldnnStream` and `CpuEngine` that MKL-DNN will use, and possibly `FPGAEngine` etc. in the future.
-5. Add two `MkldnnMatrixPtr`s to **Argument**, named `mkldnnValue` and `mkldnnGrad`, to hold the memory buffers that `MkldnnLayer` will use, and add a cvt function (to be renamed to something more suitable) that handles converting memory between the "CPU device" and the "MKL-DNN device".
-6. Add some logic to the `getOutput` function of the parent class `Layer` to check `deviceId` and, when the devices are inconsistent between MKL-DNN and CPU, do an early conversion, i.e. call the cvt function of `Argument` to unify the output onto the required device.
-7. Add a `use_mkldnn` flag to the existing `FLAGS` to choose whether to use the MKL-DNN related features.
-8. On saving MKLDNN parameters: since the MKLDNN parameter format may differ from PaddlePaddle's original one, the format information needs to be saved along with the parameters. We plan to extend the `int32_t version` field in [Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247). This value has always been saved as 0 in both v1 and v2, so we can make full use of it and define an enum covering all MKLDNN parameter formats, so that `MKLDNNLayer` can obtain the required format information from the input parameters.
+3. Create `MKLDNNMatrix`, inheriting from both `CpuMatrix` and `mkldnn::memory`, to manage the memory functions, interfaces, and format information that MKL-DNN will use.
+4. Create `MKLDNNBase` to define classes and functions other than those related to layers and memory, including `MKLDNNStream` and `CPUEngine` that MKL-DNN will use, and possibly `FPGAEngine` etc. in the future.
+5. Every `MKLDNNLayer` has `inVal_`, `inGrad_`, `outVal_`, and `outGrad_`, which stand for input value, input gradient, output value, and output gradient respectively. They hold the internal memory used by MKL-DNN. In addition, `MKLDNNMatrix` members prefixed with *ext* (for external memory) are defined, used to convert memory when the format does not match PaddlePaddle's default `nchw`. The necessary conversion functions are defined in advance in `MKLDNNLayer`, and each subclass only needs to call the predefined reset-buffer functions.
+6. The reset-buffer functions of every `MKLDNNLayer` (including those that reset the input and output values and gradients) reset the internal and external memory according to their input arguments; of course the two can be identical, which means no conversion is needed. The guiding principle is that every `MKLDNNLayer` subclass only needs to use the internal memory, while all external conversion work is prepared in advance in the reset functions of the parent class.
+7. In general, the external memory shares memory with the `value` and `grad` in PaddlePaddle whenever possible. Moreover, the external output value and gradient of every `MKLDNNLayer` (i.e. `extOutVal_` and `extOutGrad_`) must share memory with `output_.value` and `output_.grad` respectively, because PaddlePaddle's activations use `output_.value` and `output_.grad` directly. If no external buffer is needed for conversion, the internal buffers share memory with them as well.
+8. If an MKL-DNN layer is followed by a CPU device, `output_.value` is made to share memory with `extOutVal_` and the data format is `nchw`, so that the next CPU device gets the correct data. Whenever a CPU device is involved, the format of the external memory is always `nchw` or `nc`.
+9. Since MKL-DNN output operations overwrite the data instead of accumulating onto the existing data, when the network contains branches the gradients of different layers need to be merged during `backward`. `MKLDNNLayer` implements the merge method: the input gradient of each branch is first stored temporarily in an `MKLDNNMatrix`, and the layer at the branch point sums them up and puts the result into its own `output_.grad`. So overall, no subclass needs to care about branching; this is also implemented in the parent class.
+10. Add a `use_mkldnn` flag to the existing `FLAGS` to choose whether to use the MKL-DNN related features.
## References
diff --git a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455
Binary files /dev/null and b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md
new file mode 100644
index 0000000000000000000000000000000000000000..9007aae7a8355ed06c6720a921351f81b859c1fe
--- /dev/null
+++ b/doc/design/ops/sequence_decoder.md
@@ -0,0 +1,245 @@
+# Design: Sequence Decoder Generating LoDTensors
+In tasks such as machine translation and image to text,
+a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences.
+
+This documentation describes how to implement the sequence decoder as an operator.
+
+## Beam Search based Decoder
+The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences;
+it is a heuristic search algorithm that explores the paths by expanding the most promising nodes in a limited set.
+
+In the old version of PaddlePaddle, a C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search.
+Due to its complexity, the implementation relies on a lot of special data structures
+and is quite intricate and hard for users to customize.
+
+There are a lot of heuristic tricks in sequence generation tasks,
+so the flexibility of the sequence decoder is very important to users.
+
+During PaddlePaddle's refactoring work,
+some new concepts were proposed, such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md), that can better support sequence usage,
+and they can help make the implementation of the beam search based sequence decoder **more transparent and modular**.
+
+For example, the RNN states, candidate IDs and probabilities of beam search can be represented as `LoDTensors`;
+the selected candidate IDs in each time step can be stored in a `TensorArray` and `Pack`ed into the translated sentences.
+
+## Changing LoD's absolute offsets to relative offsets
+The current `LoDTensor` is designed to store levels of variable-length sequences;
+it stores several arrays of integers, each of which represents a level.
+
+The integers in each level represent the begin and end (exclusive) offsets of a sequence **in the underlying tensor**;
+let's call this format the **absolute-offset LoD** for clarity.
+
+The absolute-offset LoD can retrieve any sequence quickly but fails to represent empty sequences; for example, a two-level LoD is as follows
+```python
+[[0, 3, 9]
+ [0, 2, 3, 3, 3, 9]]
+```
+The first level tells that there are two sequences:
+- the first's offset is `[0, 3)`
+- the second's offset is `[3, 9)`
+
+while on the second level, there are several empty sequences that both begin and end at `3`.
+It is impossible to tell how many empty second-level sequences belong to each of the first-level sequences.
+
+There are many scenarios that rely on empty sequence representation,
+such as machine translation or image to text, where one instance has no translation or the candidate set of a prefix is empty.
+
+So let's introduce another format of LoD;
+it stores **the offsets of the lower-level sequences** and is called the **relative-offset** LoD.
+
+For example, to represent the same sequences of the above data
+
+```python
+[[0, 2, 5]
+ [0, 2, 3, 3, 3, 9]]
+```
+
+the first level represents that there are two sequences;
+their offsets into the second-level LoD are `[0, 2)` and `[2, 5)`.
+
+The second level is the same as in the absolute-offset example, because the lowest level always indexes the underlying tensor directly.
+It is easy to see that the second sequence in the first-level LoD has two empty sequences.
+
+The following demos are based on relative-offset LoD.
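+
+As a sanity check, the two formats can be related by a few lines of plain Python (a sketch; the helper name is hypothetical): a relative-offset LoD maps back to absolute offsets by indexing each upper level into the level below it.
+
+```python
+def relative_to_absolute(lod):
+    # the lowest level already indexes the underlying tensor
+    result = [lod[-1]]
+    for level in reversed(lod[:-1]):
+        lower = result[0]
+        # each upper-level offset is an index into the level below
+        result.insert(0, [lower[i] for i in level])
+    return result
+
+# the relative-offset example above maps back to the absolute-offset one
+assert relative_to_absolute([[0, 2, 5], [0, 2, 3, 3, 3, 9]]) == [
+    [0, 3, 9], [0, 2, 3, 3, 3, 9]]
+```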
+
+## Usage in a simple machine translation model
+Let's start with a simple machine translation model, simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation), to draw a blueprint of what a sequence decoder can do and how to use it.
+
+The model has an encoder that learns the semantic vector from a sequence,
+and a decoder which uses the sequence decoder to generate new sentences.
+
+**Encoder**
+```python
+import paddle as pd
+
+dict_size = 8000
+source_dict_size = dict_size
+target_dict_size = dict_size
+word_vector_dim = 128
+encoder_dim = 128
+decoder_dim = 128
+beam_size = 5
+max_length = 120
+
+# encoder
+src_word_id = pd.data(
+    name='source_language_word',
+    type=pd.data.integer_value_sequence(source_dict_size))
+src_embedding = pd.embedding(size=source_dict_size, dim=word_vector_dim)
+
+src_word_vec = pd.lookup(src_embedding, src_word_id)
+
+encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
+
+encoder_ctx = pd.last_seq(encoder_out_seq)
+# encoder_ctx_proj is the learned semantic vector
+encoder_ctx_proj = pd.fc(
+    encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
+```
+
+**Decoder**
+
+```python
+def generate():
+    decoder = pd.while_loop()
+    with decoder.step():
+        decoder_mem = decoder.memory(init=encoder_ctx)  # mark the memory
+        generated_ids = decoder.memory()  # TODO init to batch_size <s>s
+        generated_scores = decoder.memory()  # TODO init to batch_size 1s or 0s
+
+        target_word = pd.lookup(trg_embedding, generated_ids)
+        # expand encoder_ctx's batch to fit target_word's lod
+        # for example
+        # decoder_mem.lod is
+        # [[0 1 3],
+        #  [0 1 3 6]]
+        # its tensor content is [a1 a2 a3 a4 a5]
+        # which means there are 2 sentences to translate
+        #   - the first sentence has 1 translation prefix, the offset is [0, 1)
+        #   - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
+        # the target_word.lod is
+        # [[0, 1, 6]
+        #  [0, 2, 4, 7, 9, 12]]
+        # which means 2 sentences to translate, with 1 and 5 prefixes respectively
+        # the first prefix has 2 candidates
+        # the following ones have 2, 3, 2 and 3 candidates
+        # the encoder_ctx_expanded's content will be
+        # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
+        encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
+        decoder_input = pd.fc(
+            act=pd.activation.Linear(),
+            input=[target_word, encoder_ctx_expanded],
+            size=3 * decoder_dim)
+        gru_out, cur_mem = pd.gru_step(
+            decoder_input, mem=decoder_mem, size=decoder_dim)
+        scores = pd.fc(
+            gru_out,
+            size=target_dict_size,
+            bias=None,
+            act=pd.activation.Softmax())
+        # K is a config parameter
+        topk_scores, topk_ids = pd.top_k(scores, K)
+        topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
+
+        selected_ids, selected_generation_scores = decoder.beam_search(
+            topk_ids, topk_generated_scores)
+
+        # update the states
+        decoder_mem.update(cur_mem)  # tells how to update state
+        generated_ids.update(selected_ids)
+        generated_scores.update(selected_generation_scores)
+
+        decoder.output(selected_ids)
+        decoder.output(selected_generation_scores)
+
+translation_ids, translation_scores = decoder()
+```
+`decoder.beam_search` is an operator that, given the candidates and the scores of the translations including those candidates,
+returns the result of the beam search algorithm.
+
+In this way, users can customize anything on the inputs or outputs of beam search, for example, several ways to prune some translation prefixes:
+
+1. make the corresponding elements in `topk_generated_scores` zero or some small values, and beam_search will discard those candidates.
+2. remove some specific candidates in `selected_ids`.
+3. get the final `translation_ids` and remove unwanted translation sequences from it.
+
+The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
+so the Python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
+
+Both of them are two-level `LoDTensors`:
+
+- the first level represents `batch_size` of (source) sentences;
+- the second level represents the candidate ID sets for each translation prefix.
+
+For example, there are 3 source sentences to translate, and they have 2, 3 and 1 candidates respectively.
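+
+Written as plain lists, a hypothetical instance of such a two-level relative-offset LoD (assuming each source sentence currently owns one candidate set):
+
+```python
+# 3 source sentences, whose candidate sets hold 2, 3 and 1 candidates
+lod = [[0, 1, 2, 3],  # level 0: one candidate set per source sentence
+       [0, 2, 5, 6]]  # level 1: the sets contain 2, 3 and 1 candidates
+```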
+
+Unlike an RNN, in a sequence decoder the previous state and the current state have different LoDs and shapes;
+a `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
+
+For example, the previous state
+
+* LoD is `[0, 1, 3][0, 2, 5, 6]`
+* content of tensor is `a1 a2 b1 b2 b3 c1`
+
+the current state stored in `encoder_ctx_expanded`
+
+* LoD is `[0, 2, 6][0, 3, 5, 8, 9, 11, 11]`
+* the content is
+ - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times, once for each candidate)
+ - a2 a2
+ - b1 b1 b1
+ - b2
+ - b3 b3
+ - None (c1 has 0 candidates, so c1 is dropped)
+
+Benefiting from the relative-offset LoD, the empty candidate set can be represented naturally.
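+
+A toy sketch of what `lod_expand` does to the state contents (plain Python over plain lists; the function body is illustrative, not the actual operator):
+
+```python
+def lod_expand(states, counts):
+    # states: one entry per prefix; counts: candidates per prefix
+    out = []
+    for s, c in zip(states, counts):
+        out.extend([s] * c)  # a prefix with 0 candidates is dropped
+    return out
+
+expanded = lod_expand(["a1", "a2", "b1", "b2", "b3", "c1"],
+                      [3, 2, 3, 1, 2, 0])
+# ['a1', 'a1', 'a1', 'a2', 'a2', 'b1', 'b1', 'b1', 'b2', 'b3', 'b3']
+```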
+
+The states in each time step can be stored in a `TensorArray` and `Pack`ed into a final `LoDTensor`; the corresponding syntax is
+
+```python
+decoder.output(selected_ids)
+decoder.output(selected_generation_scores)
+```
+
+the `selected_ids` are the candidate ids for the prefixes;
+they will be `Pack`ed by `TensorArray` into a two-level `LoDTensor`,
+whose first level represents the source sequences
+and whose second level represents the generated sequences.
+
+Packing the `selected_scores` yields a `LoDTensor` that stores the score of each translation candidate.
+
+Packing the `selected_generation_scores` yields a `LoDTensor` whose tail elements are the probabilities of the translations.
+
+## LoD and shape changes during decoding
+
+![LoD and shape changes during decoding](./images/LOD-and-shape-changes-during-decoding.jpg)
+
+According to the image above, the only phase that changes the LoD is beam search.
+
+## Beam search design
+The beam search algorithm will be implemented as one method of the sequence decoder; it has 3 inputs:
+
+1. `topk_ids`, top K candidate ids for each prefix.
+2. `topk_scores`, the corresponding scores for `topk_ids`
+3. `generated_scores`, the score of the prefixes.
+
+All of them are LoDTensors, so that the sequence affiliation is clear.
+Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix, as sketched after the list below.
+
+It will return three variables:
+
+1. `selected_ids`, the final candidates that the beam search function selected for the next step.
+2. `selected_scores`, the scores for the candidates.
+3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
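+
+A toy sketch of the per-prefix pruning that `beam_search` performs (plain Python; the real operator works on LoDTensors and processes all prefixes at once):
+
+```python
+def prune_prefix(topk_ids, topk_generated_scores, beam_size):
+    # keep the beam_size highest-scoring candidates of one prefix
+    ranked = sorted(zip(topk_generated_scores, topk_ids), reverse=True)
+    kept = ranked[:beam_size]
+    selected_ids = [i for _, i in kept]
+    selected_scores = [s for s, _ in kept]
+    return selected_ids, selected_scores
+
+prune_prefix([11, 42, 7], [0.2, 0.5, 0.1], beam_size=2)
+# -> ([42, 11], [0.5, 0.2])
+```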
+
+## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
+The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors,
+and they exist in each time step,
+so it is natural to store them in arrays.
+
+Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors;
+the results of beam search are best stored in a `TensorArray`.
+
+The `Pack` and `UnPack` in `TensorArray` are used to pack the tensors in the array into a `LoDTensor` or to split a `LoDTensor` into an array of tensors.
+They need some extensions to support packing or unpacking an array of `LoDTensors`.
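+
+A minimal sketch of the LoD-based `Pack`, collapsed to a single LoD level for brevity (plain Python; names hypothetical, the real method produces the two-level layout described above):
+
+```python
+def pack(sequences):
+    # one generated id list per source sequence -> flat data + LoD
+    data, lod = [], [0]
+    for seq in sequences:
+        data.extend(seq)
+        lod.append(len(data))
+    return data, [lod]
+
+pack([[2, 7, 9], [4, 5]])
+# -> ([2, 7, 9, 4, 5], [[0, 3, 5]])
+```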
diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst
index 0e939a2671ace8682c90cdc1c1bb2da1dda0d568..b331d9d36e6a279881c3b1a5586835e7186957fb 100644
--- a/doc/faq/local/index_cn.rst
+++ b/doc/faq/local/index_cn.rst
@@ -99,7 +99,7 @@ PaddlePaddle supports sparse training; sparse training requires the training features to be :code:`spa
Using more computing resources
++++++++++++++++++
-Making use of more computing resources can be divided into one the following approaches\:
+Making use of more computing resources can be divided into the following approaches\:
* Single-machine CPU training
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index c823d7e9fcd63dd7719ac1403952b03c2d2f03c0..6cfc9536f20e88571a9845a50be0341fe4d9f78b 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -214,7 +214,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
```cpp
// if use Eigen unsupported module before include head files
- #define EIGEN_USE_GPU
+ // #define EIGEN_USE_GPU
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel);
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index 4547afaf1dc9af8bc7909a684db766fdd7b159c0..53a36f8f20d1143470928f57eda6f575d9048236 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -54,6 +54,46 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
return kPD_NO_ERROR;
}
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+ paddle_real* value) {
+ if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
+ auto ptr = cast(mat);
+ if (ptr->mat == nullptr) return kPD_NULLPTR;
+ paddle::real* buf = ptr->mat->getRowBuf(0);
+ size_t width = ptr->mat->getWidth();
+ size_t height = ptr->mat->getHeight();
+ if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+ hl_memcpy(buf, value, sizeof(paddle::real) * width * height);
+#else
+ return kPD_NOT_SUPPORTED;
+#endif
+ } else {
+ std::copy(value, value + width * height, buf);
+ }
+ return kPD_NO_ERROR;
+}
+
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+ paddle_real* result) {
+ if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
+ auto ptr = cast(mat);
+ if (ptr->mat == nullptr) return kPD_NULLPTR;
+ paddle::real* buf = ptr->mat->getRowBuf(0);
+ size_t width = ptr->mat->getWidth();
+ size_t height = ptr->mat->getHeight();
+ if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+ hl_memcpy(result, buf, width * height * sizeof(paddle::real));
+#else
+ return kPD_NOT_SUPPORTED;
+#endif
+ } else {
+ std::copy(buf, buf + width * height, result);
+ }
+ return kPD_NO_ERROR;
+}
+
paddle_error paddle_matrix_get_row(paddle_matrix mat,
uint64_t rowID,
paddle_real** rawRowBuffer) {
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
index 3e6bd5285058a297c4574631e2a5c033b83936e8..876af2aa7615c098d225b56ce2ea0b1529a6e3c6 100644
--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -27,18 +27,20 @@ int main() {
CHECK(paddle_arguments_resize(in_args, 1));
// Create input matrix.
- paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+ paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10,
/* size */ 784,
/* useGPU */ false);
srand(time(0));
- paddle_real* array;
- // Get First row.
- CHECK(paddle_matrix_get_row(mat, 0, &array));
+ paddle_real input[784 * 10];
- for (int i = 0; i < 784; ++i) {
- array[i] = rand() / ((float)RAND_MAX);
+ for (int i = 0; i < 784 * 10; ++i) {
+ input[i] = rand() / ((float)RAND_MAX);
}
+
+ // Set value for the input matrix
+ CHECK(paddle_matrix_set_value(mat, input));
CHECK(paddle_arguments_set_value(in_args, 0, mat));
@@ -51,11 +53,17 @@ int main() {
CHECK(paddle_arguments_get_value(out_args, 0, prob));
- CHECK(paddle_matrix_get_row(prob, 0, &array));
+ uint64_t height;
+ uint64_t width;
+
+ CHECK(paddle_matrix_get_shape(prob, &height, &width));
+ paddle_real* result =
+     (paddle_real*)malloc(height * width * sizeof(paddle_real));
+ CHECK(paddle_matrix_get_value(prob, result));
printf("Prob: ");
- for (int i = 0; i < 10; ++i) {
- printf("%.2f ", array[i]);
+ for (uint64_t i = 0; i < height * width; ++i) {
+ printf("%.2f ", result[i]);
}
printf("\n");
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
index f15f7f3bbbd1457617111f827d2182ae6b7d9fdb..bb5223f8a275fa2550bf8b7e94a9c4333de4c8c9 100644
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -70,6 +70,16 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
uint64_t rowID,
paddle_real* rowArray);
+/**
+ * @brief paddle_matrix_set_value Set values of the matrix.
+ * @param mat Target Matrix
+ * @param value Matrix data, stored row by row.
+ * @return paddle_error
+ * @note value should contain enough elements of data to initialize the mat
+ */
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+ paddle_real* value);
+
/**
* @brief PDMatGetRow Get raw row buffer from matrix
* @param [in] mat Target matrix
@@ -81,6 +91,15 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
uint64_t rowID,
paddle_real** rawRowBuffer);
+/**
+ * @brief copy data from the matrix
+ * @param [in] mat Target matrix
+ * @param [out] result pointer to store the matrix data
+ * @return paddle_error
+ * @note the space for the result should be allocated before invoking this API
+ */
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+ paddle_real* result);
/**
* @brief PDMatCreateNone Create None Matrix
* @return
diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/capi/tests/test_Matrix.cpp
index 4bf9a9d6a9f9161561e9e5612edd2c93cab7ac5b..6940c28448a897cecd78b718fe720441086a5a99 100644
--- a/paddle/capi/tests/test_Matrix.cpp
+++ b/paddle/capi/tests/test_Matrix.cpp
@@ -45,3 +45,49 @@ TEST(CAPIMatrix, createNone) {
paddle_matrix mat = paddle_matrix_create_none();
ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
}
+
+TEST(CAPIMatrix, cpu_get_set_value) {
+ paddle_matrix mat = paddle_matrix_create(128, 32, false);
+  std::vector<paddle_real> sample;
+  std::vector<paddle_real> result;
+ sample.resize(128 * 32);
+ result.resize(128 * 32);
+ for (size_t i = 0; i < sample.size(); ++i) {
+ sample[i] = 1.0 / (i + 1.0);
+ }
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+ for (size_t i = 0; i < sample.size(); ++i) {
+ ASSERT_NEAR(sample[i], result[i], 1e-5);
+ }
+
+ uint64_t height, width;
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+ ASSERT_EQ(128UL, height);
+ ASSERT_EQ(32UL, width);
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(CAPIMatrix, gpu_get_set_value) {
+ paddle_matrix mat = paddle_matrix_create(128, 32, true);
+  std::vector<paddle_real> sample;
+  std::vector<paddle_real> result;
+ sample.resize(128 * 32);
+ result.resize(128 * 32);
+ for (size_t i = 0; i < sample.size(); ++i) {
+ sample[i] = 1.0 / (i + 1.0);
+ }
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+ for (size_t i = 0; i < sample.size(); ++i) {
+ ASSERT_NEAR(sample[i], result[i], 1e-5);
+ }
+
+ uint64_t height, width;
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+ ASSERT_EQ(128UL, height);
+ ASSERT_EQ(32UL, width);
+ ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+#endif
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index ed94540c268e5ed990c1d92859c6a2093c052868..913cd0f81eaef37014f38c71e7c3d23bfeec1466 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -321,8 +321,6 @@ static void CreateGradVarInBlock(
auto* param = block_desc->FindVarRecursive(pname);
auto* grad = block_desc->FindVar(arg);
if (param == nullptr) {
- LOG(WARNING) << "Cannot find forward variable of " << arg
- << ". Set its gradient to FP32";
grad->SetDataType(DataType::FP32);
} else {
grad->SetDataType(param->GetDataType());
@@ -408,6 +406,11 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
for (const auto& desc : op_grads) {
for (const std::string& out_name : desc->OutputArgumentNames()) {
+ if (out_name.find("@GRAD") == std::string::npos) {
+        // Not all outputs of a backward operator are gradients. Only
+        // gradients need to be summed; skip variables that are not gradients.
+ continue;
+ }
dup_out_ops[out_name].emplace_back(grad_desc_idx);
}
++grad_desc_idx;
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 4e8d630c2634682ff63b38182108eadebb5c7ff9..d485cdf6109274377ad0057223bdd8401e964aa7 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -21,7 +21,7 @@
#include "paddle/framework/var_desc.h"
#include "paddle/operators/net_op.h"
-USE_OP(fill_constant);
+USE_NO_KERNEL_OP(fill_constant);
namespace paddle {
namespace framework {
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 9e3d597f3a2c84623a1ce9e4b6f4b956cffde211..11764810e1d40e5e6eb3cd0d8e9b4b63a79855b4 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -50,6 +50,15 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
return it->second.get();
}
+VarDescBind *BlockDescBind::FindRecursiveOrCreateVar(
+ const std::string &name_bytes) {
+ VarDescBind *res = FindVarRecursive(name_bytes);
+ if (res == nullptr) {
+ res = Var(name_bytes);
+ }
+ return res;
+}
+
bool BlockDescBind::HasVarRecursive(const std::string &name) const {
return FindVarRecursive(name) != nullptr;
}
diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
index 26adf6a20ff09483b84f479db08efcf402135053..8e967e5378eb47a7869efb59cc96a271f1cbb9a1 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -58,6 +58,8 @@ class BlockDescBind {
VarDescBind *FindVarRecursive(const std::string &name_bytes) const;
+ VarDescBind *FindRecursiveOrCreateVar(const std::string &name_bytes);
+
bool HasVarRecursive(const std::string &var_name) const;
std::set<std::string> LocalVarNames() const {
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index c5ae7b185460c8b0d68ba38bb9db9bd3d3fb14ea..3ec88d7a72c3339bf5e7d0ca3957a3f608f039b7 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -34,6 +34,21 @@ inline DataType ToDataType(std::type_index type) {
}
}
+inline std::type_index ToTypeIndex(DataType type) {
+ switch (type) {
+ case DataType::FP32:
+ return typeid(float);
+ case DataType::FP64:
+ return typeid(double);
+ case DataType::INT32:
+ return typeid(int);
+ case DataType::INT64:
+ return typeid(int64_t);
+ default:
+ PADDLE_THROW("Not support type %d", type);
+ }
+}
+
template <typename Visitor>
inline void VisitDataType(DataType type, Visitor visitor) {
switch (type) {
diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index 10c785e04c4fa2192f9c95513009cf7d8c123868..53b899a23997b71e723a298ec360a4e018d89878 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -79,6 +79,13 @@ DDim make_ddim(const std::vector<int64_t>& dims) {
return result;
}
+DDim make_ddim(const std::vector<int>& dims) {
+  std::vector<int64_t> res(dims.size());
+  std::transform(dims.begin(), dims.end(), res.begin(),
+                 [](int d) { return static_cast<int64_t>(d); });
+ return make_ddim(res);
+}
+
/// @cond HIDDEN
// XXX For some reason, putting this in an anonymous namespace causes errors
class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index aa773868ab4b68acbc46dfa2cd2569d8b8b7789d..4ca5e49566b7ec006eba80f3f9808bacb1ff2615 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -81,6 +81,8 @@ struct DDim {
*/
DDim make_ddim(const std::vector<int64_t>& dims);
+DDim make_ddim(const std::vector<int>& dims);
+
/**
* \brief Make a DDim from an initializer list
*
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index e7cba9e702ce0f96a9680169f0593130df2fd096..39c8def82e1ebb10a0e357a648af760099020c32 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -357,7 +357,8 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
"LOD_TENSOR";
for (auto &out_pair : this->outputs_) {
for (auto &out_var_name : out_pair.second) {
- block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
+ block->FindRecursiveOrCreateVar(out_var_name)
+ ->SetType(VarDesc::LOD_TENSOR);
}
}
}
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index fb2c69105627f663ddcce07d31526c9e4278e863..9428b8a07ea0af005f6e960ddaa02da624ad9d97 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -98,5 +98,23 @@ void Scope::DeleteScope(Scope* scope) {
delete scope;
}
+void Scope::Rename(const std::string& origin_name,
+ const std::string& new_name) const {
+ auto origin_it = vars_.find(origin_name);
+ PADDLE_ENFORCE(origin_it != vars_.end(),
+ "Cannot find original variable with name %s", origin_name);
+ auto new_it = vars_.find(new_name);
+ PADDLE_ENFORCE(new_it == vars_.end(),
+ "The variable with name %s is already in the scope", new_name);
+ vars_[new_name] = origin_it->second;
+ vars_.erase(origin_it);
+}
+
+std::string Scope::Rename(const std::string& origin_name) const {
+ auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+ Rename(origin_name, var_name);
+ return var_name;
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index fb660949394149ebf2c6172a0ac3f4c7594f4286..c2aafb6ad825f9bd9ffef754923a15afdeaa8e5c 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -68,11 +68,18 @@ class Scope {
// enumerate all the variables current contains.
std::vector<std::string> GetAllNames(bool recursive = false) const;
+ // Rename variable to a new name
+ void Rename(const std::string& origin_name,
+ const std::string& new_name) const;
+
+ // Rename variable to a new name and return the new name
+ std::string Rename(const std::string& origin_name) const;
+
private:
// Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const* parent) : parent_(parent) {}
-  std::unordered_map<std::string, Variable*> vars_;
+  mutable std::unordered_map<std::string, Variable*> vars_;
mutable std::list<Scope*> kids_;
Scope const* parent_{nullptr};
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 4fd72d64a90ae6f16dd1499ceb7fba6e40fe4cea..9b2779b42cad324253dadf27dbff20fd8e8c8e16 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -45,6 +45,7 @@ if(WITH_GPU)
add_simple_unittest(BlockExpandOpTest)
add_simple_unittest(CropOpTest)
add_simple_unittest(SwitchOpTest)
+ add_simple_unittest(ScaleSubRegionOpTest)
endif()
add_simple_unittest(Im2ColTest)
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index ba446bf92da264fafa1fb47a2c30da9cb13176ce..370940532ef40335be54a3e6467de0409e923ec4 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -110,6 +110,7 @@ public:
function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
function1_->init(config);
function2_->init(config);
+ initArgsCallback_ = nullptr;
}
~Compare2Function() {}
@@ -170,6 +171,10 @@ public:
*seq2_));
}
+  void registerInitCallback(std::function<void(BufferArg&, size_t)> callback) {
+ initArgsCallback_ = callback;
+ }
+
// output need only contains shape, do not contains data.
void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
size_t size =
@@ -340,6 +345,10 @@ protected:
initArg(*func1Inputs_[i]);
}
+ if (initArgsCallback_ != nullptr) {
+ initArgsCallback_(*func1Inputs_[i], i);
+ }
+
copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
}
}
@@ -386,6 +395,7 @@ protected:
std::shared_ptr<SequenceIdArg> seq1_;
std::shared_ptr<SequenceIdArg> seq2_;
test::CopyArgument copyArg_;
+  std::function<void(BufferArg&, size_t)> initArgsCallback_;
};
class CpuGpuFuncCompare
diff --git a/paddle/function/ScaleSubRegionOp.cpp b/paddle/function/ScaleSubRegionOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a080505d7df83a6c0a9d88fbcb7863fc0e1f7b21
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "paddle/function/TensorShape.h"
+
+namespace paddle {
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_CPU>(real* outputs,
+ const real* inputs,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
+
+ for (int n = 0; n < number; ++n) {
+ // indices start from 1
+ int offset = n * 6;
+ for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
+ for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
+ for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
+ int idx = ((n * channel + c) * height + h) * width + w;
+ outputs[idx] *= value;
+ }
+ }
+ }
+ }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ for (int n = 0; n < number; ++n) {
+ for (int c = 0; c < channel; ++c) {
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ int idx = ((n * channel + c) * height + h) * width + w;
+ int offset = n * 6;
+ if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+ h >= (indices[offset + 2] - 1) &&
+ h <= (indices[offset + 3] - 1) &&
+ w >= (indices[offset + 4] - 1) &&
+ w <= (indices[offset + 5] - 1)) {
+ outGrad[idx] += inGrad[idx] * value;
+ } else {
+ outGrad[idx] += inGrad[idx];
+ }
+ }
+ }
+ }
+ }
+}
+
+/**
+ * \brief For each instance, ScaleSubRegion can be used to multiply the values
+ *        in a specified continuous sub-region by a scalar. By providing the
+ *        start and end indices for C/H/W, you can specify the location and
+ *        shape of the region.
+ *
+ * Argument in this Function:
+ * \param inputs A 4-D tensor with shape [N, C, H, W], only one input.
+ * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs A 4-D tensor with same shape as inputs, output value.
+ */
+template <DeviceType Device>
+class ScaleSubRegionFunc : public FunctionBase {
+public:
+ void init(const FuncConfig& config) override { conf_ = config; }
+
+ void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+ CHECK_EQ(2UL, inputs.size());
+ CHECK_EQ(1UL, outputs.size());
+ CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+ TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegion<Device>(outputs[0].data<real>(),
+                           inputs[0].data<real>(),
+                           inputs[1].data<real>(),
+ shape,
+ conf_);
+ }
+
+private:
+ FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of ScaleSubRegion Function.
+ *
+ * Argument in this Function:
+ * \param inputs A 4-D tensor with shape [N, C, H, W], output gradient.
+ * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value.
+ */
+
+template <DeviceType Device>
+class ScaleSubRegionGradFunc : public FunctionBase {
+public:
+ void init(const FuncConfig& config) override { conf_ = config; }
+
+ void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+ CHECK_EQ(2UL, inputs.size());
+ CHECK_EQ(1UL, outputs.size());
+ CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+ TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegionGrad<Device>(inputs[0].data<real>(),
+                               outputs[0].data<real>(),
+                               inputs[1].data<real>(),
+ shape,
+ conf_);
+ }
+
+private:
+ FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
+#endif
+
+} // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOp.h b/paddle/function/ScaleSubRegionOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..0480c8577f3fbf3bc9e94b635df96a31b103e9e3
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief Function to multiply a scalar to the values in a specified continuous
+ *        sub-region. Indices must be provided to indicate the location and
+ *        shape of the region, and the multiplied value is passed by the
+ *        configure variable.
+ *
+ *
+ * \param[out] outputs Output value.
+ * \param[in] inputs Input data which contains NCHW information.
+ * \param[in] indices Indices data to indicate the sub region.
+ * \param[in] shape Tensor shape of input value.
+ * \param[in] conf Configure variable which contains the multiplied value.
+ */
+template <DeviceType Device>
+void ScaleSubRegion(real* outputs,
+ const real* inputs,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf);
+
+/**
+ * \brief Backward propagation function of ScaleSubRegion.
+ *
+ * \param[in]   inGrad   Output gradient flowing in from the next layer.
+ * \param[out]  outGrad  Gradient of the input value, propagated to the previous layer.
+ * \param[in] indices Indices data.
+ * \param[in] shape The Shape of input tensor.
+ * \param[in] conf Configure variable.
+ */
+template <DeviceType Device>
+void ScaleSubRegionGrad(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf);
+} // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpGpu.cu b/paddle/function/ScaleSubRegionOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8aae2e44c3fdc8b516e66ecfd2e04f466a17dde9
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpGpu.cu
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "hl_base.h"
+
+namespace paddle {
+
+__global__ void KeScaleSubRegion(real* outputs,
+ const real* inputs,
+ const real* indices,
+ real value,
+ int channel,
+ int height,
+ int width,
+ int nthreads) {
+ const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (idx < nthreads) {
+ const int w = idx % width;
+ const int h = (idx / width) % height;
+ const int c = (idx / width / height) % channel;
+ const int n = idx / width / height / channel;
+
+ const int offset = n * 6;
+ if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+ h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+ w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+ outputs[idx] = inputs[idx] * value;
+ } else {
+ outputs[idx] = inputs[idx];
+ }
+ }
+}
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_GPU>(real* outputs,
+ const real* inputs,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ size_t nth = number * channel * height * width;
+ int blockSize = 1024;
+ int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeScaleSubRegion<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+ outputs, inputs, indices, value, channel, height, width, nth);
+ CHECK_SYNC("ScaleSubRegion");
+}
+
+__global__ void KeScaleSubRegionDiff(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ real value,
+ int channel,
+ int height,
+ int width,
+ int nthreads) {
+ const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (idx < nthreads) {
+ const int w = idx % width;
+ const int h = (idx / width) % height;
+ const int c = (idx / width / height) % channel;
+ const int n = idx / width / height / channel;
+
+ const int offset = n * 6;
+ if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+ h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+ w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+ outGrad[idx] += inGrad[idx] * value;
+ } else {
+ outGrad[idx] += inGrad[idx];
+ }
+ }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_GPU>(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ size_t nth = number * channel * height * width;
+ int blockSize = 1024;
+ int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeScaleSubRegionDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+ inGrad, outGrad, indices, value, channel, height, width, nth);
+ CHECK_SYNC("ScaleSubRegionGrad");
+}
+
+} // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpTest.cpp b/paddle/function/ScaleSubRegionOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..43331f258dddaa43cbc8cc77519e299de7e98290
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpTest.cpp
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(ScaleSubRegion, real) {
+ for (size_t numSamples : {5, 32}) {
+ for (size_t channels : {5, 32}) {
+ for (size_t imgSizeH : {5, 33}) {
+ for (size_t imgSizeW : {5, 32}) {
+ for (real value : {-0.5, 0.0, 0.5}) {
+ for (bool firstHalf : {false, true}) {
+ VLOG(3) << " numSamples=" << numSamples
+ << " channels=" << channels << " imgSizeH=" << imgSizeH
+ << " imgSizeW=" << imgSizeW;
+
+ for (bool testGrad : {false, true}) {
+ CpuGpuFuncCompare compare(
+ testGrad ? "ScaleSubRegionGrad" : "ScaleSubRegion",
+ FuncConfig().set("value", value));
+
+ TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+ TensorShape indicesShape{numSamples, 6};
+
+ compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+ compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, indicesShape));
+
+ compare.registerInitCallback([=](BufferArg& arg, size_t index) {
+ if (index == 1) {
+ real* data = (real*)arg.data();
+
+ for (size_t i = 0; i < numSamples; ++i) {
+ size_t offset = i * 6;
+ data[offset] = firstHalf ? 1 : channels / 2;
+ data[offset + 1] = firstHalf ? channels / 2 : channels;
+ data[offset + 2] = firstHalf ? 1 : imgSizeH / 2;
+ data[offset + 3] = firstHalf ? imgSizeH / 2 : imgSizeH;
+ data[offset + 4] = firstHalf ? 1 : imgSizeW / 2;
+ data[offset + 5] = firstHalf ? imgSizeW / 2 : imgSizeW;
+ }
+ }
+ });
+
+ compare.addOutputs(
+ BufferArg(
+ VALUE_TYPE_FLOAT, shape, testGrad ? ADD_TO : ASSIGN_TO),
+ testGrad ? ADD_TO : ASSIGN_TO);
+ compare.run();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
index 8eb700723f2cf7dda969739bb5e3d48358d278a0..6ffe4fbec643e50d27924a989875454d307f5b9b 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -62,16 +62,14 @@ void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
- if (biases_) {
- LOG(FATAL) << "not implemented yet";
- }
- resetFwdBuffers(inVals_, out);
+ resetFwdBuffers(inVals_, bias, out);
in = inVals_[0];
std::shared_ptr<sum::primitive_desc> fwdPD;
- resetFwdPD(fwdPD, inVals_, out);
+  std::shared_ptr<sum::primitive_desc> biasPD;
+ resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
- resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+ resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
}
void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -79,7 +77,7 @@ void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
- resetBwdBuffers(inGrads_, out);
+ resetBwdBuffers(inGrads_, bias, out);
in = inGrads_[0];
// backward only need share output grad to input grad
@@ -89,6 +87,20 @@ void MKLDNNAddtoLayer::resetBwd(std::vector& pipeline,
inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
}
}
+
+ // backward bias
+ bwdBias_ = nullptr;
+ if (bias) {
+    std::vector<float> scales(bs_, 1.0);
+    std::vector<memory::primitive_desc> srcPDs(bs_, bias->getPrimitiveDesc());
+    auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
+    std::vector<primitive::at> srcs;
+ for (size_t i = 0; i < grads_.size(); ++i) {
+ srcs.push_back(*(grads_[i]));
+ }
+ bwdBias_.reset(new sum(biasPD, srcs, *bias));
+ pipeline.push_back(*bwdBias_);
+ }
}
void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
@@ -97,7 +109,25 @@ void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
}
}
+void MKLDNNAddtoLayer::prepareBias(MKLDNNMatrixPtr& bias,
+ const MatrixPtr& biasMat,
+ const MKLDNNMatrixPtr& out,
+ std::vector<MKLDNNMatrixPtr>& outs) {
+ auto pd = MKLDNNMatrix::createPrimitiveDesc(
+ {(int)layerSize_}, memory::format::x, engine_);
+ bias = MKLDNNMatrix::create(pd, biasMat);
+ outs.clear();
+ real* data = out->getData();
+ CHECK_EQ(bs_ * layerSize_, out->getElementCnt());
+ for (int i = 0; i < bs_; ++i) {
+ MatrixPtr tmp =
+ Matrix::create(data + i * layerSize_, 1, layerSize_, false, false);
+ outs.push_back(MKLDNNMatrix::create(bias->getPrimitiveDesc(), tmp));
+ }
+}
+
void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
@@ -110,12 +140,20 @@ void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
}
resetOutValue(out, inputs[0]->getPrimitiveDesc());
+
+ if (biases_ && biases_->getW()) {
+ prepareBias(bias, biases_->getW(), out, vals_);
+ } else {
+ bias = nullptr;
+ }
}
void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
+ std::shared_ptr<sum::primitive_desc>& biasPD,
std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr bias,
MKLDNNMatrixPtr out) {
- std::vector<double> scales(inputs.size(), 1.0);
+ std::vector<float> scales(inputs.size(), 1.0);
std::vector<memory::primitive_desc> srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
@@ -123,12 +161,23 @@ void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
CHECK(out);
pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+
+ biasPD = nullptr;
+ if (bias) {
+ std::vector<float> scales(2, 1.0);
+ std::vector<memory::primitive_desc> srcPDs(2, bias->getPrimitiveDesc());
+ biasPD.reset(
+ new sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs));
+ CHECK_PRIMITIVE_DESC_EQ(bias, biasPD->dst_primitive_desc());
+ }
}
void MKLDNNAddtoLayer::resetFwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<sum::primitive_desc>& pd,
+ std::shared_ptr<sum::primitive_desc>& biasPD,
std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
std::vector<primitive::at> srcs;
for (size_t i = 0; i < inputs.size(); i++) {
@@ -136,9 +185,23 @@ void MKLDNNAddtoLayer::resetFwdPipeline(
}
fwd_.reset(new sum(*pd, srcs, *out));
pipeline.push_back(*fwd_);
+
+ fwdBias_.clear();
+ if (biasPD == nullptr || bias == nullptr) {
+ return;
+ }
+ fwdBias_.resize(vals_.size());
+ for (size_t i = 0; i < vals_.size(); ++i) {
+ std::vector<primitive::at> srcs;
+ srcs.push_back(*(vals_[i]));
+ srcs.push_back(*bias);
+ fwdBias_[i].reset(new sum(*biasPD, srcs, *vals_[i]));
+ pipeline.push_back(*fwdBias_[i]);
+ }
}
void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
@@ -149,6 +212,12 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
}
+
+ if (biases_ && biases_->getWGrad()) {
+ prepareBias(bias, biases_->getWGrad(), out, grads_);
+ } else {
+ bias = nullptr;
+ }
}
} // namespace paddle
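For reference, the bias handling added above reduces to per-row arithmetic. The
following plain-C++ sketch (illustrative only, not part of the patch; all names
are local to it) shows what the fwdBias_ and bwdBias_ sum primitives compute,
assuming row-major storage with out of shape bs x layerSize and bias of shape
1 x layerSize:

    #include <cstddef>
    #include <vector>

    // Forward: add the shared bias row to every sample's output row. The
    // MKL-DNN code expresses this with one sum primitive per one-row view.
    void addBiasPerRow(std::vector<float>& out, const std::vector<float>& bias,
                       std::size_t bs, std::size_t layerSize) {
      for (std::size_t i = 0; i < bs; ++i)
        for (std::size_t j = 0; j < layerSize; ++j)
          out[i * layerSize + j] += bias[j];
    }

    // Backward: the bias gradient accumulates every sample's output-grad row.
    void accumulateBiasGrad(std::vector<float>& biasGrad,
                            const std::vector<float>& outGrad, std::size_t bs,
                            std::size_t layerSize) {
      for (std::size_t i = 0; i < bs; ++i)
        for (std::size_t j = 0; j < layerSize; ++j)
          biasGrad[j] += outGrad[i * layerSize + j];
    }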
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/gserver/layers/MKLDNNAddtoLayer.h
index 15f74ec5bdf3d1e4ae5e09051be6be418590a67a..24504b7b4f50726e2b2757ca3029461cdc27b411 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -32,9 +32,15 @@ protected:
// layer size == ic * ih * iw == oc * oh *ow, and can not be changed
size_t layerSize_;
- // TODO(TJ): this part has not been optimized by MKL-DNN
std::unique_ptr<Weight> biases_;
+ // buffers for adding bias
+ std::vector<MKLDNNMatrixPtr> vals_;
+ std::vector<MKLDNNMatrixPtr> grads_;
+ // primitives for adding bias
+ std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
+ std::shared_ptr<mkldnn::primitive> bwdBias_;
+
public:
explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
@@ -91,20 +97,34 @@ protected:
* reset pipeline.
*/
void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+ std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr bias,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+ std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(inputs, output, bias)
*/
void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
+
+ /**
+ * prepare the bias buffer and the per-sample output views used to add it
+ */
+ void prepareBias(MKLDNNMatrixPtr& bias,
+ const MatrixPtr& biasMat,
+ const MKLDNNMatrixPtr& out,
+ std::vector<MKLDNNMatrixPtr>& outs);
};
} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
index 9b0ae20f089e34a719883bc65e88e33ab9334e39..ed3887cbf653878623764a310c9f364f4d8be27f 100644
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -119,7 +119,7 @@ void MKLDNNBatchNormLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
oh = ih;
- ow = ow;
+ ow = iw;
// ic_ and oc can not be changed
CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
<< "Input channel can not be changed";
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
index 82ef344c7b2aa0093a5f0a28780592dea5d51efe..e75ac5ba4647a8267b7bc189893bd7adb5c3053f 100644
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -287,7 +287,7 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
return;
}
CHECK(out) << "should have reset internal ouput grad";
- std::vector<double> scales(outputMap_.size(), 1.0);
+ std::vector<float> scales(outputMap_.size(), 1.0);
std::vector<memory::primitive_desc> srcPDs;
std::vector<primitive::at> srcs;
for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..99cfddb0cf3337745a716a8c329713c18b99eda3
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -0,0 +1,220 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ROIPoolLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(roi_pool, ROIPoolLayer);
+
+bool ROIPoolLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ Layer::init(layerMap, parameterMap);
+
+ const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+ pooledWidth_ = layerConf.pooled_width();
+ pooledHeight_ = layerConf.pooled_height();
+ spatialScale_ = layerConf.spatial_scale();
+
+ return true;
+}
+
+void ROIPoolLayer::forward(PassType passType) {
+ Layer::forward(passType);
+
+ const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+ height_ = getInput(0).getFrameHeight();
+ if (!height_) height_ = layerConf.height();
+ width_ = getInput(0).getFrameWidth();
+ if (!width_) width_ = layerConf.width();
+ channels_ = getInputValue(0)->getWidth() / width_ / height_;
+
+ size_t batchSize = getInput(0).getBatchSize();
+ size_t numROIs = getInput(1).getBatchSize();
+
+ MatrixPtr dataValue = getInputValue(0);
+ MatrixPtr roiValue = getInputValue(1);
+ resetOutput(numROIs, channels_ * pooledHeight_ * pooledWidth_);
+ MatrixPtr outputValue = getOutputValue();
+
+ if (useGpu_) { // TODO(guosheng): implement on GPU later
+ MatrixPtr dataCpuBuffer;
+ Matrix::resizeOrCreate(dataCpuBuffer,
+ dataValue->getHeight(),
+ dataValue->getWidth(),
+ false,
+ false);
+ MatrixPtr roiCpuBuffer;
+ Matrix::resizeOrCreate(roiCpuBuffer,
+ roiValue->getHeight(),
+ roiValue->getWidth(),
+ false,
+ false);
+ dataCpuBuffer->copyFrom(*dataValue);
+ roiCpuBuffer->copyFrom(*roiValue);
+ dataValue = dataCpuBuffer;
+ roiValue = roiCpuBuffer;
+ MatrixPtr outputCpuBuffer;
+ Matrix::resizeOrCreate(outputCpuBuffer,
+ outputValue->getHeight(),
+ outputValue->getWidth(),
+ false,
+ false);
+ outputCpuBuffer->copyFrom(*outputValue);
+ outputValue = outputCpuBuffer;
+ }
+
+ real* bottomData = dataValue->getData();
+ size_t batchOffset = dataValue->getWidth();
+ size_t channelOffset = height_ * width_;
+ real* bottomROIs = roiValue->getData();
+ size_t roiOffset = roiValue->getWidth();
+ size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+
+ real* outputData = outputValue->getData();
+ Matrix::resizeOrCreate(maxIdxs_,
+ numROIs,
+ channels_ * pooledHeight_ * pooledWidth_,
+ false,
+ false);
+ real* argmaxData = maxIdxs_->getData();
+
+ for (size_t n = 0; n < numROIs; ++n) {
+ // the first five elements of each RoI should be:
+ // batch_idx, roi_x_start, roi_y_start, roi_x_end, roi_y_end
+ size_t roiBatchIdx = bottomROIs[0];
+ size_t roiStartW = round(bottomROIs[1] * spatialScale_);
+ size_t roiStartH = round(bottomROIs[2] * spatialScale_);
+ size_t roiEndW = round(bottomROIs[3] * spatialScale_);
+ size_t roiEndH = round(bottomROIs[4] * spatialScale_);
+ CHECK_GE(roiBatchIdx, 0);
+ CHECK_LT(roiBatchIdx, batchSize);
+ size_t roiHeight = std::max(roiEndH - roiStartH + 1, 1UL);
+ size_t roiWidth = std::max(roiEndW - roiStartW + 1, 1UL);
+ real binSizeH =
+ static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
+ real binSizeW =
+ static_cast<real>(roiWidth) / static_cast<real>(pooledWidth_);
+ real* batchData = bottomData + batchOffset * roiBatchIdx;
+ for (size_t c = 0; c < channels_; ++c) {
+ for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+ for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+ size_t hstart = static_cast<size_t>(std::floor(ph * binSizeH));
+ size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
+ size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
+ size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
+ hstart = std::min(std::max(hstart + roiStartH, 0UL), height_);
+ wstart = std::min(std::max(wstart + roiStartW, 0UL), width_);
+ hend = std::min(std::max(hend + roiStartH, 0UL), height_);
+ wend = std::min(std::max(wend + roiStartW, 0UL), width_);
+
+ bool isEmpty = (hend <= hstart) || (wend <= wstart);
+ size_t poolIndex = ph * pooledWidth_ + pw;
+ if (isEmpty) {
+ outputData[poolIndex] = 0;
+ argmaxData[poolIndex] = -1;
+ }
+
+ for (size_t h = hstart; h < hend; ++h) {
+ for (size_t w = wstart; w < wend; ++w) {
+ size_t index = h * width_ + w;
+ if (batchData[index] > outputData[poolIndex]) {
+ outputData[poolIndex] = batchData[index];
+ argmaxData[poolIndex] = index;
+ }
+ }
+ }
+ }
+ }
+ batchData += channelOffset;
+ outputData += poolChannelOffset;
+ argmaxData += poolChannelOffset;
+ }
+ bottomROIs += roiOffset;
+ }
+ if (useGpu_) {
+ getOutputValue()->copyFrom(*outputValue);
+ }
+}
+
+void ROIPoolLayer::backward(const UpdateCallback& callback) {
+ MatrixPtr inGradValue = getInputGrad(0);
+ MatrixPtr outGradValue = getOutputGrad();
+ MatrixPtr roiValue = getInputValue(1);
+
+ if (useGpu_) {
+ MatrixPtr inGradCpuBuffer;
+ Matrix::resizeOrCreate(inGradCpuBuffer,
+ inGradValue->getHeight(),
+ inGradValue->getWidth(),
+ false,
+ false);
+ MatrixPtr outGradCpuBuffer;
+ Matrix::resizeOrCreate(outGradCpuBuffer,
+ outGradValue->getHeight(),
+ outGradValue->getWidth(),
+ false,
+ false);
+ MatrixPtr roiCpuBuffer;
+ Matrix::resizeOrCreate(roiCpuBuffer,
+ roiValue->getHeight(),
+ roiValue->getWidth(),
+ false,
+ false);
+ inGradCpuBuffer->copyFrom(*inGradValue);
+ outGradCpuBuffer->copyFrom(*outGradValue);
+ roiCpuBuffer->copyFrom(*roiValue);
+ inGradValue = inGradCpuBuffer;
+ outGradValue = outGradCpuBuffer;
+ roiValue = roiCpuBuffer;
+ }
+
+ real* bottomROIs = roiValue->getData();
+ size_t numROIs = getInput(1).getBatchSize();
+ size_t roiOffset = getInputValue(1)->getWidth();
+
+ real* inDiffData = inGradValue->getData();
+ size_t batchOffset = getInputValue(0)->getWidth();
+ size_t channelOffset = height_ * width_;
+
+ real* outDiffData = outGradValue->getData();
+ size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+ real* argmaxData = maxIdxs_->getData();
+
+ for (size_t n = 0; n < numROIs; ++n) {
+ size_t roiBatchIdx = bottomROIs[0];
+ real* batchDiffData = inDiffData + batchOffset * roiBatchIdx;
+ for (size_t c = 0; c < channels_; ++c) {
+ for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+ for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+ size_t poolIndex = ph * pooledWidth_ + pw;
+ if (argmaxData[poolIndex] >= 0) {
+ size_t index = static_cast<size_t>(argmaxData[poolIndex]);
+ batchDiffData[index] += outDiffData[poolIndex];
+ }
+ }
+ }
+ batchDiffData += channelOffset;
+ outDiffData += poolChannelOffset;
+ argmaxData += poolChannelOffset;
+ }
+ bottomROIs += roiOffset;
+ }
+
+ if (useGpu_) {
+ getInputGrad(0)->copyFrom(*inGradValue);
+ }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/gserver/layers/ROIPoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f07e49d6fd1eda9fa7bd46e4cec771a75f571be
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer used by Fast R-CNN to extract feature maps of ROIs from the last
+ * feature map.
+ * - Input: This layer needs two input layers: The first input layer is a
+ * convolution layer; The second input layer contains the ROI data,
+ * which is the output of the proposal layers in Faster R-CNN used for
+ * generating bbox location offsets and classification confidence.
+ * - Output: The ROIs' feature map.
+ * Reference:
+ * Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
+ * Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
+ * Networks
+ */
+
+class ROIPoolLayer : public Layer {
+protected:
+ size_t channels_;
+ size_t width_;
+ size_t height_;
+ size_t pooledWidth_;
+ size_t pooledHeight_;
+ real spatialScale_;
+
+ // Since there is no int matrix, use a real matrix instead.
+ MatrixPtr maxIdxs_;
+
+public:
+ explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
+
+ bool init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) override;
+
+ void forward(PassType passType) override;
+ void backward(const UpdateCallback& callback = nullptr) override;
+};
+} // namespace paddle
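To make the bin arithmetic in ROIPoolLayer::forward concrete, here is a small
standalone sketch (illustrative only; all names are local to this sketch) of
how one pooled cell maps back onto the feature map and takes the max over its
bin:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // Computes one pooled cell (ph, pw) for one channel, mirroring the
    // hstart/hend/wstart/wend arithmetic in the forward loop above.
    float poolOneBin(const float* channel, std::size_t height, std::size_t width,
                     std::size_t roiStartH, std::size_t roiStartW,
                     float binSizeH, float binSizeW, std::size_t ph, std::size_t pw) {
      std::size_t hstart = std::min(height, roiStartH + (std::size_t)std::floor(ph * binSizeH));
      std::size_t wstart = std::min(width, roiStartW + (std::size_t)std::floor(pw * binSizeW));
      std::size_t hend = std::min(height, roiStartH + (std::size_t)std::ceil((ph + 1) * binSizeH));
      std::size_t wend = std::min(width, roiStartW + (std::size_t)std::ceil((pw + 1) * binSizeW));
      if (hend <= hstart || wend <= wstart) return 0.0f;  // empty bin
      float best = channel[hstart * width + wstart];
      for (std::size_t h = hstart; h < hend; ++h)
        for (std::size_t w = wstart; w < wend; ++w)
          best = std::max(best, channel[h * width + w]);
      return best;
    }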
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aa6778aef4e893208fd064ca22e217c6c4d960f9
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionLayer.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+
+REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer);
+
+bool ScaleSubRegionLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ Layer::init(layerMap, parameterMap);
+ CHECK_EQ(static_cast<int>(inputLayers_.size()), 2);
+ auto& conf = config_.inputs(0).scale_sub_region_conf();
+ value_ = conf.value();
+
+ createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_));
+ createFunction(
+ backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_));
+
+ return true;
+}
+
+void ScaleSubRegionLayer::forward(PassType passType) {
+ Layer::forward(passType);
+ auto in0 = getInput(0);
+ imgH_ = in0.getFrameHeight();
+ imgW_ = in0.getFrameWidth();
+ if (imgH_ == 0 || imgW_ == 0) {
+ auto& conf = config_.inputs(0).scale_sub_region_conf();
+ imgH_ = conf.image_conf().img_size_y();
+ imgW_ = conf.image_conf().img_size();
+ }
+ MatrixPtr imgV = in0.value;
+ size_t batchSize = imgV->getHeight();
+ size_t spatialSize = imgH_ * imgW_;
+ channelsNum_ = imgV->getWidth() / spatialSize;
+ shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});
+
+ resetOutput(batchSize, imgV->getWidth());
+ auto& out = getOutput();
+ out.setFrameHeight(imgH_);
+ out.setFrameWidth(imgW_);
+
+ MatrixPtr indicesV = getInputValue(1);
+ indicesShape_ = TensorShape({batchSize, 6});
+
+ REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
+ BufferArgs inArgs;
+ BufferArgs outArgs;
+ inArgs.addArg(*imgV, shape_);
+ inArgs.addArg(*indicesV, indicesShape_);
+ outArgs.addArg(*out.value, shape_, ASSIGN_TO);
+ forward_[0]->calc(inArgs, outArgs);
+}
+
+void ScaleSubRegionLayer::backward(const UpdateCallback& callback) {
+ REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str());
+ BufferArgs inArgs;
+ BufferArgs outArgs;
+ inArgs.addArg(*getOutputGrad(), shape_);
+ inArgs.addArg(*getInputValue(1), indicesShape_);
+ outArgs.addArg(*getInputGrad(0), shape_, ADD_TO);
+ backward_[0]->calc(inArgs, outArgs);
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.h b/paddle/gserver/layers/ScaleSubRegionLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a27c56de93bb6fdde0f95cd4c5abe5dfabe4e858
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief For each instance, this layer can be used to multiply the values
+ * in a specified continuous sub-region by a scalar. By providing the
+ * start and end indices for C/H/W, you can specify the location and
+ * shape of the region.
+ *
+ * input_0: Input value.
+ * input_1: Indices value to specify the location and shape of the
+ * region.
+ */
+class ScaleSubRegionLayer : public Layer {
+public:
+ explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
+
+ ~ScaleSubRegionLayer() {}
+
+ bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+ void forward(PassType passType);
+
+ void backward(const UpdateCallback& callback = nullptr);
+
+protected:
+ TensorShape shape_;
+ TensorShape indicesShape_;
+ size_t imgH_;
+ size_t imgW_;
+ size_t channelsNum_;
+ real value_;
+};
+
+} // namespace paddle
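A plain-C++ reference for one sample's forward pass, assuming the 1-based,
inclusive [start, end] index convention suggested by the ScaleSubRegion unit
test earlier in this patch (illustrative only; names are local to this sketch):

    #include <cstddef>
    #include <vector>

    // Multiply a C/H/W sub-region of one sample by value; everything outside
    // the region passes through unchanged.
    void scaleSubRegion(std::vector<float>& out, const std::vector<float>& in,
                        const float* indices,  // {cs, ce, hs, he, ws, we}
                        std::size_t height, std::size_t width, float value) {
      out = in;
      for (std::size_t c = (std::size_t)indices[0] - 1; c < (std::size_t)indices[1]; ++c)
        for (std::size_t h = (std::size_t)indices[2] - 1; h < (std::size_t)indices[3]; ++h)
          for (std::size_t w = (std::size_t)indices[4] - 1; w < (std::size_t)indices[5]; ++w)
            out[(c * height + h) * width + w] *= value;
    }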
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 1a46fb49153a0aa4228f58db481b950bc2d6de83..fcbcb5b0f1f4cb07066363c9fa93fb1726459f30 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -53,7 +53,7 @@ TEST(Operator, dot_mul) {
TEST(Projection, context) {
for (auto contextStart : {-5, -3, -1, 0, 3}) {
for (auto contextLength : {1, 2, 5, 7}) {
- for (auto batchSize : {1, 2, 5, 20, 50}) {
+ for (auto batchSize : {1, 2, 5, 20}) {
for (auto trainablePadding : {false, true}) {
LOG(INFO) << " contextStart=" << contextStart
<< " contextLength=" << contextLength
@@ -585,14 +585,14 @@ TEST(Layer, maxoutLayer) {
}
void testFcLayer(string format, size_t nnz) {
TestConfig config;
- config.biasSize = 4096;
+ config.biasSize = 1024;
config.layerConfig.set_type("fc");
- config.layerConfig.set_size(4096);
+ config.layerConfig.set_size(1024);
config.layerConfig.set_active_type("sigmoid");
config.layerConfig.set_drop_rate(0.1);
config.inputDefs.push_back(
- {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+ {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)});
config.layerConfig.add_inputs();
LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
@@ -609,9 +609,9 @@ void testFcLayer(string format, size_t nnz) {
}
TEST(Layer, fcLayer) {
- testFcLayer("", 4096 * 4096 * 2);
- testFcLayer("csc", 4096 * 40);
- testFcLayer("csr", 4096 * 40);
+ testFcLayer("", 1024 * 1024 * 2);
+ testFcLayer("csc", 1024 * 10);
+ testFcLayer("csr", 1024 * 10);
}
TEST(Layer, SelectiveFullyConnectedLayer) {
@@ -1995,7 +1995,7 @@ TEST(Layer, multibox_loss) {
TEST(Layer, TransLayer) {
TestConfig config;
const int height = 128;
- const int width = 1028;
+ const int width = 256;
config.layerConfig.set_type("trans");
config.layerConfig.set_size(width);
@@ -2056,6 +2056,43 @@ TEST(Layer, CropLayer) {
}
}
+TEST(Layer, roi_pool) {
+ TestConfig config;
+ config.layerConfig.set_type("roi_pool");
+ config.biasSize = 0;
+ LayerInputConfig* input = config.layerConfig.add_inputs();
+ ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
+ roiPoolConf->set_pooled_width(7);
+ roiPoolConf->set_pooled_height(7);
+ roiPoolConf->set_spatial_scale(1. / 16);
+ roiPoolConf->set_width(14);
+ roiPoolConf->set_height(14);
+
+ const size_t roiNum = 10;
+ const size_t roiDim = 10;
+ const size_t batchSize = 5;
+ MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
+ roiValue->zeroMem();
+ real* roiData = roiValue->getData();
+ for (size_t i = 0; i < roiNum; ++i) {
+ roiData[i * roiDim + 0] = std::rand() % batchSize;
+ roiData[i * roiDim + 1] = std::rand() % 224; // xMin
+ roiData[i * roiDim + 2] = std::rand() % 224; // yMin
+ size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]);
+ size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]);
+ roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin); // xMax
+ roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin); // yMax
+ }
+
+ config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
+ config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
+ config.layerConfig.add_inputs();
+
+ for (auto useGpu : {false, true}) {
+ testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
+ }
+}
+
TEST(Layer, SwitchOrderLayer) {
TestConfig config;
// config input_0
@@ -2358,6 +2395,38 @@ TEST(Layer, ScaleShiftLayer) {
}
}
+TEST(Layer, ScaleSubRegionLayer) {
+ const size_t batchSize = 64;
+ const size_t size = 4096;
+ TestConfig config;
+ config.layerConfig.set_type("scale_sub_region");
+ config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+ MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false);
+ auto* data = indicesV->getData();
+ for (size_t i = 0; i < batchSize; ++i) {
+ data[i * 6] = 2;
+ data[i * 6 + 1] = 4;
+ data[i * 6 + 2] = 16;
+ data[i * 6 + 3] = 32;
+ data[i * 6 + 4] = 16;
+ data[i * 6 + 5] = 32;
+ }
+ config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}});
+ LayerInputConfig* input = config.layerConfig.add_inputs();
+ ScaleSubRegionConfig* scaleSubRegionConf =
+ input->mutable_scale_sub_region_conf();
+ ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf();
+ imgConf->set_img_size(32);
+ imgConf->set_img_size_y(32);
+ imgConf->set_channels(4);
+ scaleSubRegionConf->set_value(2.0);
+ config.layerConfig.add_inputs();
+
+ for (auto useGpu : {false, true}) {
+ testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false);
+ }
+}
+
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 2e8d9f3333b36005c9b3b28449c76a4a44c74cc6..a0e039c2a33b586e21775ad06c1278a10804d654 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -269,6 +269,7 @@ void testBatchNormLayer(const testBatchNormDesc& pm) {
TEST(MKLDNNLayer, BatchNormLayer) {
testBatchNormLayer({4, 10, 6, 6});
testBatchNormLayer({16, 32, 16, 16});
+ testBatchNormLayer({4, 16, 8, 10});
}
struct testImageDesc {
@@ -300,13 +301,8 @@ void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
TestConfig dnnConfig;
getAddtoConfig(dnnConfig, pm, nInputs);
dnnConfig.layerConfig.set_type("mkldnn_addto");
- // TODO(TJ): test with bias
- for (auto withBias : {false}) {
- if (withBias) {
- dnnConfig.biasSize = pm.ic * pm.ih * pm.iw;
- } else {
- dnnConfig.biasSize = 0;
- }
+ for (auto withBias : {false, true}) {
+ dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
}
}
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 21a8f73c3e650d4b3c3b86247594cd965f4ead35..a710479bab82ed52122cf59bb14a05ccbd4aa05c 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -152,12 +152,7 @@ void MKLDNNMatrix::downSpatial() {
}
memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
- mkldnn_primitive_t result;
- mkldnn::error::wrap_c_api(
- mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
- "could not create a memory primitive");
- reset(result);
- set_data_handle(data_);
+ resetMKLDNNMemory(pd, data_);
}
} // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index 54cfefe23b3dc70fd12fd2ca8886c941047b59f7..39d40a1f61609a649d3341c170d24b0604921ac2 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -145,6 +145,27 @@ public:
m_.reset();
}
+ /**
+ * override the CpuMatrix::resize
+ */
+ void resize(size_t newHeight, size_t newWidth) override {
+ m_->resize(newHeight, newWidth);
+ if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) {
+ return;
+ }
+ CpuMatrix::setData(data_);
+ height_ = newHeight;
+ width_ = newWidth;
+ elementCnt_ = newHeight * newWidth;
+ stride_ = width_;
+ auto pd = mkldnn::memory::primitive_desc(
+ mkldnn::memory::desc({(int)newHeight, (int)newWidth},
+ getDtype(),
+ mkldnn::memory::format::nc),
+ getEngine());
+ resetMKLDNNMemory(pd, data_);
+ }
+
/**
* override Matrix::getData
* check data before return
@@ -215,6 +236,17 @@ protected:
memory::format srcFmt,
memory::format dstFmt,
memory::dims dm);
+ /**
* reset this MKLDNN Memory from a primitive desc
+ */
+ void resetMKLDNNMemory(memory::primitive_desc pd, real* data) {
+ mkldnn_primitive_t result;
+ mkldnn::error::wrap_c_api(
+ mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+ "could not create a memory primitive");
+ reset(result);
+ set_data_handle(data);
+ }
private:
// save the CpuMatrixPtr in case the buffer released outside
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index c2f17beeb87942ea681f5d388659c0d280157b26..ba86eacbb5d53ee43a60d2cd1dd922333a5d48f0 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -206,7 +206,7 @@ double dotProduct(const int n, const double* x, const double* y) {
}
#endif
-#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
+#if defined(PADDLE_USE_MKLML)
template <>
void vExp(const int n, const float* a, float* r) {
@@ -295,38 +295,6 @@ template void vAdd(const int n, const double* a, const double* b, double* r);
#endif
-#ifdef PADDLE_USE_MKL
-template <>
-void vInvSqrt(const int n, const float* a, float* r) {
- vsInvSqrt(n, a, r);
-}
-
-template <>
-void vInvSqrt(const int n, const double* a, double* r) {
- vdInvSqrt(n, a, r);
-}
-
-template <>
-void vLog1p(const int n, const float* a, float* r) {
- vsLog1p(n, a, r);
-}
-
-template <>
-void vLog1p(const int n, const double* a, double* r) {
- vdLog1p(n, a, r);
-}
-
-template <>
-void vTanh(const int n, const float* a, float* r) {
- vsTanh(n, a, r);
-}
-
-template <>
-void vTanh(const int n, const double* a, double* r) {
- vdTanh(n, a, r);
-}
-#else
-
DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
template <class T>
void vInvSqrt(const int n, const T* a, T* r) {
@@ -357,6 +325,4 @@ template void vLog1p(const int n, const double* a, double* r);
template void vTanh(const int n, const float* a, float* r);
template void vTanh(const int n, const double* a, double* r);
-#endif
-
} // namespace paddle
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index 8193aa4adffc0409d8ea68417c68fa153a2942d8..f6e77029bdd75a602f88b688ca810f47ba4ee615 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -21,11 +21,6 @@ limitations under the License. */
#include
#endif
-#ifdef PADDLE_USE_MKL
-#include <mkl.h>
-#include <mkl_lapacke.h>
-#endif
-
#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB)
extern "C" {
#include <cblas.h>
diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h
index 5bc4a03067a75527fa30e5bb5526f93dc7b9fdcc..b998e5772e70d0a0ec79dc4064dcbaa2c302efd2 100644
--- a/paddle/math/tests/TensorCheck.h
+++ b/paddle/math/tests/TensorCheck.h
@@ -169,7 +169,7 @@ void TensorCheck(AssertEq compare,
count++;
}
}
- EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+ EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
}
template
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index d0c4c0d25d6f4e3ab7acd72d62a8a17fa102637b..1776f33105367447759aa91c25263dfc53bd2f99 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -65,7 +65,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
size_t num_samples = inference->dims()[0];
size_t infer_width = inference->dims()[1];
- cudaMemset((void**)&accuracy_data, 0, sizeof(float));
+ PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float)));
if (num_samples == 0) {
return;
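The removed cudaMemset call passed the address of the host-side pointer
variable where the device pointer itself is expected, so the accuracy buffer
was never actually zeroed, and the returned error code went unchecked. A
hypothetical, self-contained illustration of the corrected pattern:

    #include <cuda_runtime.h>

    // Illustrative only, not part of the patch.
    void zeroDeviceFloat() {
      float* dev_buf = nullptr;
      cudaMalloc(&dev_buf, sizeof(float));    // cudaMalloc takes the address of the pointer
      cudaMemset(dev_buf, 0, sizeof(float));  // cudaMemset takes the device pointer itself
      cudaFree(dev_buf);
    }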
diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h
index 1968b53d19acfddaa514eca6e24d98a298d8d311..28dbc77f64842a62e88ae8df4ead7adc3b03764b 100644
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once
#include
-#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
@@ -22,18 +21,6 @@ namespace operators {
using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
-
template <typename Place, typename T>
class AccuracyKernel : public framework::OpKernel<T> {
public:
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..666043e824f885e9c0e79e319d0a38ba108c209a
--- /dev/null
+++ b/paddle/operators/array_operator.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+class ArrayOp : public framework::OperatorBase {
+ public:
+ ArrayOp(const std::string &type, const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+ size_t GetOffset(const framework::Scope &scope,
+ const platform::DeviceContext &dev_ctx) const {
+ auto *i = scope.FindVar(Input("I"));
+ PADDLE_ENFORCE(i != nullptr, "I must be set");
+ auto &i_tensor = i->Get<framework::LoDTensor>();
+ PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
+ size_t offset;
+ if (platform::is_gpu_place(i_tensor.place())) {
+ // FIXME: Avoid copy from GPU to CPU
+ framework::Tensor t;
+ t.CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx);
+ dev_ctx.Wait();
+ offset = static_cast<size_t>(*t.data<int64_t>());
+ } else {
+ offset = static_cast<size_t>(*i_tensor.data<int64_t>());
+ }
+ return offset;
+ }
+};
+
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc
index 6cd9c06b8ae3d3b17be83268c2f5d4002705b111..c0903bb4e5ca7f160e19eefab99af7e3e4a8ed76 100644
--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -140,6 +140,23 @@ class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
"ArrayToLoDTensorOp must has input X.");
PADDLE_ENFORCE(context->HasInput("RankTable"),
"ArrayToLoDTensorOp must has input RankTable.");
+ context->SetOutputDim("Out", context->GetInputDim("X"));
+ }
+};
+
+class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+ using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+ std::unique_ptr<framework::OpDescBind> Apply() const override {
+ auto *grad_op = new framework::OpDescBind();
+ grad_op->SetType("lod_tensor_to_array");
+ grad_op->SetInput("X", OutputGrad("Out"));
+ grad_op->SetInput("RankTable", Input("RankTable"));
+ grad_op->SetOutput("Out", InputGrad("X"));
+ grad_op->SetAttrMap(Attrs());
+ return std::unique_ptr<framework::OpDescBind>(grad_op);
}
};
@@ -149,4 +166,5 @@ class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
namespace ops = paddle::operators;
REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp,
ops::ArrayToLoDTensorOpProtoMaker,
- ops::ArrayToLoDTensorInferShape);
+ ops::ArrayToLoDTensorInferShape,
+ ops::ArrayToLoDTensorGradMaker);
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index 8721ca352848fc4d69b206d4ea0ab7c581c8d055..f884e6efa917ce3f8554dce0e248f2b29273e3f3 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -19,9 +19,6 @@ namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
using EigenArrayMap =
diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..309660b01fe7052de2f9300acdf00779d0228221
--- /dev/null
+++ b/paddle/operators/chunk_eval_op.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/chunk_eval_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ChunkEvalOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ void InferShape(framework::InferShapeContext *ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("Inference"),
+ "Input(Inference) of ChunkEvalOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("Label"),
+ "Input(Label) of ChunkEvalOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Precision"),
+ "Output(Precision) of ChunkEvalOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Recall"),
+ "Output(Recall) of ChunkEvalOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("F1-Score"),
+ "Output(F1-Score) of ChunkEvalOp should not be null.");
+
+ auto inference_dim = ctx->GetInputDim("Inference");
+ auto label_dim = ctx->GetInputDim("Label");
+
+ PADDLE_ENFORCE(inference_dim == label_dim,
+ "Inference's shape must be the same as Label's shape.");
+
+ ctx->SetOutputDim("Precision", {1});
+ ctx->SetOutputDim("Recall", {1});
+ ctx->SetOutputDim("F1-Score", {1});
+ }
+
+ protected:
+ framework::OpKernelType GetKernelType(
+ const framework::ExecutionContext &ctx) const override {
+ return framework::OpKernelType(framework::DataType::FP32,
+ ctx.device_context());
+ }
+};
+
+class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ChunkEvalOpMaker(framework::OpProto *proto,
+ framework::OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("Inference",
+ "(Tensor, default: Tensor). Predictions from the network.");
+ AddInput("Label",
+ "(Tensor, default: Tensor). The true tag sequences.");
+ AddOutput("Precision",
+ "(float). The evaluated precision (called positive predictive "
+ "value) of chunks on the given mini-batch.");
+ AddOutput("Recall",
+ "(float). The evaluated recall (true positive rate or "
+ "sensitivity) of chunks on the given mini-batch.");
+ AddOutput("F1-Score",
+ "(float). The evaluated F1-Score on the given mini-batch.");
+ AddAttr("num_chunk_types",
+ "(int). The number of chunk type. See below for details.");
+ AddAttr(
+ "chunk_scheme",
+ "(string, default IOB). The labeling scheme indicating "
+ "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below "
+ "for details.")
+ .SetDefault("IOB");
+ AddAttr>("excluded_chunk_types",
+ "(list) A list including chunk type ids "
+ "indicating chunk types that are not counted. "
+ "See below for details.")
+ .SetDefault(std::vector{});
+ AddComment(R"DOC(
+For some basics of chunking, please refer to
+‘Chunking with Support Vector Machines’.
+
+
+ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
+and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
+Here is a NER example of labeling for these tagging schemes:
+
+ Li Ming works at Agricultural Bank of China in Beijing.
+ IO: I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC
+ IOB: B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC
+ IOE: I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC
+ IOBES: B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC
+
+There are three chunk types (named entity types) including PER (person), ORG (organization)
+and LOC (location), and we can see that the labels have the form <tag type>-<chunk type>.
+
+Since the calculations actually use label ids rather than labels, extra attention
+should be paid when mapping labels to ids to make ChunkEvalOp work. The key point
+is that the listed equations are satisfied by the ids.
+
+ tag_type = label % num_tag_type
+ chunk_type = label / num_tag_type
+
+where `num_tag_type` is the number of tag types in the tagging scheme, `num_chunk_type`
+is the number of chunk types, and `tag_type` gets its value from the following table.
+
+ Scheme Begin Inside End Single
+ plain 0 - - -
+ IOB 0 1 - -
+ IOE - 0 1 -
+ IOBES 0 1 2 3
+
+Still using NER as an example, assume the tagging scheme is IOB and the chunk types are ORG,
+PER and LOC. To satisfy the above equations, the label map can be like this:
+
+ B-ORG 0
+ I-ORG 1
+ B-PER 2
+ I-PER 3
+ B-LOC 4
+ I-LOC 5
+ O 6
+
+It’s not hard to verify the equations, noting that the number of chunk types
+is 3 and the number of tag types in the IOB scheme is 2. For example, the label
+id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
+I-LOC is 2, which is consistent with the results from the equations.
+)DOC");
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, ops::ChunkEvalOp,
+ ops::ChunkEvalOpMaker);
+REGISTER_OP_CPU_KERNEL(chunk_eval,
+ ops::ChunkEvalKernel<platform::CPUPlace, float>);
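A worked example of the label-to-id equations from the DOC comment above, for
the IOB scheme (num_tag_type == 2) and the seven-label map it lists (sketch,
not part of the patch):

    #include <cstdio>

    int main() {
      const char* names[] = {"B-ORG", "I-ORG", "B-PER", "I-PER",
                             "B-LOC", "I-LOC", "O"};
      for (int label = 0; label <= 6; ++label) {
        int tag_type = label % 2;    // 0 = Begin, 1 = Inside
        int chunk_type = label / 2;  // 0 = ORG, 1 = PER, 2 = LOC, 3 = other ("O")
        std::printf("%-6s tag_type=%d chunk_type=%d\n", names[label], tag_type,
                    chunk_type);
      }
      return 0;
    }

Note that "O" (label 6) maps to chunk type 3, which equals num_chunk_types,
matching the `other_chunk_type = num_chunk_types` assignment in the kernel.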
diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..81aa07817b673b2ff85a35a51cc43742b7ad7fed
--- /dev/null
+++ b/paddle/operators/chunk_eval_op.h
@@ -0,0 +1,219 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <set>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename Place, typename T>
+class ChunkEvalKernel : public framework::OpKernel<T> {
+ public:
+ struct Segment {
+ int begin;
+ int end;
+ int type;
+ bool operator==(const Segment& y) const {
+ return begin == y.begin && end == y.end && type == y.type;
+ }
+ };
+
+ void GetSegments(const int* label, int length, std::vector<Segment>& segments,
+ int num_chunk_types, int num_tag_types, int other_chunk_type,
+ int tag_begin, int tag_inside, int tag_end,
+ int tag_single) const {
+ segments.clear();
+ segments.reserve(length);
+ int chunk_start = 0;
+ bool in_chunk = false;
+ int tag = -1;
+ int type = other_chunk_type;
+ for (int i = 0; i < length; ++i) {
+ int prev_tag = tag;
+ int prev_type = type;
+ PADDLE_ENFORCE_LE(label[i], num_chunk_types * num_tag_types);
+ tag = label[i] % num_tag_types;
+ type = label[i] / num_tag_types;
+ if (in_chunk && ChunkEnd(prev_tag, prev_type, tag, type, other_chunk_type,
+ tag_begin, tag_inside, tag_end, tag_single)) {
+ Segment segment{
+ chunk_start, // begin
+ i - 1, // end
+ prev_type,
+ };
+ segments.push_back(segment);
+ in_chunk = false;
+ }
+ if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type,
+ tag_begin, tag_inside, tag_end, tag_single)) {
+ chunk_start = i;
+ in_chunk = true;
+ }
+ }
+ if (in_chunk) {
+ Segment segment{
+ chunk_start, // begin
+ length - 1, // end
+ type,
+ };
+ segments.push_back(segment);
+ }
+ }
+
+ bool ChunkEnd(int prev_tag, int prev_type, int tag, int type,
+ int other_chunk_type, int tag_begin, int tag_inside,
+ int tag_end, int tag_single) const {
+ if (prev_type == other_chunk_type) return false;
+ if (type == other_chunk_type) return true;
+ if (type != prev_type) return true;
+ if (prev_tag == tag_begin) return tag == tag_begin || tag == tag_single;
+ if (prev_tag == tag_inside) return tag == tag_begin || tag == tag_single;
+ if (prev_tag == tag_end) return true;
+ if (prev_tag == tag_single) return true;
+ return false;
+ }
+
+ bool ChunkBegin(int prev_tag, int prev_type, int tag, int type,
+ int other_chunk_type, int tag_begin, int tag_inside,
+ int tag_end, int tag_single) const {
+ if (prev_type == other_chunk_type) return type != other_chunk_type;
+ if (type == other_chunk_type) return false;
+ if (type != prev_type) return true;
+ if (tag == tag_begin) return true;
+ if (tag == tag_inside) return prev_tag == tag_end || prev_tag == tag_single;
+ if (tag == tag_end) return prev_tag == tag_end || prev_tag == tag_single;
+ if (tag == tag_single) return true;
+ return false;
+ }
+
+ void Compute(const framework::ExecutionContext& context) const override {
+ // initialize to parse configurations
+ int num_chunk_types, num_tag_types;
+ int other_chunk_type;
+ int tag_begin, tag_inside, tag_end, tag_single;
+ std::vector<Segment> label_segments;
+ std::vector<Segment> output_segments;
+ std::set<int> excluded_chunk_types;
+ int64_t num_output_segments = 0;
+ int64_t num_label_segments = 0;
+ int64_t num_correct = 0;
+ if (context.Attr("chunk_scheme") == "IOB") {
+ num_tag_types = 2;
+ tag_begin = 0;
+ tag_inside = 1;
+ tag_end = -1;
+ tag_single = -1;
+ } else if (context.Attr("chunk_scheme") == "IOE") {
+ num_tag_types = 2;
+ tag_begin = -1;
+ tag_inside = 0;
+ tag_end = 1;
+ tag_single = -1;
+ } else if (context.Attr("chunk_scheme") == "IOBES") {
+ num_tag_types = 4;
+ tag_begin = 0;
+ tag_inside = 1;
+ tag_end = 2;
+ tag_single = 3;
+ } else if (context.Attr("chunk_scheme") == "plain") {
+ num_tag_types = 1;
+ tag_begin = -1;
+ tag_inside = -1;
+ tag_end = -1;
+ tag_single = -1;
+ } else {
+ PADDLE_THROW("Unknown chunk scheme.");
+ }
+ other_chunk_type = num_chunk_types = context.Attr<int>("num_chunk_types");
+ excluded_chunk_types.insert(
+ context.Attr>("excluded_chunk_types").begin(),
+ context.Attr>("excluded_chunk_types").end());
+
+ auto* inference = context.Input<LoDTensor>("Inference");
+ auto* label = context.Input<LoDTensor>("Label");
+ auto* precision = context.Output<Tensor>("Precision");
+ auto* recall = context.Output<Tensor>("Recall");
+ auto* f1 = context.Output<Tensor>("F1-Score");
+
+ const int* inference_data = inference->data<int>();
+ const int* label_data = label->data<int>();
+ T* precision_data = precision->mutable_data<T>(context.GetPlace());
+ T* recall_data = recall->mutable_data<T>(context.GetPlace());
+ T* f1_data = f1->mutable_data<T>(context.GetPlace());
+
+ auto lod = label->lod();
+ PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+ PADDLE_ENFORCE(lod == inference->lod(),
+ "LoD must be same between Inference and Label.");
+ int num_sequences = lod[0].size() - 1;
+ for (int i = 0; i < num_sequences; ++i) {
+ int seq_length = lod[0][i + 1] - lod[0][i];
+ EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
+ output_segments, label_segments, num_output_segments,
+ num_label_segments, num_correct, num_chunk_types,
+ num_tag_types, other_chunk_type, tag_begin, tag_inside,
+ tag_end, tag_single, excluded_chunk_types);
+ }
+ *precision_data = !num_output_segments ? 0 : static_cast<T>(num_correct) /
+ num_output_segments;
+ *recall_data = !num_label_segments ? 0 : static_cast<T>(num_correct) /
+ num_label_segments;
+ *f1_data = !num_correct ? 0 : 2 * (*precision_data) * (*recall_data) /
+ ((*precision_data) + (*recall_data));
+ }
+
+ void EvalOneSeq(const int* output, const int* label, int length,
+ std::vector<Segment>& output_segments,
+ std::vector<Segment>& label_segments,
+ int64_t& num_output_segments, int64_t& num_label_segments,
+ int64_t& num_correct, int num_chunk_types, int num_tag_types,
+ int other_chunk_type, int tag_begin, int tag_inside,
+ int tag_end, int tag_single,
+ const std::set<int>& excluded_chunk_types) const {
+ GetSegments(output, length, output_segments, num_chunk_types, num_tag_types,
+ other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
+ GetSegments(label, length, label_segments, num_chunk_types, num_tag_types,
+ other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
+ size_t i = 0, j = 0;
+ while (i < output_segments.size() && j < label_segments.size()) {
+ if (output_segments[i] == label_segments[j] &&
+ excluded_chunk_types.count(output_segments[i].type) != 1) {
+ ++num_correct;
+ }
+ if (output_segments[i].end < label_segments[j].end) {
+ ++i;
+ } else if (output_segments[i].end > label_segments[j].end) {
+ ++j;
+ } else {
+ ++i;
+ ++j;
+ }
+ }
+ for (auto& segment : label_segments) {
+ if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments;
+ }
+ for (auto& segment : output_segments) {
+ if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments;
+ }
+ }
+};
+
+} // namespace operators
+} // namespace paddle
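For clarity, the metric computation in Compute() reduces to the usual
precision = correct / predicted, recall = correct / labeled, and
F1 = 2PR / (P + R). A tiny numeric check with made-up counts (sketch, not part
of the patch):

    #include <cstdio>

    int main() {
      double num_correct = 6, num_output_segments = 8, num_label_segments = 10;
      double p = num_correct / num_output_segments;  // 0.75
      double r = num_correct / num_label_segments;   // 0.60
      double f1 = 2 * p * r / (p + r);               // about 0.667
      std::printf("P=%.3f R=%.3f F1=%.3f\n", p, r, f1);
      return 0;
    }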
diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9fc532e39500fa397be80396b075e866bad9362
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.cc
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include "paddle/operators/clip_by_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ClipByNormOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"),
+ "Input(X) of ClipByNormOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Out"),
+ "Output(Out) of ClipByNormOp should not be null.");
+ auto max_norm = ctx->Attrs().Get<float>("max_norm");
+ PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0.");
+ auto x_dims = ctx->GetInputDim("X");
+ ctx->SetOutputDim("Out", x_dims);
+ ctx->ShareLoD("X", /*->*/ "Out");
+ }
+};
+
+class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ClipByNormOpMaker(framework::OpProto* proto,
+ framework::OpAttrChecker* op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("X",
+ "(Tensor) The input of clip_by_norm op."
+ "The number of dimensions must be between [1, 9].");
+ AddOutput("Out",
+ "(Tensor) The output of clip_by_norm op with shape as input(X)");
+ AddAttr("max_norm", "(float) The maximum norm value.");
+ AddComment(R"DOC(
+ClipByNorm operator limits the L2 norm of the input 'X' within 'max_norm'.
+If the L2 norm of 'X' is less than or equal to 'max_norm', 'Out' will be
+the same as 'X'. If the L2 norm of 'X' is greater than 'max_norm', 'X' will
+be linearly scaled to make the L2 norm of 'Out' equal to 'max_norm', as
+shown in the following formula:
+
+'Out' = 'max_norm' * 'X' / norm('X'),
+
+where norm('X') represents the L2 norm of 'X'.
+)DOC");
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp,
+ ops::ClipByNormOpMaker);
+REGISTER_OP_CPU_KERNEL(
+ clip_by_norm, ops::ClipByNormKernel<paddle::platform::CPUPlace, float>);
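A quick numeric illustration of the formula in the DOC comment above,
Out = max_norm * X / norm(X) when norm(X) > max_norm (sketch, not part of the
patch):

    #include <cmath>
    #include <cstdio>

    int main() {
      float x[2] = {3.0f, 4.0f};  // L2 norm = 5
      float max_norm = 1.0f;
      float norm = std::sqrt(x[0] * x[0] + x[1] * x[1]);
      float scale = norm > max_norm ? max_norm / norm : 1.0f;
      std::printf("out = {%.2f, %.2f}\n", x[0] * scale, x[1] * scale);  // {0.60, 0.80}
      return 0;
    }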
diff --git a/paddle/operators/increment_op.cu b/paddle/operators/clip_by_norm_op.cu
similarity index 64%
rename from paddle/operators/increment_op.cu
rename to paddle/operators/clip_by_norm_op.cu
index f97a6c468522f033687bd83ae5b1a1bc7d86fa80..2593a24ebbf56ecd286a726e527d2414247576e8 100644
--- a/paddle/operators/increment_op.cu
+++ b/paddle/operators/clip_by_norm_op.cu
@@ -12,11 +12,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/operators/increment_op.h"
+#include "paddle/operators/clip_by_norm_op.h"
+namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
- increment,
- paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>,
- paddle::operators::IncrementKernel<paddle::platform::GPUPlace, double>,
- paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int>,
- paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int64_t>);
+ clip_by_norm, ops::ClipByNormKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b26476cae9b5b2fa290bc9186b9a64c48ba703d6
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class ClipByNormKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+ auto max_norm = context.Attr<float>("max_norm");
+ auto* input = context.Input<Tensor>("X");
+ auto* output = context.Output<Tensor>("Out");
+ output->mutable_data<T>(context.GetPlace());
+
+ auto x = EigenVector<T>::Flatten(*input);
+ auto out = EigenVector<T>::Flatten(*output);
+ auto x_norm = x.square().sum().sqrt();
+ auto place = context.GetEigenDevice<Place>();
+
+ // temp is 1 when the norm is within max_norm and 0 otherwise, so the
+ // expression below selects a scaling of 1 or max_norm / norm(X) branchlessly.
+ auto temp = (x_norm <= max_norm).template cast<T>().eval();
+ auto scaling = temp + (static_cast<T>(1) - temp) * max_norm / x_norm;
+ Eigen::array<int, 1> one_dim{{1}};
+ Eigen::DSizes<int, 1> m_dsize(input->numel());
+ out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize);
+ }
+};
+
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc
index 8b425d14df3bc484437dc72f29abf13b887006bd..716b5ee92d0d8737d2069460f53989f691ff7c77 100644
--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
@@ -14,6 +14,7 @@
#include "paddle/operators/compare_op.h"
#include "paddle/framework/op_registry.h"
+
namespace paddle {
namespace operators {
template <typename OpComment>
@@ -61,19 +62,34 @@ class CompareOpInferShape : public framework::InferShapeBase {
}
};
+class CompareOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ framework::OpKernelType GetKernelType(
+ const framework::ExecutionContext &ctx) const override {
+ framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx);
+ // CompareOp kernel's device type is decided by input tensor place
+ kt.place_ = ctx.Input("X")->place();
+ return kt;
+ }
+};
+
} // namespace operators
} // namespace paddle
-#define REGISTER_LOGICAL_OP(op_type, _equation) \
- struct _##op_type##Comment { \
- static char type[]; \
- static char equation[]; \
- }; \
- char _##op_type##Comment::type[]{#op_type}; \
- char _##op_type##Comment::equation[]{_equation}; \
- REGISTER_OP_WITH_KERNEL( \
- op_type, ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \
- ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
+#define REGISTER_LOGICAL_OP(op_type, _equation) \
+ struct _##op_type##Comment { \
+ static char type[]; \
+ static char equation[]; \
+ }; \
+ char _##op_type##Comment::type[]{#op_type}; \
+ char _##op_type##Comment::equation[]{_equation}; \
+ REGISTER_OPERATOR( \
+ op_type, ::paddle::operators::CompareOp, \
+ ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \
+ ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
::paddle::framework::EmptyGradOpMaker);
REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..282775fcda45fe3bbd72bf04a7ae828f2c840ab7
--- /dev/null
+++ b/paddle/operators/expand_op.cc
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/expand_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class ExpandOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+ auto x_dims = ctx->GetInputDim("X");
+
+    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims.size()), expand_times.size(),
+                      "The number of values in Attr(expand_times) must be "
+                      "equal to the rank of Input(X).");
+ PADDLE_ENFORCE_LE(x_dims.size(), 6,
+ "The rank of Input(X) must not be greater than 6.");
+
+    std::vector<int64_t> out_shape(x_dims.size());
+ for (size_t i = 0; i < expand_times.size(); ++i) {
+ PADDLE_ENFORCE_GE(expand_times[i], 1,
+ "Each value of Attr(expand_times) should not be "
+ "less than 1.");
+ out_shape[i] = x_dims[i] * expand_times[i];
+ }
+
+ ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+ if (out_shape[0] == x_dims[0]) {
+ ctx->ShareLoD("X", "Out");
+ }
+ }
+};
+
+class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]. "
+             "X is the input tensor to be expanded.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]. "
+              "The rank of Output(Out) is the same as that of Input(X); each "
+              "dimension size of Output(Out) equals the corresponding "
+              "dimension size of Input(X) multiplied by the corresponding "
+              "value of Attr(expand_times).");
+    AddAttr<std::vector<int>>("expand_times",
+                              "Expand times number for each dimension.");
+ AddComment(R"DOC(
+Expand operator tiles the input by the given number of times. You set the
+number of repetitions for each dimension through the attribute 'expand_times'.
+The rank of X should be in [1, 6], and the size of 'expand_times' must match
+the rank of X. A usage example:
+
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+
+ [
+ [[1], [2], [3]],
+ [[4], [5], [6]]
+ ]
+
+Attr(expand_times): [1, 2, 2]
+
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+
+ [
+ [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+ [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+ ]
+
+)DOC");
+ }
+};
+
+class ExpandGradOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+ "Input(Out@GRAD) should not be null.");
+
+ auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+ auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+ for (size_t i = 0; i < expand_times.size(); ++i) {
+      PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i],
+                        "Each dimension size of Input(Out@GRAD) should be "
+                        "equal to the product of the corresponding dimension "
+                        "size of Input(X) and the Attr(expand_times) value.");
+ }
+
+ auto x_grad_name = framework::GradVarName("X");
+
+ if (ctx->HasOutput(x_grad_name)) {
+ ctx->SetOutputDim(x_grad_name, x_dims);
+ }
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
+ ops::ExpandGradOp);
+REGISTER_OP_CPU_KERNEL(expand,
+                       ops::ExpandKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    expand_grad, ops::ExpandGradKernel<paddle::platform::CPUPlace, float>);
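The shape rule ExpandOp::InferShape enforces is out_shape[i] = x_dims[i] *
expand_times[i], with the two ranks equal and every repeat count at least 1.
A self-contained sketch of the same rule (expand_shape is a hypothetical
helper for illustration):

    #include <cassert>
    #include <vector>

    std::vector<int> expand_shape(const std::vector<int>& x_dims,
                                  const std::vector<int>& expand_times) {
      assert(x_dims.size() == expand_times.size());  // ranks must match
      std::vector<int> out(x_dims.size());
      for (size_t i = 0; i < x_dims.size(); ++i) {
        assert(expand_times[i] >= 1);  // each repeat count is at least 1
        out[i] = x_dims[i] * expand_times[i];
      }
      return out;
    }

    int main() {
      // Matches the DOC example: [2, 3, 1] tiled by [1, 2, 2] -> [2, 6, 2].
      assert(expand_shape({2, 3, 1}, {1, 2, 2}) == (std::vector<int>{2, 6, 2}));
    }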
diff --git a/paddle/operators/fill_constant_op.cu b/paddle/operators/expand_op.cu
similarity index 65%
rename from paddle/operators/fill_constant_op.cu
rename to paddle/operators/expand_op.cu
index bca402a8b988b570a083e9ce253342304f4b8946..6744562b6c21dd8bfeb7e4cb6b809dc7913aa3a5 100644
--- a/paddle/operators/fill_constant_op.cu
+++ b/paddle/operators/expand_op.cu
@@ -13,12 +13,11 @@
limitations under the License. */
#define EIGEN_USE_GPU
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/fill_constant_op.h"
+
+#include "paddle/operators/expand_op.h"
namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(expand,
+                       ops::ExpandKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
-    fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int64_t>);
+    expand_grad, ops::ExpandGradKernel<paddle::platform::GPUPlace, float>);
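The ExpandKernel defined in expand_op.h below ultimately reduces to one Eigen
broadcast per rank; the BOOST_PP macros only generate the switch cases that
instantiate the right compile-time rank. A minimal sketch of that broadcast
using Eigen's unsupported Tensor module directly (assumes a standalone Eigen
installation; this is not Paddle code):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 2, Eigen::RowMajor> x(2, 3);
      x.setValues({{1, 2, 3}, {4, 5, 6}});
      // bcast_dims holds the per-dimension repeat counts (expand_times).
      Eigen::DSizes<Eigen::DenseIndex, 2> bcast_dims(1, 2);
      Eigen::Tensor<float, 2, Eigen::RowMajor> y = x.broadcast(bcast_dims);
      std::cout << y.dimension(0) << " x " << y.dimension(1) << "\n";  // 2 x 6
    }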
diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ae2c11a5d31dafc1b90d129054ebfabfb761bfe
--- /dev/null
+++ b/paddle/operators/expand_op.h
@@ -0,0 +1,172 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ You may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include <iostream>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+#define EXPAND_TEMPLATE(z, n, data) \
+ case n + 1: { \
+    Expand<n + 1>(context); \
+ break; \
+ }
+#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~)
+#define COND(n) \
+ BOOST_PP_GREATER_EQUAL(BOOST_PP_DIV(n, MAX_RANK_SUPPORTED), \
+ BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
+#define EXPAND_GRAD_CASE(n) \
+ case n: { \
+    ExpandBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+ break; \
+ }
+#define EXPAND_GRAD_TEMPLATE(z, n, data) \
+ BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), )
+#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class ExpandKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+ switch (rank) {
+ REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED)
+ default:
+ PADDLE_ENFORCE(false,
+ "Only support tensor with rank being between 1 and 6.");
+ }
+ }
+
+ protected:
+  template <int Rank>
+ void Expand(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+ auto x_dims = in0->dims();
+ for (size_t i = 0; i < expand_times.size(); ++i) {
+ bcast_dims[i] = expand_times[i];
+ }
+    auto x = EigenTensor<T, Rank>::From(*in0);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0);
+    auto place = context.GetEigenDevice<Place>();
+ y.device(place) = x.broadcast(bcast_dims);
+ }
+};
+
+template <typename Place, typename T>
+class ExpandGradKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+ auto* in0 = context.Input