diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ae1857642e8df4b3859eec68a3a5227d1c4fcb3
--- /dev/null
+++ b/benchmark/paddle/image/resnet.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 64)
+layer_num = get_config_arg("layer_num", int, 50)
+is_test = get_config_arg("is_test", bool, False)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+
+
+#######################Network Configuration #############
+def conv_bn_layer(name,
+                  input,
+                  filter_size,
+                  num_filters,
+                  stride,
+                  padding,
+                  channels=None,
+                  active_type=ReluActivation()):
+    """
+    A wrapper for conv layer with batch normalization layers.
+    Note:
+    conv layer has no activation.
+    """
+
+    tmp = img_conv_layer(
+        name=name + "_conv",
+        input=input,
+        filter_size=filter_size,
+        num_channels=channels,
+        num_filters=num_filters,
+        stride=stride,
+        padding=padding,
+        act=LinearActivation(),
+        bias_attr=False)
+    return batch_norm_layer(
+        name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
+
+
+def bottleneck_block(name, input, num_filters1, num_filters2):
+    """
+    A wrapper for bottlenect building block in ResNet.
+    Last conv_bn_layer has no activation.
+    Addto layer has activation of relu.
+    """
+    last_name = conv_bn_layer(
+        name=name + '_branch2a',
+        input=input,
+        filter_size=1,
+        num_filters=num_filters1,
+        stride=1,
+        padding=0)
+    last_name = conv_bn_layer(
+        name=name + '_branch2b',
+        input=last_name,
+        filter_size=3,
+        num_filters=num_filters1,
+        stride=1,
+        padding=1)
+    last_name = conv_bn_layer(
+        name=name + '_branch2c',
+        input=last_name,
+        filter_size=1,
+        num_filters=num_filters2,
+        stride=1,
+        padding=0,
+        active_type=LinearActivation())
+
+    return addto_layer(
+        name=name + "_addto", input=[input, last_name], act=ReluActivation())
+
+
+def mid_projection(name, input, num_filters1, num_filters2, stride=2):
+    """
+    A wrapper for middile projection in ResNet.
+    projection shortcuts are used for increasing dimensions,
+    and other shortcuts are identity
+    branch1: projection shortcuts are used for increasing
+    dimensions, has no activation.
+    branch2x: bottleneck building block, shortcuts are identity.
+    """
+    # stride = 2
+    branch1 = conv_bn_layer(
+        name=name + '_branch1',
+        input=input,
+        filter_size=1,
+        num_filters=num_filters2,
+        stride=stride,
+        padding=0,
+        active_type=LinearActivation())
+
+    last_name = conv_bn_layer(
+        name=name + '_branch2a',
+        input=input,
+        filter_size=1,
+        num_filters=num_filters1,
+        stride=stride,
+        padding=0)
+    last_name = conv_bn_layer(
+        name=name + '_branch2b',
+        input=last_name,
+        filter_size=3,
+        num_filters=num_filters1,
+        stride=1,
+        padding=1)
+
+    last_name = conv_bn_layer(
+        name=name + '_branch2c',
+        input=last_name,
+        filter_size=1,
+        num_filters=num_filters2,
+        stride=1,
+        padding=0,
+        active_type=LinearActivation())
+
+    return addto_layer(
+        name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
+
+
+img = data_layer(name='image', size=height * width * 3)
+
+
+def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
+    """
+    A wrapper for 50,101,152 layers of ResNet.
+    res2_num: number of blocks stacked in conv2_x
+    res3_num: number of blocks stacked in conv3_x
+    res4_num: number of blocks stacked in conv4_x
+    res5_num: number of blocks stacked in conv5_x
+    """
+    # For ImageNet
+    # conv1: 112x112
+    tmp = conv_bn_layer(
+        "conv1",
+        input=img,
+        filter_size=7,
+        channels=3,
+        num_filters=64,
+        stride=2,
+        padding=3)
+    tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
+
+    # conv2_x: 56x56
+    tmp = mid_projection(
+        name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
+    for i in xrange(2, res2_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
+
+    # conv3_x: 28x28
+    tmp = mid_projection(
+        name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
+    for i in xrange(2, res3_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res3_" + str(i),
+            input=tmp,
+            num_filters1=128,
+            num_filters2=512)
+
+    # conv4_x: 14x14
+    tmp = mid_projection(
+        name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
+    for i in xrange(2, res4_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res4_" + str(i),
+            input=tmp,
+            num_filters1=256,
+            num_filters2=1024)
+
+    # conv5_x: 7x7
+    tmp = mid_projection(
+        name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
+    for i in xrange(2, res5_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res5_" + str(i),
+            input=tmp,
+            num_filters1=512,
+            num_filters2=2048)
+
+    tmp = img_pool_layer(
+        name='avgpool',
+        input=tmp,
+        pool_size=7,
+        stride=1,
+        pool_type=AvgPooling())
+
+    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
+
+
+if layer_num == 50:
+    resnet = deep_res_net(3, 4, 6, 3)
+elif layer_num == 101:
+    resnet = deep_res_net(3, 4, 23, 3)
+elif layer_num == 152:
+    resnet = deep_res_net(3, 8, 36, 3)
+else:
+    print("Wrong layer number.")
+
+lbl = data_layer(name="label", size=num_class)
+loss = cross_entropy(name='loss', input=resnet, label=lbl)
+inputs(img, lbl)
+outputs(loss)
diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh
index e31fec1cd850157d90ddcab2d559d52381ecd317..a4527e04968cf8c8c3c31d16f50bc3e28381f6d8 100755
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
@@ -5,22 +5,23 @@ function train() {
   export OMP_DYNAMIC="FALSE"
   export KMP_AFFINITY="granularity=fine,compact,0,0"
   topology=$1
-  bs=$2
-  use_mkldnn=$3
-  if [ $3 == "True" ]; then
+  layer_num=$2
+  bs=$3
+  use_mkldnn=$4
+  if [ $4 == "True" ]; then
     thread=1
-    log="logs/${topology}-mkldnn-${bs}.log"
-  elif [ $3 == "False" ]; then
+    log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
+  elif [ $4 == "False" ]; then
     thread=`nproc`
     # each trainer_count use only 1 core to avoid conflict
     export OMP_NUM_THREADS=1
     export MKL_NUM_THREADS=1
-    log="logs/${topology}-${thread}mklml-${bs}.log"
+    log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
   else
     echo "Wrong input $3, use True or False."
     exit 0
   fi
-  args="batch_size=${bs}"
+  args="batch_size=${bs},layer_num=${layer_num}"
   config="${topology}.py"
   paddle train --job=time \
     --config=$config \
@@ -40,12 +41,9 @@ if [ ! -d "logs" ]; then
   mkdir logs
 fi
 
-#========== mkldnn ==========#
-train vgg 64 True
-train vgg 128 True
-train vgg 256 True
-
-#========== mklml ===========#
-train vgg 64 False
-train vgg 128 False
-train vgg 256 False
+for use_mkldnn in True False; do
+  for batchsize in 64 128 256; do
+    train vgg 19 $batchsize $use_mkldnn
+    train resnet 50  $batchsize $use_mkldnn
+  done
+done
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
index b8429975f5c83df6996e71478fe276b246e8b77b..420884ed8e1ae36a3f1772bfbe8323f3d0ea71e6 100644
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -13,7 +13,7 @@ define_py_data_sources2(
 
 settings(
     batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
+    learning_rate=0.001 / batch_size,
     learning_method=MomentumOptimizer(0.9),
     regularization=L2Regularization(0.0005 * batch_size))
 
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 9686df00219001769d074ee815d9cc8db0258496..5a06825beb73e85d8a55b7b578b187bee2c4340c 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -46,16 +46,20 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
     MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
 ENDIF()
 
+SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
+SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
 ExternalProject_Add(
     ${MKLDNN_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLDNN_DEPENDS}
     GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "v0.10"
+    GIT_TAG             "v0.11"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
     CMAKE_ARGS          -DMKLROOT=${MKLDNN_MKLROOT}
+    CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
+    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
                         -DMKLROOT:PATH=${MKLDNN_MKLROOT}
 )
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 74f3279831357c21038df133df0f5a432a6dfd20..20dbc32a738d982df2d3f035206279c82c8de264 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -27,8 +27,8 @@ ENDIF()
 INCLUDE(ExternalProject)
 
 SET(MKLML_PROJECT       "extern_mklml")
-SET(MKLML_VER           "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
+SET(MKLML_VER           "mklml_lnx_2018.0.1.20171007")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst
index fef87c4fbdb452771ecdb361c6eeae5b32bcee14..b56c7332cc284649c7e04328e51a7faa78593a39 100644
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@@ -2,112 +2,9 @@
 Data Reader Interface and DataSets
 ==================================
 
+..  toctree::
+    :maxdepth: 1
 
-DataTypes
-=========
-
-..  automodule:: paddle.v2.data_type
-    :members:
-    :noindex:
-
-DataFeeder
-==========
-
-..  automodule:: paddle.v2.data_feeder
-    :members:
-    :noindex:
-
-Reader
-======
-
-..  automodule:: paddle.v2.reader
-    :members:
-    :noindex:
-
-..  automodule:: paddle.v2.reader.creator
-    :members:
-    :noindex:
-
-minibatch
-=========
-
-..  automodule:: paddle.v2.minibatch
-    :members:
-    :noindex:
-
-Dataset
-=======
-
-..  automodule:: paddle.v2.dataset
-    :members:
-    :noindex:
-
-mnist
-+++++
-
-..  automodule:: paddle.v2.dataset.mnist
-    :members:
-    :noindex:
-
-cifar
-+++++
-
-..  automodule:: paddle.v2.dataset.cifar
-    :members:
-    :noindex:
-
-conll05
-+++++++
-
-..  automodule:: paddle.v2.dataset.conll05
-    :members: get_dict,get_embedding,test
-    :noindex:
-
-imdb
-++++
-
-..  automodule:: paddle.v2.dataset.imdb
-    :members:
-    :noindex:
-
-imikolov
-++++++++
-
-..  automodule:: paddle.v2.dataset.imikolov
-    :members:
-    :noindex:
-
-movielens
-+++++++++
-
-..  automodule:: paddle.v2.dataset.movielens
-    :members:
-    :noindex:
-
-..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
-    :noindex:
-    
-..  autoclass:: paddle.v2.dataset.movielens.UserInfo
-    :noindex:
-
-sentiment
-+++++++++
-
-..  automodule:: paddle.v2.dataset.sentiment
-    :members:
-    :noindex:
-
-uci_housing
-+++++++++++
-
-..  automodule:: paddle.v2.dataset.uci_housing
-    :members:
-    :noindex:
-
-wmt14
-+++++
-
-..  automodule:: paddle.v2.dataset.wmt14
-    :members:
-    :noindex:
-
+    data/data_reader.rst
+    data/image.rst
+    data/dataset.rst
diff --git a/doc/api/v2/data/data_reader.rst b/doc/api/v2/data/data_reader.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2ccfec9c284877a7576e9751526b169a4ac78d8e
--- /dev/null
+++ b/doc/api/v2/data/data_reader.rst
@@ -0,0 +1,36 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+..  automodule:: paddle.v2.data_type
+    :members:
+    :noindex:
+
+DataFeeder
+==========
+
+..  automodule:: paddle.v2.data_feeder
+    :members:
+    :noindex:
+
+Reader
+======
+
+..  automodule:: paddle.v2.reader
+    :members:
+    :noindex:
+
+..  automodule:: paddle.v2.reader.creator
+    :members:
+    :noindex:
+
+minibatch
+=========
+
+..  automodule:: paddle.v2.minibatch
+    :members:
+    :noindex:
diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6a8ecc5bb1d855e0ded3719943ab3adb810de365
--- /dev/null
+++ b/doc/api/v2/data/dataset.rst
@@ -0,0 +1,75 @@
+Dataset
+=======
+
+..  automodule:: paddle.v2.dataset
+    :members:
+    :noindex:
+
+mnist
++++++
+
+..  automodule:: paddle.v2.dataset.mnist
+    :members:
+    :noindex:
+
+cifar
++++++
+
+..  automodule:: paddle.v2.dataset.cifar
+    :members:
+    :noindex:
+
+conll05
++++++++
+
+..  automodule:: paddle.v2.dataset.conll05
+    :members: get_dict,get_embedding,test
+    :noindex:
+
+imdb
+++++
+
+..  automodule:: paddle.v2.dataset.imdb
+    :members:
+    :noindex:
+
+imikolov
+++++++++
+
+..  automodule:: paddle.v2.dataset.imikolov
+    :members:
+    :noindex:
+
+movielens
++++++++++
+
+..  automodule:: paddle.v2.dataset.movielens
+    :members:
+    :noindex:
+
+..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
+    :noindex:
+    
+..  autoclass:: paddle.v2.dataset.movielens.UserInfo
+    :noindex:
+
+sentiment
++++++++++
+
+..  automodule:: paddle.v2.dataset.sentiment
+    :members:
+    :noindex:
+
+uci_housing
++++++++++++
+
+..  automodule:: paddle.v2.dataset.uci_housing
+    :members:
+    :noindex:
+
+wmt14
++++++
+
+..  automodule:: paddle.v2.dataset.wmt14
+    :members:
+    :noindex:
diff --git a/doc/api/v2/data/image.rst b/doc/api/v2/data/image.rst
new file mode 100644
index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49
--- /dev/null
+++ b/doc/api/v2/data/image.rst
@@ -0,0 +1,5 @@
+Image Interface
+===============
+
+..  automodule:: paddle.v2.image
+    :members:
diff --git a/doc/design/float16.md b/doc/design/float16.md
new file mode 100644
index 0000000000000000000000000000000000000000..078801ba2ed969d26dd31d5ec4ed268686cf7016
--- /dev/null
+++ b/doc/design/float16.md
@@ -0,0 +1,60 @@
+# Design Doc: float16
+
+## Why float16
+Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range. 
+
+When high precision computation is not required, using float16 data type could potentially 
+
+- reduce storage space, memory bandwidth, and power usages; 
+- increase the chance of data fitting into a smaller cache of lower latency; 
+- provide arithmetic speed up if supported by hardware. 
+
+## Survey of current float16 support
+A brief survey of float16 support on different compilers, hardwares, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info.
+
+The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernel. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier. 
+
+### Compiler
+- nvcc supports `__half` data type after CUDA 7.5.
+- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4.
+- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9.
+
+### Hardware
+- `__half` is supported on GPU with compute capability >= 5.3.
+- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above.
+- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which is announced in May 2017. There seems to be no application processors currently available on market that adopts this architecture. It is reported that Qualcomm Snapdragon 845 uses Cortex-A75 design and will be available in mobile devices in early 2018).
+
+### Libraries
+- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
+- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
+
+
+## Implementation
+The float16 class holds a 16-bit `uint16_t` data internally.
+```
+struct float16 {
+  uint16_t x;
+};
+``` 
+
+float16 supports the following features:
+  - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double. 
+  - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen.
+  - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen. 
+  - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware. 
+  
+To support the above features, two fundamental conversion functions are provided:
+```
+float16 float_to_half_rn(float f);  // convert to half precision in round-to-nearest-even mode
+float half_to_float(float16 h);
+```
+which provides one-to-one conversion between float32 and float16. These twos functions will do different conversion routines based on the current hardware. CUDA/ARM instrinsics will be used when the corresonding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion.
+
+## To do
+After float16 class is available, some of the future items are below:
+
+- Update pybind/tensor_py.h to bind c++ float16 with numpy float16. 
+
+- Modify `GetKernelType()` method in `framework/operator.h` to make it compatible with float16.
+
+- Create a type-casting operator that can convert the data type in tensor between float16 and other types.
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 4e8d630c2634682ff63b38182108eadebb5c7ff9..d485cdf6109274377ad0057223bdd8401e964aa7 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -21,7 +21,7 @@
 #include "paddle/framework/var_desc.h"
 #include "paddle/operators/net_op.h"
 
-USE_OP(fill_constant);
+USE_NO_KERNEL_OP(fill_constant);
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index c5ae7b185460c8b0d68ba38bb9db9bd3d3fb14ea..3ec88d7a72c3339bf5e7d0ca3957a3f608f039b7 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -34,6 +34,21 @@ inline DataType ToDataType(std::type_index type) {
   }
 }
 
+inline std::type_index ToTypeIndex(DataType type) {
+  switch (type) {
+    case DataType::FP32:
+      return typeid(float);
+    case DataType::FP64:
+      return typeid(double);
+    case DataType::INT32:
+      return typeid(int);
+    case DataType::INT64:
+      return typeid(int64_t);
+    default:
+      PADDLE_THROW("Not support type %d", type);
+  }
+}
+
 template <typename Visitor>
 inline void VisitDataType(DataType type, Visitor visitor) {
   switch (type) {
diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index 239ae5e1233c7f5c506930df374b5d0cc8de7c8d..53b899a23997b71e723a298ec360a4e018d89878 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -79,6 +79,13 @@ DDim make_ddim(const std::vector<int64_t>& dims) {
   return result;
 }
 
+DDim make_ddim(const std::vector<int>& dims) {
+  std::vector<int64_t> res(dims.size());
+  std::transform(dims.begin(), dims.end(), res.begin(),
+                 [](int d) { return static_cast<int64_t>(d); });
+  return make_ddim(res);
+}
+
 /// @cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
@@ -117,7 +124,7 @@ int64_t DDim::operator[](int idx) const {
   return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }
 
-int64_t DDim::size() const { return arity(*this); }
+int DDim::size() const { return arity(*this); }
 
 bool DDim::operator==(DDim d) const {
   if (var.which() != d.getVar().which()) {
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index 2a5e2d2b6948b045642dbac5e83992a048ecb63d..4ca5e49566b7ec006eba80f3f9808bacb1ff2615 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -71,7 +71,7 @@ struct DDim {
 
   DDim operator*(DDim d) const;
 
-  int64_t size() const;
+  int size() const;
 };
 
 /**
@@ -81,6 +81,8 @@ struct DDim {
  */
 DDim make_ddim(const std::vector<int64_t>& dims);
 
+DDim make_ddim(const std::vector<int>& dims);
+
 /**
  * \brief Make a DDim from an initializer list
  *
diff --git a/paddle/framework/lod_rank_table.cc b/paddle/framework/lod_rank_table.cc
index 68a83def7e5a12fe3261be9e27cbb9bb54e1f8ad..1c2fba70c8ab0827ba6d1563f08cd0820650822e 100644
--- a/paddle/framework/lod_rank_table.cc
+++ b/paddle/framework/lod_rank_table.cc
@@ -31,6 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
     TableItem item;
     item.index = i;
     item.length = vec[i + 1] - vec[i];
+    VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
     items_.emplace_back(item);
   }
   // NOTE(yuyang18):
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 2bcfffb134f46416301b28043e1875e822fbc3e4..a0f2906c749054c1ff9f624e47df432ec2bd6ac8 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -27,6 +27,20 @@
 namespace paddle {
 namespace framework {
 
+std::ostream& operator<<(std::ostream& os, const LoD& lod) {
+  os << "{";
+  for (auto& v : lod) {
+    os << "{";
+    for (auto& i : v) {
+      os << i << ",";
+    }
+    os << "}";
+  }
+  os << "}";
+
+  return os;
+}
+
 LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
   LoD new_lod;
   new_lod.reserve(level_end - level_begin);
@@ -136,37 +150,35 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
   ShareDataWith(Slice(begin, end));
 }
 
-void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
-                             std::vector<std::vector<size_t>>* lod_length,
-                             size_t* start_offset) {
-  lod_length->clear();
-  PADDLE_ENFORCE(start_idx < lod.size() - 1,
-                 "start_idx should be >= 0 and < lod.size() - 1.");
-  PADDLE_ENFORCE(end_idx < lod.size(),
-                 "end_idx should be >= 0 and < lod.size().");
-  PADDLE_ENFORCE_LE(start_idx, end_idx,
-                    "start_idx should be less than end_idx.");
-  for (size_t level_idx = 0; level_idx < lod.size(); ++level_idx) {
+using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
+                                        size_t end_idx, size_t start_level) {
+  LoD sub_lod;
+
+  for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
+    PADDLE_ENFORCE_LE(start_idx, end_idx);
+    PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
     std::vector<size_t> level_lens;
     for (size_t i = start_idx; i < end_idx; ++i) {
       level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
     }
-    lod_length->emplace_back(level_lens);
+    sub_lod.emplace_back(level_lens);
     start_idx = lod[level_idx][start_idx];
     end_idx = lod[level_idx][end_idx];
   }
-  *start_offset = start_idx;
+
+  return LoDAndOffset{sub_lod, {start_idx, end_idx}};
 }
 
-void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length) {
-  PADDLE_ENFORCE_EQ(
-      lod->size(), lod_length.size(),
+void AppendLoD(LoD* lod, const LoD& lod_length) {
+  PADDLE_ENFORCE(
+      lod->empty() || lod->size() == lod_length.size(),
       "The lod_length should has the same size with the appended lod.");
+  if (lod->empty()) {
+    *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
+  }
   for (size_t i = 0; i < lod->size(); ++i) {
     auto& level = (*lod)[i];
-    if (level.empty()) {
-      level.push_back(0);
-    }
     for (size_t len : lod_length[i]) {
       level.push_back(level.back() + len);
     }
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 1437da399a28288429527f9672ace0482825159f..7f8a51cc581e759bc707e506ac7cdeb3680f40ac 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -56,6 +56,8 @@ using Vector = thrust::host_vector<
  */
 using LoD = std::vector<Vector<size_t>>;
 
+std::ostream& operator<<(std::ostream& os, const LoD& lod);
+
 /*
  * Slice levels from a LoD.
  * NOTE the lowest level should always be the absolute offsets of the underlying
@@ -181,11 +183,10 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
   return tensor;
 }
 
-void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
-                             std::vector<std::vector<size_t>>* lod_length,
-                             size_t* start_offset);
+std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
+    const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);
 
-void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length);
+void AppendLoD(LoD* lod, const LoD& lod_length);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index bf61c9ee7aa99b06e78b9b27dff72d216c74a86e..02d84b68233f2fdfc66e1df2fc7ce20307cadd94 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -146,43 +146,44 @@ TEST(LodExpand, test) {
 
 TEST(LoD, GetFineGrainedLoDLength) {
   LoD lod;
-  lod.push_back(std::vector<size_t>{0, 2, 4, 5});
-  lod.push_back(std::vector<size_t>{0, 1, 6, 8, 10, 11});
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
   lod.push_back(
-      std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29});
+      std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29}));
 
-  std::vector<std::vector<size_t>> lod_length;
-  size_t start_offset;
-  paddle::framework::GetFineGrainedLoDLength(lod, 1, 2, &lod_length,
-                                             &start_offset);
+  auto lod_and_offset =
+      paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0);
+  LoD lod_length = lod_and_offset.first;
+  size_t start_offset = lod_and_offset.second.first;
+  size_t end_offset = lod_and_offset.second.second;
 
-  std::vector<std::vector<size_t>> expected;
+  LoD expected;
   expected.push_back(std::vector<size_t>{2});
   expected.push_back(std::vector<size_t>{2, 2});
   expected.push_back(std::vector<size_t>{2, 3, 4, 2});
   EXPECT_EQ(lod_length, expected);
   EXPECT_EQ(start_offset, 15UL);
+  EXPECT_EQ(end_offset, 26UL);
 }
 
 TEST(LoD, AppendLoD) {
-  std::vector<std::vector<size_t>> lod_lens;
-  lod_lens.push_back(std::vector<size_t>{2});
-  lod_lens.push_back(std::vector<size_t>{2, 2});
-  lod_lens.push_back(std::vector<size_t>{2, 3, 4, 2});
+  LoD lod_lens;
+  lod_lens.push_back(std::vector<size_t>({2}));
+  lod_lens.push_back(std::vector<size_t>({2, 2}));
+  lod_lens.push_back(std::vector<size_t>({2, 3, 4, 2}));
 
   LoD origin;
-  origin.push_back(std::vector<size_t>{0, 2});
-  origin.push_back(std::vector<size_t>{0, 1, 6});
-  origin.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15});
+  origin.push_back(std::vector<size_t>({0, 2}));
+  origin.push_back(std::vector<size_t>({0, 1, 6}));
+  origin.push_back(std::vector<size_t>({0, 2, 5, 7, 10, 12, 15}));
 
   paddle::framework::AppendLoD(&origin, lod_lens);
 
   LoD expected;
-  expected.push_back(std::vector<size_t>{0, 2, 4});
-  expected.push_back(std::vector<size_t>{0, 1, 6, 8, 10});
+  expected.push_back(std::vector<size_t>({0, 2, 4}));
+  expected.push_back(std::vector<size_t>({0, 1, 6, 8, 10}));
   expected.push_back(
-      std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26});
-
+      std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26}));
   EXPECT_EQ(origin, expected);
 }
 
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 2bb5e0e8ec29fb2df81549650aa0c65bc1e51c49..daade439e5232f06be72bc5bb1e2285124f2c3a4 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -92,8 +92,7 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
 
   void operator()(const char* op_type) const {
     using T = typename KERNEL_TYPE::ELEMENT_TYPE;
-    OperatorWithKernel::OpKernelKey key(ToDataType(std::type_index(typeid(T))),
-                                        PlaceType());
+    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType());
     OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
 
     constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 22a7d9728a05950d66a1acd23d6fb18263f4ff6b..3276f8af396fe58450a8dc6713fe61e49d5ca708 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -254,8 +254,7 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
   return res;
 }
 
-std::ostream& operator<<(std::ostream& os,
-                         const OperatorWithKernel::OpKernelKey& kernel_key) {
+std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) {
   os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_
      << "]";
   return os;
@@ -432,7 +431,7 @@ void OperatorWithKernel::Run(const Scope& scope,
 
   // check if op[type] have kernel for kernel_key
   OpKernelMap& kernels = kernels_iter->second;
-  auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx);
+  auto kernel_key = GetKernelType(ctx);
   auto kernel_iter = kernels.find(kernel_key);
 
   if (kernel_iter == kernels.end()) {
@@ -440,6 +439,41 @@ void OperatorWithKernel::Run(const Scope& scope,
   }
 
   kernel_iter->second->Compute(ctx);
+
+  // throws errors if have.
+  dev_ctx.Finish();
+}
+OpKernelType OperatorWithKernel::GetKernelType(
+    const ExecutionContext& ctx) const {
+  return OpKernelType(IndicateDataType(ctx), ctx.device_context());
+}
+DataType OperatorWithKernel::IndicateDataType(
+    const ExecutionContext& ctx) const {
+  auto& scope = ctx.scope();
+  int data_type = -1;
+  for (auto& input : this->inputs_) {
+    for (auto& ipt_name : input.second) {
+      auto* var = scope.FindVar(ipt_name);
+      if (var != nullptr) {
+        const Tensor* t = nullptr;
+        if (var->IsType<Tensor>()) {
+          t = &var->Get<Tensor>();
+        } else if (var->IsType<LoDTensor>()) {
+          t = &var->Get<LoDTensor>();
+        } else if (var->IsType<SelectedRows>()) {
+          t = &(var->Get<SelectedRows>().value());
+        }
+        if (t != nullptr) {
+          int tmp = static_cast<int>(ToDataType(t->type()));
+          PADDLE_ENFORCE(tmp == data_type || data_type == -1,
+                         "DataType of Paddle Op %s must be the same.", Type());
+          data_type = tmp;
+        }
+      }
+    }
+  }
+  PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
+  return static_cast<DataType>(data_type);
 }
 
 }  // namespace framework
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index a1303a90980b40ff03bce1ab1a6f67bbbf952bcf..60861d92933dd100f877bec8d43f9b924f951e60 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -345,27 +345,10 @@ class OpKernel : public OpKernelBase {
   using ELEMENT_TYPE = T;
 };
 
-class OperatorWithKernel : public OperatorBase {
- public:
-  struct OpKernelKey {
-    platform::Place place_;
-    DataType data_type_;
-
-    OpKernelKey(DataType data_type, platform::Place place)
-        : place_(place), data_type_(data_type) {}
-
-    OpKernelKey(DataType data_type, const platform::DeviceContext& dev_ctx)
-        : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
-
-    bool operator==(const OpKernelKey& o) const {
-      return platform::places_are_same_class(place_, o.place_) &&
-             data_type_ == o.data_type_;
-    }
-  };
-
-  struct OpKernelHash {
+struct OpKernelType {
+  struct Hash {
     std::hash<int> hash_;
-    size_t operator()(const OpKernelKey& key) const {
+    size_t operator()(const OpKernelType& key) const {
       int place = key.place_.which();
       int data_type = static_cast<int>(key.data_type_);
       int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
@@ -374,9 +357,26 @@ class OperatorWithKernel : public OperatorBase {
     }
   };
 
+  platform::Place place_;
+  DataType data_type_;
+
+  OpKernelType(DataType data_type, platform::Place place)
+      : place_(place), data_type_(data_type) {}
+
+  OpKernelType(DataType data_type, const platform::DeviceContext& dev_ctx)
+      : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
+
+  bool operator==(const OpKernelType& o) const {
+    return platform::places_are_same_class(place_, o.place_) &&
+           data_type_ == o.data_type_;
+  }
+};
+
+class OperatorWithKernel : public OperatorBase {
+ public:
   using OpKernelMap =
-      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernelBase>,
-                         OpKernelHash>;
+      std::unordered_map<OpKernelType, std::unique_ptr<OpKernelBase>,
+                         OpKernelType::Hash>;
 
   OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
                      const VariableNameMap& outputs, const AttributeMap& attrs)
@@ -404,40 +404,15 @@ class OperatorWithKernel : public OperatorBase {
   }
 
  protected:
+  virtual OpKernelType GetKernelType(const ExecutionContext& ctx) const;
+
+ private:
   // indicate kernel DataType by input data. Defaultly all input data must be
   // same.
-  virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
-    auto& scope = ctx.scope();
-    int data_type = -1;
-    for (auto& input : this->inputs_) {
-      for (auto& ipt_name : input.second) {
-        auto* var = scope.FindVar(ipt_name);
-        if (var != nullptr) {
-          const Tensor* t = nullptr;
-          if (var->IsType<Tensor>()) {
-            t = &var->Get<Tensor>();
-          } else if (var->IsType<LoDTensor>()) {
-            t = &var->Get<LoDTensor>();
-          } else if (var->IsType<SelectedRows>()) {
-            t = &(var->Get<SelectedRows>().value());
-          }
-          if (t != nullptr) {
-            int tmp = static_cast<int>(ToDataType(t->type()));
-            PADDLE_ENFORCE(tmp == data_type || data_type == -1,
-                           "DataType of Paddle Op %s must be the same.",
-                           Type());
-            data_type = tmp;
-          }
-        }
-      }
-    }
-    PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
-    return static_cast<DataType>(data_type);
-  }
+  DataType IndicateDataType(const ExecutionContext& ctx) const;
 };
 
-std::ostream& operator<<(std::ostream& os,
-                         const OperatorWithKernel::OpKernelKey& kernel_key);
+std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key);
 
 extern bool OpSupportGPU(const std::string& op_type);
 
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index 42e0d52eed3911d8e684e76a88bc690ca0783ce5..1e19f82b341768142258ba4a5dfa246d87ba4c43 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -114,8 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {}
-  DataType IndicateDataType(const ExecutionContext& ctx) const override {
-    return DataType::FP32;
+  OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
+    return OpKernelType(DataType::FP32, ctx.device_context());
   }
 };
 
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index d78a2c4c21149ef3c800991b9a144ea198f1bdcf..7e88e039611007d17156d10f852eb46f3ee8e7a3 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -52,7 +52,7 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
 };
 
 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t> functor;
+  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool> functor;
   size_t size = functor(type);
   PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
   return size;
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 16aca192d41a64003de85ce45f7697bf45c556ed..0babec29f6f4412ed29deeafe24470e86b30a636 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -45,7 +45,8 @@ void VarDescBind::SetLoDLevel(int32_t lod_level) {
       desc_.mutable_tensor_array()->set_lod_level(lod_level);
       break;
     default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
+                   desc_.tensor_array().lod_level());
   }
 }
 
@@ -56,7 +57,8 @@ int32_t VarDescBind::GetLodLevel() const {
     case VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().lod_level();
     default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
+                   desc_.tensor_array().lod_level());
   }
 }
 
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 4fd72d64a90ae6f16dd1499ceb7fba6e40fe4cea..9b2779b42cad324253dadf27dbff20fd8e8c8e16 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -45,6 +45,7 @@ if(WITH_GPU)
     add_simple_unittest(BlockExpandOpTest)
     add_simple_unittest(CropOpTest)
     add_simple_unittest(SwitchOpTest)
+    add_simple_unittest(ScaleSubRegionOpTest)
 endif()
 
 add_simple_unittest(Im2ColTest)
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index ba446bf92da264fafa1fb47a2c30da9cb13176ce..370940532ef40335be54a3e6467de0409e923ec4 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -110,6 +110,7 @@ public:
         function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
     function1_->init(config);
     function2_->init(config);
+    initArgsCallback_ = nullptr;
   }
 
   ~Compare2Function() {}
@@ -170,6 +171,10 @@ public:
                                       *seq2_));
   }
 
+  void registerInitCallback(std::function<void(BufferArg&, size_t)> callback) {
+    initArgsCallback_ = callback;
+  }
+
   // output need only contains shape, do not contains data.
   void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
     size_t size =
@@ -340,6 +345,10 @@ protected:
         initArg(*func1Inputs_[i]);
       }
 
+      if (initArgsCallback_ != nullptr) {
+        initArgsCallback_(*func1Inputs_[i], i);
+      }
+
       copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
     }
   }
@@ -386,6 +395,7 @@ protected:
   std::shared_ptr<SequenceIdArg> seq1_;
   std::shared_ptr<SequenceIdArg> seq2_;
   test::CopyArgument<DType1, DType2> copyArg_;
+  std::function<void(BufferArg&, size_t)> initArgsCallback_;
 };
 
 class CpuGpuFuncCompare
diff --git a/paddle/function/ScaleSubRegionOp.cpp b/paddle/function/ScaleSubRegionOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a080505d7df83a6c0a9d88fbcb7863fc0e1f7b21
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "paddle/function/TensorShape.h"
+
+namespace paddle {
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_CPU>(real* outputs,
+                                     const real* inputs,
+                                     const real* indices,
+                                     const TensorShape shape,
+                                     const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
+
+  for (int n = 0; n < number; ++n) {
+    // indices start from 1
+    int offset = n * 6;
+    for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
+      for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
+        for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
+          int idx = ((n * channel + c) * height + h) * width + w;
+          outputs[idx] *= value;
+        }
+      }
+    }
+  }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
+                                         real* outGrad,
+                                         const real* indices,
+                                         const TensorShape shape,
+                                         const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  for (int n = 0; n < number; ++n) {
+    for (int c = 0; c < channel; ++c) {
+      for (int h = 0; h < height; ++h) {
+        for (int w = 0; w < width; ++w) {
+          int idx = ((n * channel + c) * height + h) * width + w;
+          int offset = n * 6;
+          if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+              h >= (indices[offset + 2] - 1) &&
+              h <= (indices[offset + 3] - 1) &&
+              w >= (indices[offset + 4] - 1) &&
+              w <= (indices[offset + 5] - 1)) {
+            outGrad[idx] += inGrad[idx] * value;
+          } else {
+            outGrad[idx] += inGrad[idx];
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief For each instance, ScaleSubRegion can be used to multiply a value to
+ *        a specified sub continuous region. By providing start index and end
+ *        index for C/H/W, you can specify the location and shape of the region.
+ *
+ * Argument in this Function:
+ * \param inputs    A 4-D tensor with shape [N, C, H, W], only one input.
+ * \param indices   A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs   A 4-D tensor with same shape as inputs, output value.
+ */
+template <DeviceType Device>
+class ScaleSubRegionFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegion<Device>(outputs[0].data<real>(),
+                           inputs[0].data<real>(),
+                           inputs[1].data<real>(),
+                           shape,
+                           conf_);
+  }
+
+private:
+  FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of ScaleSubRegion Function.
+ *
+ * Argument in this Function:
+ * \param inputs  A 4-D tensor with shape [N, C, H, W], output gradient.
+ * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value.
+ */
+
+template <DeviceType Device>
+class ScaleSubRegionGradFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegionGrad<Device>(inputs[0].data<real>(),
+                               outputs[0].data<real>(),
+                               inputs[1].data<real>(),
+                               shape,
+                               conf_);
+  }
+
+private:
+  FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOp.h b/paddle/function/ScaleSubRegionOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..0480c8577f3fbf3bc9e94b635df96a31b103e9e3
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief Function to multiply a value to values in specified sub continuous
+ *        region. Indices must be provided to indcate the location and shape of
+ *        the region and the multiplied value is passed by configure variable.
+ *
+ *
+ * \param[out] outputs  Output value.
+ * \param[in]  inputs   Input data which contains NCHW information.
+ * \param[in]  indices  Indices data to indcate the sub region.
+ * \param[in]  shape    Tensor shape of input value.
+ * \param[in]  conf     Configure variable which contains the multiplied value.
+ */
+template <DeviceType Device>
+void ScaleSubRegion(real* outputs,
+                    const real* inputs,
+                    const real* indices,
+                    const TensorShape shape,
+                    const FuncConfig& conf);
+
+/**
+ * \brief Backward propagation function of ScaleSubRegion.
+ *
+ * \param[out] inGrad   Gradients of previous layer.
+ * \param[in]  outGrad  Output gradient.
+ * \param[in]  indices  Indices data.
+ * \param[in]  shape    The Shape of input tensor.
+ * \param[in]  conf     Configure variable.
+ */
+template <DeviceType Device>
+void ScaleSubRegionGrad(const real* inGrad,
+                        real* outGrad,
+                        const real* indices,
+                        const TensorShape shape,
+                        const FuncConfig& conf);
+}  // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpGpu.cu b/paddle/function/ScaleSubRegionOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8aae2e44c3fdc8b516e66ecfd2e04f466a17dde9
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpGpu.cu
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "hl_base.h"
+
+namespace paddle {
+
+__global__ void KeScaleSubRegion(real* outputs,
+                                 const real* inputs,
+                                 const real* indices,
+                                 real value,
+                                 int channel,
+                                 int height,
+                                 int width,
+                                 int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % width;
+    const int h = (idx / width) % height;
+    const int c = (idx / width / height) % channel;
+    const int n = idx / width / height / channel;
+
+    const int offset = n * 6;
+    if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+        h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+        w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+      outputs[idx] = inputs[idx] * value;
+    } else {
+      outputs[idx] = inputs[idx];
+    }
+  }
+}
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_GPU>(real* outputs,
+                                     const real* inputs,
+                                     const real* indices,
+                                     const TensorShape shape,
+                                     const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  size_t nth = number * channel * height * width;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeScaleSubRegion<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      outputs, inputs, indices, value, channel, height, width, nth);
+  CHECK_SYNC("ScaleSubRegion");
+}
+
+__global__ void KeScaleSubRegionDiff(const real* inGrad,
+                                     real* outGrad,
+                                     const real* indices,
+                                     real value,
+                                     int channel,
+                                     int height,
+                                     int width,
+                                     int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % width;
+    const int h = (idx / width) % height;
+    const int c = (idx / width / height) % channel;
+    const int n = idx / width / height / channel;
+
+    const int offset = n * 6;
+    if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+        h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+        w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+      outGrad[idx] += inGrad[idx] * value;
+    } else {
+      outGrad[idx] += inGrad[idx];
+    }
+  }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_GPU>(const real* inGrad,
+                                         real* outGrad,
+                                         const real* indices,
+                                         const TensorShape shape,
+                                         const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  size_t nth = number * channel * height * width;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeScaleSubRegionDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      inGrad, outGrad, indices, value, channel, height, width, nth);
+  CHECK_SYNC("ScaleSubRegionGrad");
+}
+
+}  // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpTest.cpp b/paddle/function/ScaleSubRegionOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..43331f258dddaa43cbc8cc77519e299de7e98290
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpTest.cpp
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(ScaleSubRegion, real) {
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {5, 32}) {
+      for (size_t imgSizeH : {5, 33}) {
+        for (size_t imgSizeW : {5, 32}) {
+          for (real value : {-0.5, 0.0, 0.5}) {
+            for (bool firstHalf : {false, true}) {
+              VLOG(3) << " numSamples=" << numSamples
+                      << " channels=" << channels << " imgSizeH=" << imgSizeH
+                      << " imgSizeW=" << imgSizeW;
+
+              for (bool testGrad : {false, true}) {
+                CpuGpuFuncCompare compare(
+                    testGrad ? "ScaleSubRegionGrad" : "ScaleSubRegion",
+                    FuncConfig().set<real>("value", value));
+
+                TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+                TensorShape indicesShape{numSamples, 6};
+
+                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, indicesShape));
+
+                compare.registerInitCallback([=](BufferArg& arg, size_t index) {
+                  if (index == 1) {
+                    real* data = (real*)arg.data();
+
+                    for (size_t i = 0; i < numSamples; ++i) {
+                      size_t offset = i * 6;
+                      data[offset] = firstHalf ? 1 : channels / 2;
+                      data[offset + 1] = firstHalf ? channels / 2 : channels;
+                      data[offset + 2] = firstHalf ? 1 : imgSizeH / 2;
+                      data[offset + 3] = firstHalf ? imgSizeH / 2 : imgSizeH;
+                      data[offset + 4] = firstHalf ? 1 : imgSizeW / 2;
+                      data[offset + 5] = firstHalf ? imgSizeW / 2 : imgSizeW;
+                    }
+                  }
+                });
+
+                compare.addOutputs(
+                    BufferArg(
+                        VALUE_TYPE_FLOAT, shape, testGrad ? ADD_TO : ASSIGN_TO),
+                    testGrad ? ADD_TO : ASSIGN_TO);
+                compare.run();
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
index 8eb700723f2cf7dda969739bb5e3d48358d278a0..6ffe4fbec643e50d27924a989875454d307f5b9b 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -62,16 +62,14 @@ void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
                                 MKLDNNMatrixPtr& wgt,
                                 MKLDNNMatrixPtr& bias,
                                 MKLDNNMatrixPtr& out) {
-  if (biases_) {
-    LOG(FATAL) << "not implemented yet";
-  }
-  resetFwdBuffers(inVals_, out);
+  resetFwdBuffers(inVals_, bias, out);
   in = inVals_[0];
 
   std::shared_ptr<sum::primitive_desc> fwdPD;
-  resetFwdPD(fwdPD, inVals_, out);
+  std::shared_ptr<sum::primitive_desc> biasPD;
+  resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
 
-  resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+  resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
 }
 
 void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -79,7 +77,7 @@ void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
                                 MKLDNNMatrixPtr& wgt,
                                 MKLDNNMatrixPtr& bias,
                                 MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inGrads_, out);
+  resetBwdBuffers(inGrads_, bias, out);
   in = inGrads_[0];
 
   // backward only need share output grad to input grad
@@ -89,6 +87,20 @@ void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
       inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
     }
   }
+
+  // backward bias
+  bwdBias_ = nullptr;
+  if (bias) {
+    std::vector<float> scales(bs_, 1.0);
+    std::vector<memory::primitive_desc> srcPDs(bs_, bias->getPrimitiveDesc());
+    auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
+    std::vector<primitive::at> srcs;
+    for (size_t i = 0; i < grads_.size(); ++i) {
+      srcs.push_back(*(grads_[i]));
+    }
+    bwdBias_.reset(new sum(biasPD, srcs, *bias));
+    pipeline.push_back(*bwdBias_);
+  }
 }
 
 void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
@@ -97,7 +109,25 @@ void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
   }
 }
 
+void MKLDNNAddtoLayer::prepareBias(MKLDNNMatrixPtr& bias,
+                                   const MatrixPtr& biasMat,
+                                   const MKLDNNMatrixPtr& out,
+                                   std::vector<MKLDNNMatrixPtr>& outs) {
+  auto pd = MKLDNNMatrix::createPrimitiveDesc(
+      {(int)layerSize_}, memory::format::x, engine_);
+  bias = MKLDNNMatrix::create(pd, biasMat);
+  outs.clear();
+  real* data = out->getData();
+  CHECK_EQ(bs_ * layerSize_, out->getElementCnt());
+  for (int i = 0; i < bs_; ++i) {
+    MatrixPtr tmp =
+        Matrix::create(data + i * layerSize_, 1, layerSize_, false, false);
+    outs.push_back(MKLDNNMatrix::create(bias->getPrimitiveDesc(), tmp));
+  }
+}
+
 void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                       MKLDNNMatrixPtr& bias,
                                        MKLDNNMatrixPtr& out) {
   inputs.resize(inputLayers_.size());
   for (size_t i = 0; i < inputs.size(); i++) {
@@ -110,12 +140,20 @@ void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
   }
 
   resetOutValue(out, inputs[0]->getPrimitiveDesc());
+
+  if (biases_ && biases_->getW()) {
+    prepareBias(bias, biases_->getW(), out, vals_);
+  } else {
+    bias = nullptr;
+  }
 }
 
 void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
+                                  std::shared_ptr<sum::primitive_desc>& biasPD,
                                   std::vector<MKLDNNMatrixPtr>& inputs,
+                                  MKLDNNMatrixPtr bias,
                                   MKLDNNMatrixPtr out) {
-  std::vector<double> scales(inputs.size(), 1.0);
+  std::vector<float> scales(inputs.size(), 1.0);
   std::vector<memory::primitive_desc> srcPDs;
   for (size_t i = 0; i < inputs.size(); i++) {
     srcPDs.push_back(inputs[i]->getPrimitiveDesc());
@@ -123,12 +161,23 @@ void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
   CHECK(out);
   pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
   CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+
+  biasPD = nullptr;
+  if (bias) {
+    std::vector<float> scales(2, 1.0);
+    std::vector<memory::primitive_desc> srcPDs(2, bias->getPrimitiveDesc());
+    biasPD.reset(
+        new sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs));
+    CHECK_PRIMITIVE_DESC_EQ(bias, biasPD->dst_primitive_desc());
+  }
 }
 
 void MKLDNNAddtoLayer::resetFwdPipeline(
     std::vector<primitive>& pipeline,
     std::shared_ptr<sum::primitive_desc>& pd,
+    std::shared_ptr<sum::primitive_desc>& biasPD,
     std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
   std::vector<primitive::at> srcs;
   for (size_t i = 0; i < inputs.size(); i++) {
@@ -136,9 +185,23 @@ void MKLDNNAddtoLayer::resetFwdPipeline(
   }
   fwd_.reset(new sum(*pd, srcs, *out));
   pipeline.push_back(*fwd_);
+
+  fwdBias_.clear();
+  if (biasPD == nullptr || bias == nullptr) {
+    return;
+  }
+  fwdBias_.resize(vals_.size());
+  for (size_t i = 0; i < vals_.size(); ++i) {
+    std::vector<primitive::at> srcs;
+    srcs.push_back(*(vals_[i]));
+    srcs.push_back(*bias);
+    fwdBias_[i].reset(new sum(*biasPD, srcs, *vals_[i]));
+    pipeline.push_back(*fwdBias_[i]);
+  }
 }
 
 void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                       MKLDNNMatrixPtr& bias,
                                        MKLDNNMatrixPtr& out) {
   CHECK(outVal_);
   resetOutGrad(out, outVal_->getPrimitiveDesc());
@@ -149,6 +212,12 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
     resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
     CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
   }
+
+  if (biases_ && biases_->getWGrad()) {
+    prepareBias(bias, biases_->getWGrad(), out, grads_);
+  } else {
+    bias = nullptr;
+  }
 }
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/gserver/layers/MKLDNNAddtoLayer.h
index 15f74ec5bdf3d1e4ae5e09051be6be418590a67a..24504b7b4f50726e2b2757ca3029461cdc27b411 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -32,9 +32,15 @@ protected:
   // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
   size_t layerSize_;
 
-  // TODO(TJ): this part has not been optimized by MKL-DNN
   std::unique_ptr<Weight> biases_;
 
+  // buffers for adding bias
+  std::vector<MKLDNNMatrixPtr> vals_;
+  std::vector<MKLDNNMatrixPtr> grads_;
+  // primitives for adding bias
+  std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
+  std::shared_ptr<mkldnn::primitive> bwdBias_;
+
 public:
   explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
 
@@ -91,20 +97,34 @@ protected:
    *                    reset pipeline.
    */
   void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                  std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
                   std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr bias,
                   MKLDNNMatrixPtr out);
   void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                         std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                        std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
                         std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out);
 
   /**
    * Backward functions: reset buffers(inputs, output, bias)
    */
   void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
+
+  /**
+   * prepare for bias
+   */
+  void prepareBias(MKLDNNMatrixPtr& bias,
+                   const MatrixPtr& biasMat,
+                   const MKLDNNMatrixPtr& out,
+                   std::vector<MKLDNNMatrixPtr>& outs);
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index d82063a7130ca928ba042e210eb216f90c7207cd..3429c53d2396e051d62fe0ae405934758e89f9c2 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -60,18 +60,16 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
   }
 
   CHECK(wgtVal_) << "should have been initialized";
-  bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
   auto targetDim = wgtVal_->getDims();
-  auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo;
+  auto srcFmt = targetDim.size() == 2 ? format::io : format::ihwo;
   wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
   hasInitedWgt_ = true;
 }
 
 void MKLDNNFcLayer::convertWeightsToPaddle() {
   CHECK(wgtVal_) << "should have been initialized";
-  bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
   auto targetDim = wgtVal_->getDims();
-  auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo;
+  auto dstFmt = targetDim.size() == 2 ? format::io : format::ihwo;
   wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }
 
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
index 5fd62f4f73b18df683ccf74143e45054c3631c22..e75ac5ba4647a8267b7bc189893bd7adb5c3053f 100644
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -181,21 +181,17 @@ void MKLDNNLayer::resetInValue(
   auto extPD = MKLDNNMatrix::createPrimitiveDesc(
       {bs_, ic_, ih_, iw_}, format::nchw, engine_);
   const MatrixPtr& inMat = inputLayers_[inputIdx]->getOutputValue();
-  in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
-  CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
-  if (in == nullptr || in->getFormat() == format::nc) {
-    in = MKLDNNMatrix::create(extPD, inMat);
-  }
-  extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
-  if (in->getFormat() == format::nc) {
-    CHECK(ih_ == 1 && iw_ == 1);
+  extInVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+  CHECK_EQ(inputIsOnlyMKLDNN(), extInVal_ != nullptr);
+  if (extInVal_ == nullptr || extInVal_->getFormat() == format::nc) {
+    extInVal_ = MKLDNNMatrix::create(extPD, inMat);
   }
+  in = extInVal_;
   if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
     return;
   }
   // need create reorder
   in = MKLDNNMatrix::create(*intPD);
-  extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
   cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
   CHECK(cvtInVal_) << "should not be emptry";
 }
@@ -291,7 +287,7 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
     return;
   }
   CHECK(out) << "should have reset internal ouput grad";
-  std::vector<double> scales(outputMap_.size(), 1.0);
+  std::vector<float> scales(outputMap_.size(), 1.0);
   std::vector<memory::primitive_desc> srcPDs;
   std::vector<primitive::at> srcs;
   for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b18bc0c1b9065e09324d8ab4ed165679f6196d00
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionLayer.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+
+REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer);
+
+bool ScaleSubRegionLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(static_cast<int>(inputLayers_.size()), 2);
+  auto& conf = config_.inputs(0).scale_sub_region_conf();
+  value_ = conf.value();
+
+  createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_));
+  createFunction(
+      backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_));
+
+  return true;
+}
+
+void ScaleSubRegionLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  auto in0 = getInput(0);
+  imgH_ = in0.getFrameHeight();
+  imgW_ = in0.getFrameWidth();
+  if (imgH_ == 0 || imgW_ == 0) {
+    auto& conf = config_.inputs(0).scale_sub_region_conf();
+    imgH_ = conf.image_conf().img_size_y();
+    imgW_ = conf.image_conf().img_size();
+  }
+  MatrixPtr imgV = in0.value;
+  size_t batchSize = imgV->getHeight();
+  size_t spatialSize = imgH_ * imgW_;
+  channelsNum_ = imgV->getWidth() / spatialSize;
+  shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});
+
+  resetOutput(batchSize, imgV->getWidth());
+  auto out = getOutput();
+  out.setFrameHeight(imgH_);
+  out.setFrameWidth(imgW_);
+
+  MatrixPtr indicesV = getInputValue(1);
+  indicesShape_ = TensorShape({batchSize, 6});
+
+  REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
+  BufferArgs inArgs;
+  BufferArgs outArgs;
+  inArgs.addArg(*imgV, shape_);
+  inArgs.addArg(*indicesV, indicesShape_);
+  outArgs.addArg(*out.value, shape_, ASSIGN_TO);
+  forward_[0]->calc(inArgs, outArgs);
+}
+
+void ScaleSubRegionLayer::backward(const UpdateCallback& callback) {
+  REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str());
+  BufferArgs inArgs;
+  BufferArgs outArgs;
+  inArgs.addArg(*getOutputGrad(), shape_);
+  inArgs.addArg(*getInputValue(1), indicesShape_);
+  outArgs.addArg(*getInputGrad(0), shape_, ADD_TO);
+  backward_[0]->calc(inArgs, outArgs);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.h b/paddle/gserver/layers/ScaleSubRegionLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a27c56de93bb6fdde0f95cd4c5abe5dfabe4e858
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  For each instance, this layer can be used to multiply a value to a
+ *         specified sub continuous region. By providing start index and end
+ *         index for C/H/W, you can specify the location and shape of the
+ *         region.
+ *
+ *         input_0: Input value.
+ *         input_1: Indices value to specify the location an shape of the
+ *                  region.
+ */
+class ScaleSubRegionLayer : public Layer {
+public:
+  explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ScaleSubRegionLayer() {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+
+  void backward(const UpdateCallback& callback = nullptr);
+
+protected:
+  TensorShape shape_;
+  TensorShape indicesShape_;
+  size_t imgH_;
+  size_t imgW_;
+  size_t channelsNum_;
+  real value_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 1a46fb49153a0aa4228f58db481b950bc2d6de83..3f7d8810513ed4b765219be722cf8ae9adc7909f 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2358,6 +2358,38 @@ TEST(Layer, ScaleShiftLayer) {
   }
 }
 
+TEST(Layer, ScaleSubRegionLayer) {
+  const size_t batchSize = 64;
+  const size_t size = 4096;
+  TestConfig config;
+  config.layerConfig.set_type("scale_sub_region");
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false);
+  auto* data = indicesV->getData();
+  for (size_t i = 0; i < batchSize; ++i) {
+    data[i * 2] = 2;
+    data[i * 2 + 1] = 4;
+    data[i * 2 + 2] = 16;
+    data[i * 2 + 3] = 32;
+    data[i * 2 + 4] = 16;
+    data[i * 2 + 5] = 32;
+  }
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ScaleSubRegionConfig* scaleSubRegionConf =
+      input->mutable_scale_sub_region_conf();
+  ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf();
+  imgConf->set_img_size(32);
+  imgConf->set_img_size_y(32);
+  imgConf->set_channels(4);
+  scaleSubRegionConf->set_value(2.0);
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 2e8d9f3333b36005c9b3b28449c76a4a44c74cc6..3960d699ac8dc08316ee413116878ee3eda65793 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -300,13 +300,8 @@ void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
   TestConfig dnnConfig;
   getAddtoConfig(dnnConfig, pm, nInputs);
   dnnConfig.layerConfig.set_type("mkldnn_addto");
-  // TODO(TJ): test with bias
-  for (auto withBias : {false}) {
-    if (withBias) {
-      dnnConfig.biasSize = pm.ic * pm.ih * pm.iw;
-    } else {
-      dnnConfig.biasSize = 0;
-    }
+  for (auto withBias : {false, true}) {
+    dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
     RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
   }
 }
diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h
index 5bc4a03067a75527fa30e5bb5526f93dc7b9fdcc..b998e5772e70d0a0ec79dc4064dcbaa2c302efd2 100644
--- a/paddle/math/tests/TensorCheck.h
+++ b/paddle/math/tests/TensorCheck.h
@@ -169,7 +169,7 @@ void TensorCheck(AssertEq compare,
       count++;
     }
   }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
 }
 
 template <typename AssertEq, typename Tensor1, typename Tensor2>
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index f22f86468db7be6c87193f85c3f87aef4a9d3c68..29ce44c23308cb5ae1c1df5c9be1412c28abe96f 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -62,6 +62,11 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
     endif()
 
+    if ("${TARGET}" STREQUAL "compare_op")
+        set(pybind_flag 1)
+        file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
+    endif()
+
     # pool_with_index_op contains several operators
     if ("${TARGET}" STREQUAL "pool_with_index_op")
         set(pybind_flag 1)
@@ -165,6 +170,8 @@ set(DEPS_OPS
     sequence_conv_op
     sequence_pool_op
     lod_rank_table_op
+    lod_tensor_to_array_op
+    array_to_lod_tensor_op
     lstm_op
     tensor_array_read_write_op
     gru_op)
@@ -177,6 +184,8 @@ op_library(sum_op DEPS net_op selected_rows_functor)
 op_library(pool_op DEPS pooling)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
+op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
+op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
 op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
 if(WITH_GPU)
 op_library(nccl_op DEPS nccl_common)
@@ -186,8 +195,13 @@ op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
 op_library(conv_transpose_op DEPS vol2col)
 op_library(gru_op DEPS sequence2batch gru_compute)
-op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
-        DEPS net_op tensor_array)
+if(WITH_TESTING)
+    op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
+        DEPS net_op tensor_array gtest)
+else()
+    op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
+            DEPS net_op tensor_array)
+endif()
 op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
index eaafb9ad54b371837cbc0f3268f7f2bf169e83e8..03c2fa945d94a522d25e65103c8842a93852ba3d 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -47,10 +47,11 @@ class AccuracyOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  // IndicateDataType
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Out")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index d0c4c0d25d6f4e3ab7acd72d62a8a17fa102637b..1776f33105367447759aa91c25263dfc53bd2f99 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -65,7 +65,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
 
     size_t num_samples = inference->dims()[0];
     size_t infer_width = inference->dims()[1];
-    cudaMemset((void**)&accuracy_data, 0, sizeof(float));
+    PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float)));
 
     if (num_samples == 0) {
       return;
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..666043e824f885e9c0e79e319d0a38ba108c209a
--- /dev/null
+++ b/paddle/operators/array_operator.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+class ArrayOp : public framework::OperatorBase {
+ public:
+  ArrayOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  size_t GetOffset(const framework::Scope &scope,
+                   const platform::DeviceContext &dev_ctx) const {
+    auto *i = scope.FindVar(Input("I"));
+    PADDLE_ENFORCE(i != nullptr, "I must be set");
+    auto &i_tensor = i->Get<framework::LoDTensor>();
+    PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
+    size_t offset;
+    if (platform::is_gpu_place(i_tensor.place())) {
+      // FIXME: Avoid copy from GPU to CPU
+      framework::Tensor t;
+      t.CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx);
+      dev_ctx.Wait();
+      offset = static_cast<size_t>(*t.data<int64_t>());
+    } else {
+      offset = static_cast<size_t>(*i_tensor.data<int64_t>());
+    }
+    return offset;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c0903bb4e5ca7f160e19eefab99af7e3e4a8ed76
--- /dev/null
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -0,0 +1,170 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <numeric>
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using LoD = framework::LoD;
+
+class ArrayToLoDTensorOp : public framework::OperatorBase {
+ public:
+  ArrayToLoDTensorOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
+    auto &rank_table =
+        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+
+    // Check dims, place and data type of input's elements and infer output's
+    // dim
+    PADDLE_ENFORCE(!x.empty(), "There's no element in the input array.");
+    int rank = x[0].dims().size();
+    platform::Place place = x[0].place();
+    std::type_index data_type = x[0].type();
+    framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank);
+    int64_t batch_size = x[0].dims()[0];
+    for (size_t i = 1; i < x.size(); ++i) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims,
+                        "The dimension of the %zu'th element in LoDTensorArray "
+                        "differs from previous ones.",
+                        i);
+      PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place),
+                     "The place class of the %zu'th element in LoDTensorArray "
+                     "differs from previous ones.",
+                     i);
+      PADDLE_ENFORCE(x[i].type() == data_type,
+                     "The date type of the %zu'th element in LoDTensorArray "
+                     "differs from previous ones.",
+                     i);
+      batch_size += x[i].dims()[0];
+    }
+    auto ins_dim_vec = framework::vectorize(ins_dims);
+    ins_dim_vec.insert(ins_dim_vec.begin(), batch_size);
+    framework::DDim out_dims = framework::make_ddim(ins_dim_vec);
+    out->Resize(out_dims);
+    out->mutable_data(place, data_type);
+
+    auto &table_items = rank_table.items();
+    std::vector<size_t> table_item_idx(table_items.size());
+    // table_item_idx = range(table_items_idx.size())
+    std::iota(table_item_idx.begin(), table_item_idx.end(), 0);
+    std::sort(table_item_idx.begin(), table_item_idx.end(),
+              [&](size_t a, size_t b) {
+                return table_items[a].index < table_items[b].index;
+              });
+
+    // Build LoDTensor `out`
+    framework::LoD *out_lod = out->mutable_lod();
+    out_lod->clear();
+    size_t out_offset = 0;
+    auto prefix_lod = rank_table.coarse_lod();
+    prefix_lod.emplace_back();
+    auto &cur_level_lod = prefix_lod.back();
+    cur_level_lod.push_back(0);
+    for (size_t idx : table_item_idx) {
+      cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length);
+      for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) {
+        auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+            x[x_idx].lod(), idx, idx + 1, 0);
+
+        auto &lod_length = lod_and_offset.first;
+        framework::AppendLoD(out_lod, lod_length);
+
+        size_t start_offset = lod_and_offset.second.first;
+        size_t end_offset = lod_and_offset.second.second;
+        VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " ["
+                 << ", " << end_offset << "]";
+        // Copy data
+        PADDLE_ENFORCE_GE(end_offset, start_offset);
+        size_t len = end_offset - start_offset;
+        if (len == 0) {
+          continue;
+        }
+        out->Slice(out_offset, out_offset + len)
+            .CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx);
+        out_offset += len;
+      }
+    }
+    out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end());
+  }
+};
+
+class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ArrayToLoDTensorOpProtoMaker(framework::OpProto *proto,
+                               framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(std::vector<LodTensor>) A vector of tensors that is going to "
+             "be casted to a big LoDTensor.");
+    AddInput("RankTable",
+             "(LoDRankTable) RankTable provides the coarse lod infomation to "
+             "build the output LoDTensor. See "
+             "'paddle/framework/lod_rank_table.h' for more details.");
+    AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array.");
+    AddComment(
+        R"DOC(This Op build a big LoDTensor from a std::vector<LoDTensor> 
+          and a LoDRankTable. It is supposed to be used in getting dynamic RNN's
+          outputs back to a normal LoDTensor. The std::vector<LoDTensor> 
+          would be the output of RNN Op and the LoDRankTable would be build 
+          with RNN's input.)DOC");
+  }
+};
+
+class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "ArrayToLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("RankTable"),
+                   "ArrayToLoDTensorOp must has input RankTable.");
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+  }
+};
+
+class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("lod_tensor_to_array");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetInput("RankTable", Input("RankTable"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp,
+                  ops::ArrayToLoDTensorOpProtoMaker,
+                  ops::ArrayToLoDTensorInferShape,
+                  ops::ArrayToLoDTensorGradMaker);
diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc
index ccb969ab23a8e1df713278513a66b10f21690108..6c3f67ec32fb1b942241997e87a1e9c4752e707d 100644
--- a/paddle/operators/auc_op.cc
+++ b/paddle/operators/auc_op.cc
@@ -39,10 +39,11 @@ class AucOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  // IndicateDataType
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Out")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index 7d73dfde786208fe217dac97325d432fb80052ad..8721ca352848fc4d69b206d4ea0ab7c581c8d055 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -303,7 +303,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
   }
 
-  framework::DataType IndicateDataType(
+ protected:
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext &ctx) const override {
     const auto *var = ctx.InputVar(framework::GradVarName("Y"));
     if (var == nullptr) {
@@ -318,7 +319,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     if (t == nullptr) {
       PADDLE_THROW("can't find Y@GRAD");
     }
-    return framework::ToDataType(t->type());
+    return framework::OpKernelType(framework::ToDataType(t->type()),
+                                   ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9fc532e39500fa397be80396b075e866bad9362
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.cc
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/clip_by_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ClipByNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ClipByNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ClipByNormOp should not be null.");
+    auto max_norm = ctx->Attrs().Get<float>("max_norm");
+    PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0.");
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ClipByNormOpMaker(framework::OpProto* proto,
+                    framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor) The input of clip_by_norm op."
+             "The number of dimensions must be between [1, 9].");
+    AddOutput("Out",
+              "(Tensor) The output of clip_by_norm op with shape as input(X)");
+    AddAttr<float>("max_norm", "(float) The maximum norm value.");
+    AddComment(R"DOC(
+ClipByNorm operator limits the L2 norm of the input 'X' within 'max_norm'. 
+If the L2 norm of 'X' is less than or equal to 'max_norm', 'Out' will be 
+the same as 'X'. If the L2 norm of 'X' is greater than 'max_norm', 'X' will 
+be linearly scaled to make the L2 norm of 'Out' equal to 'max_norm', as 
+shown in the following formula：
+
+'Out' = 'max_norm' * 'X' / norm('X'),
+
+where norm('X') represents the L2 norm of 'X'.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp,
+                             ops::ClipByNormOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    clip_by_norm, ops::ClipByNormKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/fill_constant_op.cu b/paddle/operators/clip_by_norm_op.cu
similarity index 63%
rename from paddle/operators/fill_constant_op.cu
rename to paddle/operators/clip_by_norm_op.cu
index bca402a8b988b570a083e9ce253342304f4b8946..2593a24ebbf56ecd286a726e527d2414247576e8 100644
--- a/paddle/operators/fill_constant_op.cu
+++ b/paddle/operators/clip_by_norm_op.cu
@@ -12,13 +12,8 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/fill_constant_op.h"
+#include "paddle/operators/clip_by_norm_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int64_t>);
+    clip_by_norm, ops::ClipByNormKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b26476cae9b5b2fa290bc9186b9a64c48ba703d6
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class ClipByNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max_norm = context.Attr<T>("max_norm");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+
+    auto x = EigenVector<T>::Flatten(*input);
+    auto out = EigenVector<T>::Flatten(*output);
+    auto x_norm = x.square().sum().sqrt();
+    auto place = context.GetEigenDevice<Place>();
+
+    auto temp = (x_norm <= max_norm).template cast<T>().eval();
+    auto scaling = temp + (static_cast<T>(1) - temp) * max_norm / x_norm;
+    Eigen::array<int, 1> one_dim{{1}};
+    Eigen::DSizes<int, 1> m_dsize(input->numel());
+    out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..716b5ee92d0d8737d2069460f53989f691ff7c77
--- /dev/null
+++ b/paddle/operators/compare_op.cc
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/compare_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename OpComment>
+class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CompareOpProtoMaker(framework::OpProto *proto,
+                      framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    OpComment comment;
+    AddInput("X",
+             string::Sprintf("(LoDTensor) the left hand operand of %s operator",
+                             comment.type));
+    AddInput("Y", string::Sprintf(
+                      "(LoDTensor) the right hand operand of %s operator",
+                      comment.type));
+    AddOutput("Out", string::Sprintf(
+                         "(LoDTensor) n-dim bool tensor. Each element is %s",
+                         comment.equation));
+    AddComment(string::Sprintf(R"DOC(%s Operator
+
+It operates element-wise on X and Y, and returns the Out. Each of them is a
+N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
+calculated by %s
+)DOC",
+                               comment.type, comment.equation));
+  }
+};
+
+template <typename OpComment>
+class CompareOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    OpComment comment;
+    PADDLE_ENFORCE(context->HasInput("X"), "%s operator must has input X",
+                   comment.type);
+    PADDLE_ENFORCE(context->HasInput("Y"), "%s operator must has input Y",
+                   comment.type);
+    auto dim_x = context->GetInputDim("X");
+    auto dim_y = context->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y),
+                      "The number of elements in X and Y should be same");
+
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+    context->ShareLoD("X", "Out");
+  }
+};
+
+class CompareOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx);
+    // CompareOp kernel's device type is decided by input tensor place
+    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    return kt;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_LOGICAL_OP(op_type, _equation)                      \
+  struct _##op_type##Comment {                                       \
+    static char type[];                                              \
+    static char equation[];                                          \
+  };                                                                 \
+  char _##op_type##Comment::type[]{#op_type};                        \
+  char _##op_type##Comment::equation[]{_equation};                   \
+  REGISTER_OPERATOR(                                                 \
+      op_type, ::paddle::operators::CompareOp,                       \
+      ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \
+      ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
+      ::paddle::framework::EmptyGradOpMaker);
+
+REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
+REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_OP(equal, "Out = X == Y");
+REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
diff --git a/paddle/operators/compare_op.cu b/paddle/operators/compare_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..42a5bb2f45fd389f60c3dc034cade7f56a907e35
--- /dev/null
+++ b/paddle/operators/compare_op.cu
@@ -0,0 +1,18 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/compare_op.h"
+
+REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor);
diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..04e04e347b398abb5fb66876bf801b1eee688ec6
--- /dev/null
+++ b/paddle/operators/compare_op.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <math.h>
+#include <type_traits>
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct LessThanFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; }
+};
+
+template <typename T>
+struct EqualFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const {
+    if (std::is_floating_point<T>::value) {
+      // This branch will be optimized while compiling if T is integer. It is
+      // safe to cast a and b to double.
+      return fabs(static_cast<double>(a - b)) < 1e-8;
+    } else {
+      return (a == b);
+    }
+  }
+};
+
+template <typename Place, typename Functor>
+class CompareOpKernel
+    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    using T = typename Functor::ELEM_TYPE;
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* y = context.Input<framework::Tensor>("Y");
+    auto* out = context.Output<framework::Tensor>("Out");
+    Functor binary_func;
+    platform::Transform<Place> trans;
+    trans(context.device_context(), x->data<T>(), x->data<T>() + x->numel(),
+          y->data<T>(), out->mutable_data<bool>(context.GetPlace()),
+          binary_func);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor)                     \
+  REGISTER_OP_##dev##_KERNEL(                                              \
+      op_type,                                                             \
+      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
+                                           functor<int>>,                  \
+      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
+                                           functor<int64_t>>,              \
+      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
+                                           functor<float>>,                \
+      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
+                                           functor<double>>);
diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc
index d1ce74c4b911545476f0b362df0e0f7a0d14cfb4..f418f489c0ff471464a23380598e9f4c8da16ca9 100644
--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/operators/crf_decoding_op.cc
@@ -120,9 +120,11 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
+        ctx.device_context());
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 9d41879b27a24f83090f5abf1325eca5f9488d00..1e82742eaf86711fe4f9d02d517ad1853131cf67 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -51,9 +51,11 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of computation kernel of cross_entropy
   // is determined by its input "X".
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
   }
 };
 
@@ -98,9 +100,11 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of computation kernel of cross_entropy
   // is determined by its input "X".
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc
index 232d88e26bd2fb17f52c2dc3626d47c2615b1d42..85871ebbfcd8ee38ef5e8078d1d6cb6bdda46a7b 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -49,9 +49,11 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+    return framework::OpKernelType(
+        static_cast<framework::DataType>(ctx.Attr<int>("data_type")),
+        ctx.device_context());
   }
 };
 
@@ -73,10 +75,10 @@ class FillConstantBatchSizeLikeOpMaker
               "with the specified value");
     AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
     AddAttr<int>("input_dim_idx",
-                 "(int, default 0) the index of input's batch size dimension")
+                 "(int, default 0) The index of input's batch size dimension")
         .SetDefault(0);
     AddAttr<int>("output_dim_idx",
-                 "(int, default 0) the index of output's batch size dimension")
+                 "(int, default 0) The index of output's batch size dimension")
         .SetDefault(0);
     AddAttr<float>("value", "(float, default 0) The value to be filled")
         .SetDefault(0.0f);
diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc
index f60425051cc93b77ef9e383f90a62eb9dfed44de..818f113b90a4c239a857791fb9957e51d3287b97 100644
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -12,32 +12,41 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/fill_constant_op.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
 
-class FillConstantOp : public framework::OperatorWithKernel {
+class FillConstantInferShape : public framework::InferShapeBase {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of FillConstantOp should not be null.");
     auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    std::vector<int64_t> shape_int64(shape.size(), 0);
-    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
-                   [](int a) { return static_cast<int64_t>(a); });
-    auto dims = framework::make_ddim(shape_int64);
-    ctx->SetOutputDim("Out", dims);
+    ctx->SetOutputDim("Out", framework::make_ddim(shape));
   }
+};
 
- protected:
-  framework::DataType IndicateDataType(
-      const framework::ExecutionContext &ctx) const override {
-    int data_type = ctx.Attr<int>("data_type");
-    VLOG(10) << " FillConstant data_type = " << data_type;
-    return static_cast<framework::DataType>(data_type);
+class FillConstantOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto data_type = static_cast<framework::DataType>(Attr<int>("data_type"));
+    auto value = Attr<float>("value");
+    auto force_cpu = Attr<bool>("force_cpu");
+    auto &out =
+        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+    if (force_cpu) {
+      auto cpu = platform::CPUPlace();
+      out.mutable_data(cpu, framework::ToTypeIndex(data_type));
+    } else {
+      out.mutable_data(dev_ctx.GetPlace(), framework::ToTypeIndex(data_type));
+    }
+    math::set_constant(dev_ctx, &out, value);
   }
 };
 
@@ -53,6 +62,11 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
     AddAttr<float>("value", "(float, default 0) The value to be filled")
         .SetDefault(0.0f);
+    AddAttr<bool>("force_cpu",
+                  "(bool, default false) Force fill output variable to cpu "
+                  "memory. Otherwise, fill output variable to the running "
+                  "device")
+        .SetDefault(false);
     AddOutput("Out",
               "(Tensor) Tensor of specified shape will be filled "
               "with the specified value");
@@ -68,10 +82,6 @@ Fill up a variable with specified constant value.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
-                             ops::FillConstantOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    fill_constant, ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>,
-    ops::FillConstantOpKernel<paddle::platform::CPUPlace, double>,
-    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>,
-    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int64_t>);
+REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
+                  ops::FillConstantInferShape, ops::FillConstantOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/fill_constant_op.h b/paddle/operators/fill_constant_op.h
deleted file mode 100644
index 3668f42f1c29541e29463ff3969064e80703fa04..0000000000000000000000000000000000000000
--- a/paddle/operators/fill_constant_op.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename Place, typename T>
-class FillConstantOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-    auto value = ctx.Attr<float>("value");
-
-    auto out_eigen = framework::EigenVector<T>::Flatten(*out);
-    auto place = ctx.GetEigenDevice<Place>();
-    out_eigen.device(place) = out_eigen.constant(static_cast<T>(value));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
index aee672500ee5d1bf6cc7ef872f2cb6c408de6d9e..8f80fb162519f60fcce897b3c31a3507bbf6ba6d 100644
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -40,9 +40,11 @@ class GatherOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
   }
 };
 
@@ -55,9 +57,11 @@ class GatherGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index 802c98ae764d02af3143d1d39b714d486791da82..53ad86c6c48d1868f4495af51661d91b39a84f0b 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -57,9 +57,11 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+    return framework::OpKernelType(
+        static_cast<framework::DataType>(ctx.Attr<int>("data_type")),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
index bcb48e13bd948b4e91ce8cbd7231a9619fac8d18..066bdf67aa037e9c25cfdfaff7ec8771eb59cde8 100644
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -183,9 +183,11 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of computation kernel of linear_chain_crf
   // is determined by its input "Emission".
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
+        ctx.device_context());
   }
 };
 
@@ -240,10 +242,13 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of output of the linear_chain_crf_grad
   // operator is determined by its input: gradients of LogLikelihood.
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(
-        ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))->type());
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))
+                ->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc
index be198951c241dc5e5587c8a2b8d94f67173d2b2a..ce010fcb91873b3099f6bf52cfe20c1ff61846ea 100644
--- a/paddle/operators/lod_rank_table_op.cc
+++ b/paddle/operators/lod_rank_table_op.cc
@@ -28,6 +28,7 @@ class LoDRankTableOp : public framework::OperatorBase {
     auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
+    VLOG(10) << "Level = " << static_cast<size_t>(Attr<int>("level"));
     out->Reset(x.lod(), static_cast<size_t>(Attr<int>("level")));
   }
 };
diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..58af35564d83b9699af4f7783fb6367ff9590682
--- /dev/null
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -0,0 +1,160 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+struct CopyRange {
+  size_t begin;
+  size_t end;
+};
+
+class LoDTensorToArrayOp : public framework::OperatorBase {
+ public:
+  LoDTensorToArrayOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &rank_table =
+        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
+    auto &out =
+        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
+
+    auto &items = rank_table.items();
+    auto max_seq_len = items[0].length;
+    auto rank_level = rank_table.level();
+    out.resize(max_seq_len);
+    std::vector<std::vector<CopyRange>> copy_ranges(max_seq_len);
+
+    // set out[i] lod
+    for (size_t t = 0; t < max_seq_len; t++) {
+      auto &lod = *out[t].mutable_lod();
+      lod.clear();
+      for (auto &item : items) {
+        if (t >= item.length) {
+          break;
+        }
+        size_t start_idx = x.lod()[rank_level][item.index] + t;
+        auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+            x.lod(), start_idx, start_idx + 1, rank_level + 1);
+
+        auto &lod_length = lod_and_offset.first;
+        framework::AppendLoD(&lod, lod_length);
+
+        size_t start_offset = lod_and_offset.second.first;
+        size_t end_offset = lod_and_offset.second.second;
+        copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
+      }
+    }
+
+    for (size_t i = 0; i < max_seq_len; ++i) {
+      auto &ranges = copy_ranges[i];
+      size_t height = std::accumulate(
+          ranges.begin(), ranges.end(), 0UL,
+          [](size_t a, const CopyRange &b) { return a + b.end - b.begin; });
+      auto x_dim = x.dims();
+      x_dim[0] = static_cast<int64_t>(height);
+      out[i].Resize(x_dim);
+      out[i].mutable_data(x.place(), x.type());
+      size_t offset = 0;
+      for (auto &each_range : ranges) {
+        size_t len = each_range.end - each_range.begin;
+        if (len == 0) {
+          continue;
+        }
+        // out[i][offset: offset+len] = x[each_range.begin: each_range.end]
+        out[i]
+            .Slice(static_cast<int>(offset), static_cast<int>(offset + len))
+            .CopyFrom(x.Slice(static_cast<int>(each_range.begin),
+                              static_cast<int>(each_range.end)),
+                      x.place(), dev_ctx);
+        offset += len;
+      }
+    }
+  }
+};
+
+class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoDTensorToArrayOpProtoMaker(framework::OpProto *proto,
+                               framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "");
+    AddInput("RankTable", "");
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class LoDTensorToArrayInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "Input(X) of LoDTensorToArrayOp should not be null.");
+    PADDLE_ENFORCE(
+        context->HasInput("RankTable"),
+        "Input(RankTable) of LoDTensorToArrayOp should not be null.");
+
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "Output(Out) of LoDTensorToArrayOp should not be null.");
+
+    auto x_dim = context->GetInputDim("X");
+    // The first dim of each LoDTensor in Output can only be set at run-time.;
+    // We still have to Resize each LoDTensor in Output.
+    context->SetOutputDim("Out", x_dim);
+  }
+};
+
+class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDescBind &op_desc,
+                  framework::BlockDescBind *block) const override {
+    for (auto &out_var : op_desc.Output("Out")) {
+      block->Var(out_var)->SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
+    }
+  }
+};
+
+class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("array_to_lod_tensor");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetInput("RankTable", Input("RankTable"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(lod_tensor_to_array, ops::LoDTensorToArrayOp,
+                  ops::LoDTensorToArrayOpProtoMaker,
+                  ops::LoDTensorToArrayInferShape,
+                  ops::LoDTensorToArrayInferVarType,
+                  ops::LoDTensorToArrayGradMaker);
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc
index 2163c8ce4e5d75d5934c08f59a47bad9553f0c8b..93e812ac5be5aea6bf3ab353d31480322c51ccbc 100644
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -41,9 +41,11 @@ class LookupTableOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<LoDTensor>("W")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
+        ctx.device_context());
   }
 };
 
@@ -97,9 +99,11 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<LoDTensor>("W")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
index 10d66e5ff40987baf7a0c47c1c07dc03f8aa6a3f..84b044184a36a0d3a72a4105d6baf401b4774cf7 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -74,8 +74,9 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
 
     dim3 threads(128, 8);
     dim3 grids(8, 1);
-    LookupTable<T, 128, 8,
-                8><<<grids, threads, 0, context.device_context().stream()>>>(
+    LookupTable<
+        T, 128, 8,
+        8><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
         output, table, ids, N, K, D);
   }
 };
@@ -135,7 +136,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       dim3 grids(8, 1);
       LookupTableGrad<
           T, 128, 8,
-          8><<<grids, threads, 0, context.device_context().stream()>>>(
+          8><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
           d_table, d_output, ids, N, K, D);
     }
   }
diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc
index dc64b3f2c451df847396d36e8d1b444748b13bbc..d99e008447deb521066b96164b07dcfd9b1a06c1 100644
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
@@ -92,10 +92,11 @@ class LSTMOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(
-        ctx.Input<framework::LoDTensor>("Input")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+        ctx.device_context());
   }
 };
 
@@ -267,10 +268,11 @@ class LSTMGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(
-        ctx.Input<framework::LoDTensor>("Input")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc
index f4519ec16f3f694cf49941f8d23c4106f6f1ddc3..18b9cdf2a39e8226c634194ff2cc56d169979774 100644
--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -34,10 +34,10 @@ class LstmUnitOp : public framework::OperatorWithKernel {
     auto c_prev_dims = ctx->GetInputDim("C_prev");
 
     PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
-    PADDLE_ENFORCE(x_dims[0] == c_prev_dims[0],
-                   "Batch size of inputs and states must be equal");
-    PADDLE_ENFORCE(x_dims[1] == c_prev_dims[1] * 4,
-                   "Dimension of FC should equal to prev state * 4");
+    PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0],
+                      "Batch size of inputs and states must be equal");
+    PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4,
+                      "Dimension of FC should equal to prev state * 4");
 
     int b_size = c_prev_dims[0];  // batch size
     int s_dim = c_prev_dims[1];   // state dim
diff --git a/paddle/operators/math/detail/lstm_gpu_kernel.h b/paddle/operators/math/detail/lstm_gpu_kernel.h
index e8ac61e009c35593b3c951374d7997a3f42a4f91..d138bbe411f69929a14ad19af3e84824ac7a5d58 100644
--- a/paddle/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_gpu_kernel.h
@@ -246,11 +246,6 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
         op, value, grad, frameSize, batchSize, active_node, active_gate,
         active_state);
   }
-
-  cudaStreamSynchronize(stream);
-  // TODO(qingqing): Add cuda error check for each kernel.
-  cudaError_t err = cudaGetLastError();
-  PADDLE_ENFORCE(err, cudaGetErrorString(err));
 }
 
 }  // namespace detail
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index 2a9c09a0f16b71473e21765ab9253eb7b8bcf28c..09c3f0b1e6f787547b9253d3aeadf70674708ba0 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/math/math_function.h"
+#include "paddle/framework/data_type.h"
 
 namespace paddle {
 namespace operators {
@@ -233,6 +234,52 @@ void gemv<platform::CPUPlace, double>(const platform::DeviceContext& context,
 
 template struct SetConstant<platform::CPUPlace, float>;
 
+struct TensorSetConstant {
+  TensorSetConstant(framework::Tensor* tensor, float value)
+      : tensor_(tensor), value_(value) {}
+  template <typename T>
+  void operator()() const {
+    auto cpu = platform::CPUPlace();
+    auto* begin = tensor_->mutable_data<T>(cpu);
+    std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
+  }
+  framework::Tensor* tensor_;
+  float value_;
+};
+
+template <>
+void set_constant_with_place<platform::CPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  framework::VisitDataType(framework::ToDataType(tensor->type()),
+                           TensorSetConstant(tensor, value));
+}
+
+struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
+  TensorSetConstantWithPlace(const platform::DeviceContext& context,
+                             framework::Tensor* tensor, float value)
+      : context_(context), tensor_(tensor), value_(value) {}
+
+  template <typename Place>
+  void operator()(Place place) const {
+    set_constant_with_place<Place>(context_, tensor_, value_);
+  }
+
+  const platform::DeviceContext& context_;
+  framework::Tensor* tensor_;
+  float value_;
+};
+
+void set_constant(const platform::DeviceContext& context,
+                  framework::Tensor* tensor, float value) {
+  TensorSetConstantWithPlace func(context, tensor, value);
+#ifdef PADDLE_WITH_CUDA
+  tensor->place().apply_visitor(func);
+#else
+  func(platform::CPUPlace());
+#endif
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index e6fd8bf235b8539702ca2c5b39e305cb1becf5cb..255e480680499877ff599b96b8336a968cccbb34 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/framework/data_type.h"
 #include "paddle/operators/math/math_function.h"
 
 namespace paddle {
@@ -232,6 +233,30 @@ void gemv<platform::GPUPlace, double>(const platform::DeviceContext& context,
 
 template struct SetConstant<platform::GPUPlace, float>;
 
+struct TensorSetConstant {
+  TensorSetConstant(const platform::DeviceContext& context,
+                    framework::Tensor* tensor, float value)
+      : context_(context), tensor_(tensor), value_(value) {}
+
+  template <typename T>
+  void operator()() const {
+    SetConstant<platform::GPUPlace, T> functor;
+    functor(context_, tensor_, static_cast<T>(value_));
+  }
+
+  const platform::DeviceContext& context_;
+  framework::Tensor* tensor_;
+  float value_;
+};
+
+template <>
+void set_constant_with_place<platform::GPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  framework::VisitDataType(framework::ToDataType(tensor->type()),
+                           TensorSetConstant(context, tensor, value));
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
index 3bb5aa0332c7e2a63d20b91893c03ccd468dd863..1c9eabb2b78f1d69054b347f27854ee8ca3f3d1e 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -108,6 +108,13 @@ struct SetConstant {
   }
 };
 
+template <typename Place>
+void set_constant_with_place(const platform::DeviceContext& context,
+                             framework::Tensor* tensor, float value);
+
+void set_constant(const platform::DeviceContext& context,
+                  framework::Tensor* tensor, float value);
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc
index 7d84ad9aadb2892db0d0ee9cab428dc5036614e9..983c9fdcffb0a67da1bc0b5b4af9420a68bd2ac1 100644
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -139,3 +139,15 @@ TEST(math_function, gemv) {
   GemvTest<float>(12, 7, true);
   GemvTest<double>(7, 9, true);
 }
+
+TEST(math_funciton, set_constant) {
+  paddle::framework::Tensor t;
+  t.Resize({10, 10});
+  t.mutable_data<int>(paddle::platform::CPUPlace());
+  auto* ctx = new paddle::platform::CPUDeviceContext();
+  paddle::operators::math::set_constant(*ctx, &t, 10);
+  for (int64_t i = 0; i < t.numel(); ++i) {
+    PADDLE_ENFORCE_EQ(10, t.data<int>()[i]);
+  }
+  delete ctx;
+}
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index 78b4bbca84d4670aba73222f1d679604d7516b02..dcc5b4286f4ac833268a779a9a7edd2ed119ffff 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -51,6 +51,7 @@ class MeanGradOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", framework::GradVarName("X"));
   }
 };
 
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index 234fddcfd55ccc66f6378689dbc426499474b11f..f8527dfab3f3c42f430c433a11351f12b8dfae8b 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -51,9 +51,11 @@ class MultiplexOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
+        ctx.device_context());
   }
 };
 
@@ -107,9 +109,11 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 7adc7df164e6c7f39ed9b572dfb09380ef06d749..49ed8a8879527fd32dd8b001ea256e46a0353487 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -71,7 +71,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
     index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
     auto* index = index_t_cpu.data<int32_t>();
 
-    auto stream = ctx.device_context().stream();
+    auto stream = ctx.cuda_device_context().stream();
     Place place = boost::get<Place>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
       size_t k = static_cast<size_t>(index[i]);
diff --git a/paddle/operators/pool_cudnn_op.cu b/paddle/operators/pool_cudnn_op.cu
index 8d0741dccc1fdae069af55da49f44378e2c4ddf8..8711567b95fea355396173b5312d26d31f9ffb12 100644
--- a/paddle/operators/pool_cudnn_op.cu
+++ b/paddle/operators/pool_cudnn_op.cu
@@ -37,11 +37,11 @@ class PoolCudnnOpKernel : public framework::OpKernel<T> {
     const T *input_data = input->data<T>();
     T *output_data = output->mutable_data<T>(ctx.GetPlace());
 
-    std::string pooling_type = ctx.Attr<std::string>("poolingType");
+    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    if (ctx.Attr<bool>("globalPooling")) {
+    if (ctx.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
         ksize[i] = static_cast<int>(input->dims()[i + 2]);
@@ -92,12 +92,12 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
         ctx.Input<Tensor>(framework::GradVarName("Out"));
     Tensor *input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
 
-    std::string pooling_type = ctx.Attr<std::string>("poolingType");
+    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
 
-    if (ctx.Attr<bool>("globalPooling")) {
+    if (ctx.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
         ksize[i] = static_cast<int>(input->dims()[i + 2]);
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
index f58aab733866973f477ca79e5e53ba58adbf3dc7..f3963b1995ef8767786f0bf230b134afc69aa99d 100644
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -29,7 +29,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
 
   auto in_x_dims = ctx->GetInputDim("X");
 
-  std::string pooling_type = ctx->Attrs().Get<std::string>("poolingType");
+  std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
   std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
@@ -37,7 +37,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
   PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                  "Pooling intput should be 4-D or 5-D tensor.");
 
-  if (ctx->Attrs().Get<bool>("globalPooling")) {
+  if (ctx->Attrs().Get<bool>("global_pooling")) {
     ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
     for (size_t i = 0; i < ksize.size(); ++i) {
       paddings[i] = 0;
@@ -83,20 +83,20 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
             "H is the height of the feature, "
             "and W is the width of the feature.");
 
-  AddAttr<std::string>("poolingType",
+  AddAttr<std::string>("pooling_type",
                        "(string), pooling type, can be \"max\" for max-pooling "
                        "and \"avg\" for average-pooling.")
       .InEnum({"max", "avg"});
   AddAttr<std::vector<int>>("ksize",
                             "(vector<int>) The pooling window "
                             "size(height, width) of the pooling operator. "
-                            "If globalPooling = true, ksize and paddings will "
+                            "If global_pooling = true, ksize and paddings will "
                             "be ignored.");  // TODO(Chengduo): Add checker.
                                              // (Currently,
   // TypedAttrChecker don't support vector type.)
-  AddAttr<bool>("globalPooling",
+  AddAttr<bool>("global_pooling",
                 "(bool, default false) Whether to use the global pooling. "
-                "If globalPooling = true, ksize and paddings will be ignored.")
+                "If global_pooling = true, ksize and paddings will be ignored.")
       .SetDefault(false);
   AddAttr<std::vector<int>>("strides",
                             "(vector<int>, default {1, 1}), strides(height, "
@@ -107,7 +107,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
       "paddings",
       "(vector<int>, defalut {0,0}), paddings(height, width) of pooling "
       "operator."
-      "If globalPooling = true, paddings and ksize will be ignored.")
+      "If global_pooling = true, paddings and ksize will be ignored.")
       .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
   // TypedAttrChecker don't support vector type.)
 
@@ -115,7 +115,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
 Pool2d Operator.
 
 The pooling2d operation calculates the output based on
-the input, poolingType and ksize, strides, paddings parameters.
+the input, pooling_type and ksize, strides, paddings parameters.
 Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
 number of channels, H is the height of the feature, and W is the width of the feature.
 Parameters(ksize, strides, paddings) are two elements.
@@ -152,7 +152,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
             "the number of channels, and D, H and W is the depth, height and "
             "width of the feature, respectively.");
 
-  AddAttr<std::string>("poolingType",
+  AddAttr<std::string>("pooling_type",
                        "(string) Pooling type, can be \"max\" for max-pooling "
                        "and \"avg\" for average-pooling.")
       .InEnum({"max", "avg"});
@@ -160,13 +160,14 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
       "ksize",
       "(vector<int>) The pooling window size(depth, height, "
       "width) of pooling operator. "
-      "If globalPooling = true, ksize and paddings will "
+      "If global_pooling = true, ksize and paddings will "
       "be ignored.");  // TODO(Chengduo): Add checker.
                        // (Currently,
   // TypedAttrChecker don't support vector type.)
-  AddAttr<bool>("globalPooling",
-                "(bool, default false) Whether to use the global pooling. "
-                "If globalPooling = true, ksize and paddings wille be ignored.")
+  AddAttr<bool>(
+      "global_pooling",
+      "(bool, default false) Whether to use the global pooling. "
+      "If global_pooling = true, ksize and paddings wille be ignored.")
       .SetDefault(false);
   AddAttr<std::vector<int>>(
       "strides",
@@ -178,7 +179,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
       "paddings",
       "(vector<int>, defalut {0,0,0}), paddings(depth, height, "
       "width) of pooling operator. "
-      "If globalPooling = true, ksize and paddings will be ignored.")
+      "If global_pooling = true, ksize and paddings will be ignored.")
       .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
                                // TypedAttrChecker don't support vector type.)
 
@@ -186,7 +187,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
 Pool3d Operator.
 
 The pooling3d operation calculates the output based on
-the input, poolingType, ksize, strides, and paddings parameters.
+the input, pooling_type, ksize, strides, and paddings parameters.
 Input(X) and output(Out) are in NCDHW format, where N is batch
 size, C is the number of channels, and D, H and W are the depth, height and
 width of the feature, respectively. Parameters(ksize, strides, paddings) 
diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h
index d9d445f6a6257b0c8a1959c64c9a878539e10cd4..4da1941ab541483e706257667b14aa5a95e0c3cc 100644
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
@@ -57,11 +57,11 @@ class PoolKernel : public framework::OpKernel<T> {
     const Tensor* in_x = context.Input<Tensor>("X");
     Tensor* out = context.Output<Tensor>("Out");
 
-    std::string pooling_type = context.Attr<std::string>("poolingType");
+    std::string pooling_type = context.Attr<std::string>("pooling_type");
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("globalPooling")) {
+    if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
         ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
@@ -119,12 +119,12 @@ class PoolGradKernel : public framework::OpKernel<T> {
         context.Input<Tensor>(framework::GradVarName("Out"));
     Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
 
-    std::string pooling_type = context.Attr<std::string>("poolingType");
+    std::string pooling_type = context.Attr<std::string>("pooling_type");
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
 
-    if (context.Attr<bool>("globalPooling")) {
+    if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
         ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc
index a31b3fcb7083ba5bfbcb6666c4641833ae1e69eb..1df36e965abab3549aeb88bf682b712033c4d79c 100644
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -44,7 +44,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                    "Pooling intput should be 4-D or 5-D tensor.");
 
-    if (ctx->Attrs().Get<bool>("globalPooling")) {
+    if (ctx->Attrs().Get<bool>("global_pooling")) {
       ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
@@ -110,14 +110,14 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::vector<int>>("ksize",
                               "(vector<int>) The pooling window size(height, "
                               "width) of pooling operator. "
-                              "If globalPooling = true, ksize and paddings "
+                              "If global_pooling = true, ksize and paddings "
                               "will be ignored.");  // TODO(Chengduo): Add
                                                     // checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<bool>(
-        "globalPooling",
+        "global_pooling",
         "(bool, default false) Whether to use the global pooling. "
-        "If globalPooling = true, ksize and paddings will be ignored.")
+        "If global_pooling = true, ksize and paddings will be ignored.")
         .SetDefault(false);
     AddAttr<std::vector<int>>("strides",
                               "(vector<int>, default {1, 1}), strides(height, "
@@ -128,7 +128,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
         "paddings",
         "(vector<int>, defalut {0, 0}), paddings(height, width) of pooling "
         "operator. "
-        "If globalPooling = true, paddings and will be ignored.")
+        "If global_pooling = true, paddings and will be ignored.")
         .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
 
@@ -188,14 +188,14 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::vector<int>>("ksize",
                               "(vector<int>) The pooling window size(depth, "
                               "height, width) of pooling operator. "
-                              "If globalPooling = true, ksize and paddings "
+                              "If global_pooling = true, ksize and paddings "
                               "will be ignored.");  // TODO(Chengduo): Add
                                                     // checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<bool>(
-        "globalPooling",
+        "global_pooling",
         "(bool, default false) Whether to use the global pooling. "
-        "If globalPooling = true, ksize and paddings will be ignored.")
+        "If global_pooling = true, ksize and paddings will be ignored.")
         .SetDefault(false);
     AddAttr<std::vector<int>>("strides",
                               "(vector<int>, default {1,1,1}), strides(depth, "
@@ -206,7 +206,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
         "paddings",
         "(vector, defalut {0,0,0}), paddings(depth, "
         "height, width) of pooling operator. "
-        "If globalPooling = true, paddings and ksize will be ignored.")
+        "If global_pooling = true, paddings and ksize will be ignored.")
         .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
 
diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h
index 48627740435b7d397c5a53491c1f89ba1b603803..ea37de84abeb577461ccd5c1f0eda8bacb4458eb 100644
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
@@ -35,7 +35,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("globalPooling")) {
+    if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
         ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
@@ -72,7 +72,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("globalPooling")) {
+    if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
         ksize[i] = static_cast<int>(in_x_grad->dims()[i + 2]);
diff --git a/paddle/operators/positive_negative_pair_op.cc b/paddle/operators/positive_negative_pair_op.cc
index afbb63cc6053e2968750c163f9ac194d0f5b93e0..4ba40a62ec5f696ad980c2913f7e162879a557e2 100644
--- a/paddle/operators/positive_negative_pair_op.cc
+++ b/paddle/operators/positive_negative_pair_op.cc
@@ -85,9 +85,11 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Score")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Score")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/precision_recall_op.cc b/paddle/operators/precision_recall_op.cc
index 641f7135ded159b1c7330e87c4b1d983e959b466..1ace4f2a5935dcb4239526c42599a42d288ff552 100644
--- a/paddle/operators/precision_recall_op.cc
+++ b/paddle/operators/precision_recall_op.cc
@@ -80,9 +80,11 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("MaxProbs")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("MaxProbs")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
index 62e6c70b4513fdfab1c563b6b23f36292fb6486a..ce4b794bc35aca0912d89a4ae81a9aa0c73a2104 100644
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -49,9 +49,11 @@ class ScatterOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Ref")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        ctx.device_context());
   }
 };
 
@@ -66,9 +68,11 @@ class ScatterGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Ref")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc
index 710f280017fa5e188b187a3e91b27e2bedc65d10..2a000ac60b176737277605c3ac812ea65a0e03fc 100644
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -107,9 +107,11 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..65bccc0c81d0ad9674649933a20ec7b09fec5b37
--- /dev/null
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/operators/array_operator.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+class ShrinkRNNMemoryOp : public ArrayOp {
+ public:
+  ShrinkRNNMemoryOp(const std::string &type,
+                    const framework::VariableNameMap &inputs,
+                    const framework::VariableNameMap &outputs,
+                    const framework::AttributeMap &attrs)
+      : ArrayOp(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto *x_var = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE(x_var != nullptr, "Input X must be set");
+    auto &x_tensor = x_var->Get<framework::LoDTensor>();
+    size_t offset = this->GetOffset(scope, dev_ctx);
+    auto *rank_table_var = scope.FindVar(Input("RankTable"));
+    PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set");
+    auto &rank_table = rank_table_var->Get<framework::LoDRankTable>();
+
+    auto &rank_items = rank_table.items();
+    int dst_num_rows =
+        std::lower_bound(rank_items.begin(), rank_items.end(), offset,
+                         [](const framework::LoDRankTable::TableItem &a,
+                            size_t b) { return a.length > b; }) -
+        rank_items.begin();
+
+    auto *out_var = scope.FindVar(Output("Out"));
+    PADDLE_ENFORCE(out_var != nullptr, "Output Out must be set");
+    auto &out_tensor = *out_var->GetMutable<framework::LoDTensor>();
+    if (dst_num_rows != 0) {
+      out_tensor.ShareDataWith(x_tensor.Slice(0, dst_num_rows));
+    }
+  }
+};
+
+class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto,
+                              framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "");
+    AddInput("RankTable", "");
+    AddInput("I", "");
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class ShrinkRNNMemoryInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"));
+    PADDLE_ENFORCE(context->HasInput("I"));
+    PADDLE_ENFORCE(context->HasInput("RankTable"));
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+  }
+};
+
+class ShrinkRNNMemoryGradOp : public ArrayOp {
+ public:
+  ShrinkRNNMemoryGradOp(const std::string &type,
+                        const framework::VariableNameMap &inputs,
+                        const framework::VariableNameMap &outputs,
+                        const framework::AttributeMap &attrs)
+      : ArrayOp(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out")));
+    auto *dx_var = scope.FindVar(Output(framework::GradVarName("X")));
+    PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr");
+    auto *x_var = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE(x_var != nullptr);
+
+    auto &x_tensor = x_var->Get<framework::LoDTensor>();
+    auto &dx_tensor = *dx_var->GetMutable<framework::LoDTensor>();
+    dx_tensor.Resize(x_tensor.dims());
+    dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
+
+    if (dout_var == nullptr) {  // dx_tensor fill zero
+      math::set_constant(dev_ctx, &dx_tensor, 0.0f);
+    } else {
+      auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
+      auto height = dout_tensor.dims()[0];
+      dx_tensor.Slice(0, static_cast<int>(height))
+          .CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx);
+      if (dx_tensor.dims()[0] < height) {
+        auto rest_tensor = dx_tensor.Slice(
+            static_cast<int>(height), static_cast<int>(dout_tensor.dims()[0]));
+        math::set_constant(dev_ctx, &rest_tensor, 0.0f);
+      }
+    }
+  }
+};
+
+class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"));
+    PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X")));
+    context->SetOutputDim(framework::GradVarName("X"),
+                          context->GetInputDim("X"));
+  }
+};
+
+class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *op = new framework::OpDescBind();
+    op->SetType("shrink_rnn_memory_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(shrink_rnn_memory, ops::ShrinkRNNMemoryOp,
+                  ops::ShrinkRNNMemoryInferShape,
+                  ops::ShrinkRNNMemoryOpProtoMaker, ops::ShrinkRNNGradOpMaker);
+REGISTER_OPERATOR(shrink_rnn_memory_grad, ops::ShrinkRNNMemoryGradOp,
+                  ops::ShrinkRNNMemoryGradInferShape);
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc
index c6b94f5cc947ccb86315fd9058b8c57d1a996927..ed96e8cee5a78e63ea29ed383d06c1258abdc328 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -121,9 +121,11 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Logits")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
+        ctx.device_context());
   }
 };
 
@@ -160,10 +162,12 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(
-        ctx.Input<Tensor>(framework::GradVarName("Loss"))->type());
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<Tensor>(framework::GradVarName("Loss"))->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index b1e58952fdba35183822ad2f9a51d5bcc5e6ad6a..750f96296a8414019265b26095d50eefb7dbb2dd 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -47,20 +47,24 @@ class SumOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
     auto x_vars = ctx.MultiInputVar("X");
     if (x_vars[0]->IsType<framework::LoDTensor>()) {
-      return framework::ToDataType(
-          x_vars[0]->Get<framework::LoDTensor>().type());
+      return framework::OpKernelType(
+          framework::ToDataType(x_vars[0]->Get<framework::LoDTensor>().type()),
+          ctx.device_context());
     } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
-      return framework::ToDataType(
-          x_vars[0]->Get<framework::SelectedRows>().value().type());
+      return framework::OpKernelType(
+          framework::ToDataType(
+              x_vars[0]->Get<framework::SelectedRows>().value().type()),
+          ctx.device_context());
     } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
       auto& array = x_vars[0]->Get<framework::LoDTensorArray>();
       for (auto& each : array) {
         if (each.numel() != 0) {
-          return framework::ToDataType(each.type());
+          return framework::OpKernelType(framework::ToDataType(each.type()),
+                                         ctx.device_context());
         }
       }
     }
diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc
index 50824032ca0e23b6f961928103ea4aa74b6ac23a..eaf6352748729fa04ccc9b9901608cb89f489c28 100644
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -11,48 +11,18 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. */
-#include "paddle/framework/lod_tensor_array.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/operators/array_operator.h"
 
 namespace paddle {
 namespace operators {
-class ArrayOpBase : public framework::OperatorBase {
- public:
-  ArrayOpBase(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {}
-
- protected:
-  size_t GetOffset(const framework::Scope &scope,
-                   const platform::DeviceContext &dev_ctx) const {
-    auto *i = scope.FindVar(Input("I"));
-    PADDLE_ENFORCE(i != nullptr, "I must be set");
-    auto &i_tensor = i->Get<framework::LoDTensor>();
-    PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
-    size_t offset;
-    if (platform::is_gpu_place(i_tensor.place())) {
-      // FIXME: Avoid copy from GPU to CPU
-      framework::Tensor t;
-      t.CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx);
-      dev_ctx.Wait();
-      offset = static_cast<size_t>(*t.data<int64_t>());
-    } else {
-      offset = static_cast<size_t>(*i_tensor.data<int64_t>());
-    }
-    return offset;
-  }
-};
 
-class WriteToArrayOp : public ArrayOpBase {
+class WriteToArrayOp : public ArrayOp {
  public:
   WriteToArrayOp(const std::string &type,
                  const framework::VariableNameMap &inputs,
                  const framework::VariableNameMap &outputs,
                  const framework::AttributeMap &attrs)
-      : ArrayOpBase(type, inputs, outputs, attrs) {}
+      : ArrayOp(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
            const platform::DeviceContext &dev_ctx) const override {
@@ -122,13 +92,13 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
   }
 };
 
-class ReadFromArrayOp : public ArrayOpBase {
+class ReadFromArrayOp : public ArrayOp {
  public:
   ReadFromArrayOp(const std::string &type,
                   const framework::VariableNameMap &inputs,
                   const framework::VariableNameMap &outputs,
                   const framework::AttributeMap &attrs)
-      : ArrayOpBase(type, inputs, outputs, attrs) {}
+      : ArrayOp(type, inputs, outputs, attrs) {}
   void Run(const framework::Scope &scope,
            const platform::DeviceContext &dev_ctx) const override {
     auto *x = scope.FindVar(Input("X"));
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index cd22c561acb6cbd66faf1d3d6ee21779001a2795..7975efc7cf134aaf591385a6866254a9c5f2a0bb 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -63,9 +63,11 @@ class UniformRandomOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+    return framework::OpKernelType(
+        static_cast<framework::DataType>(ctx.Attr<int>("data_type")),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 36450e926891342f37424447703781a33c1190ae..7afcdfce9371e29aad968a1729931173fb2309b5 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -124,6 +124,11 @@ void CUDADeviceContext::Wait() const {
   PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
 }
 
+void CUDADeviceContext::Finish() const {
+  Wait();
+  PADDLE_ENFORCE(cudaGetLastError());
+}
+
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
   return eigen_device_.get();
 }
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index ef5f19214d9ccb23b9c946bee28cb764122bd7cd..526d089e35da9c9f89a3852095ad3a4c82d4d85d 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -46,6 +46,8 @@ class DeviceContext {
   DeviceType* GetEigenDevice() const;
 
   virtual void Wait() const {}
+
+  virtual void Finish() const {}
 };
 
 class CPUDeviceContext : public DeviceContext {
@@ -77,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Wait for all operations completion in the stream. */
   void Wait() const override;
 
+  /*! \brief  Check potential errors for the cuda kernel calls. */
+  void Finish() const override;
+
   /*! \brief  Return place in the device context. */
   Place GetPlace() const override;
 
diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h
index f196868c725cbb91b3df710260c5b60f14d53f37..bb9d59ec0a18ce013632f128c9b5d230255f1ac4 100644
--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
@@ -49,8 +49,6 @@ struct Transform<platform::CPUPlace> {
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
   void operator()(const DeviceContext& context, InputIter first, InputIter last,
                   OutputIter result, UnaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place.");
     std::transform(first, last, result, op);
   }
 
@@ -59,8 +57,6 @@ struct Transform<platform::CPUPlace> {
   void operator()(const DeviceContext& context, InputIter1 first1,
                   InputIter1 last1, InputIter2 first2, OutputIter result,
                   BinaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place.");
     std::transform(first1, last1, first2, result, op);
   }
 };
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 0c528174b2b2b3a27869ed0083fc96b8d90e723b..0f906e0e470b7f95bb2103ae55330fc1831aa78f 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -113,11 +113,13 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCPUTensorSetFromArray<int>)
       .def("set", PyCPUTensorSetFromArray<double>)
       .def("set", PyCPUTensorSetFromArray<int64_t>)
+      .def("set", PyCPUTensorSetFromArray<bool>)
 #ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
       .def("set", PyCUDATensorSetFromArray<double>)
       .def("set", PyCUDATensorSetFromArray<int64_t>)
+      .def("set", PyCUDATensorSetFromArray<bool>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
       .def("set_float_element", TensorSetElement<float>)
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index f278e79af60486bce400f313b80ebbe3971f869b..41fa658502d341fe9653a3e99b58498fcaeada47 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -85,7 +85,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 }  // namespace details
 inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
   auto buffer_info =
-      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t>()(
+      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool>()(
           tensor);
   return buffer_info;
 }
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 53e68648e6c1718cc4d4d9fe074e8deb418d3b1d..256500c56a2e05f981825b6ddb2a843f3ba71a83 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -174,8 +174,6 @@ EOF
 EOF
 }
 
-set +xe
-
 cmake_gen
 run_build
 run_test
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index ebf0911d6ea0b39d51447859ae2aef485b50b0e6..2d7ff1df98a9a448b447890537f20dd416a9ae9d 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -321,6 +321,11 @@ message ClipConfig {
   required double max = 2;
 }
 
+message ScaleSubRegionConfig {
+  required ImageConfig image_conf = 1;
+  required float value = 2;
+}
+
 message LayerInputConfig {
   required string input_layer_name = 1;
   optional string input_parameter_name = 2;
@@ -342,6 +347,7 @@ message LayerInputConfig {
   optional MultiBoxLossConfig multibox_loss_conf = 16;
   optional DetectionOutputConfig detection_output_conf = 17;
   optional ClipConfig clip_conf = 18;
+  optional ScaleSubRegionConfig scale_sub_region_conf = 19;
 }
 
 message LayerConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 0e65598485d8785b3f5b2f1bc7e87f377b35792e..9e2c6f59bd0af1627c79a8a29bd1515ae5c9c6b5 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -3801,6 +3801,25 @@ class SwitchOrderLayer(LayerBase):
         self.config.reshape_conf.width_axis.extend(reshape['width'])
 
 
+@config_layer('scale_sub_region')
+class ScaleSubRegionLayer(LayerBase):
+    def __init__(self, name, inputs, value, **xargs):
+        super(ScaleSubRegionLayer, self).__init__(
+            name, 'scale_sub_region', 0, inputs=inputs, **xargs)
+        scale_sub_region_conf = self.config.inputs[0].scale_sub_region_conf
+        scale_sub_region_conf.value = value
+
+        # get channel, width and height from input_0 layer
+        input_layer = self.get_input_layer(0)
+        image_conf = scale_sub_region_conf.image_conf
+        image_conf.img_size = input_layer.width
+        image_conf.img_size_y = input_layer.height
+        image_conf.channels = input_layer.size / (input_layer.width *
+                                                  input_layer.height)
+        self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
+                           image_conf.channels)
+
+
 # Deprecated, use a new layer specific class instead
 @config_func
 def Layer(name, type, **xargs):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 169e201046a0d7b8c3e85f60946d8c1c762c88f4..9dbfb93217262b0599fd4af75449838041da7a23 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -144,6 +144,7 @@ __all__ = [
     'img_conv3d_layer',
     'resize_layer',
     'sub_seq_layer',
+    'scale_sub_region_layer',
 ]
 
 
@@ -255,6 +256,8 @@ class LayerType(object):
     RESIZE = 'resize'
     SUB_SEQ_LAYER = 'subseq'
 
+    SCALE_SUB_REGION_LAYER = 'scale_sub_region'
+
     @staticmethod
     def is_layer_type(type_name):
         """
@@ -6548,26 +6551,27 @@ def switch_order_layer(input,
 @layer_support()
 def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
     """
-    This layer crops images by offset and shape. User can set crop shape by
-    args 'shape' explicitly or by reference input layer.
+    This layer crops images according to the offset and shape. Users can set
+    the crop shape through the argument 'shape' explicitly or by specifying a
+    reference input layer.
 
     The example usage is:
 
     .. code-block:: python
     crop = crop_layer(input=[image_input, reference_input], axis=2, offset=[2, 3])
 
-    :param input: The input of this layer. If two inputs are given, the second input
-                  will be regarded as reference input.
+    :param input: The input of this layer. If two inputs are given, the second one
+                  will be regarded as the reference.
     :type input: LayerOutput | Sequence
     :param offset: The crop offset.
     :type offset: Sequence
-    :param axis: start axis to be cropped. To image input layer:
+    :param axis: The start axis to be cropped. For image input layer:
         - 0: batch size
         - 1: channels
         - 2: height
         - 3: width
-    :type partial_sum: int
-    :param shape: The shape to be cropped. Default is None.
+    :type axis: int
+    :param shape: The shape to be cropped to. Default is None.
     :type shape: Sequence | None
     :param name: The name of this layer. It is optional.
     :type name: basestring
@@ -6702,9 +6706,9 @@ def seq_slice_layer(input, starts, ends, name=None):
     :type name: basestring
     :param input: The input of this layer, which should be a sequence.
     :type input: LayerOutput
-    :param starts: start indices to slice the input sequence.
+    :param starts: The start indices to slice the input sequence.
     :type starts: LayerOutput | None
-    :param ends: end indices to slice the input sequence.
+    :param ends: The end indices to slice the input sequence.
     :type ends: LayerOutput | None
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -6744,7 +6748,7 @@ def seq_slice_layer(input, starts, ends, name=None):
 @layer_support()
 def kmax_seq_score_layer(input, name=None, beam_size=1):
     """
-    This layer accepts one input which are scores over a sequence or a nested
+    This layer accepts one input which is scores over a sequence or a nested
     sequence, and returns indices of beam_size sequences with highest scores.
 
     .. code-block:: python
@@ -6754,11 +6758,11 @@ def kmax_seq_score_layer(input, name=None, beam_size=1):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input of this layer. It stores scores over a sequence or a nested
-        sequence and its size must be 1.
+    :param input: The input of this layer. It stores scores over a sequence or
+                  a nested sequence and its size must be 1.
     :type input: LayerOutput
-    :param beam_size: sequence indices with top beam_size scores are returned.
-    :type beam_size: double
+    :param beam_size: The indices of the sequences with top beam_size scores are returned.
+    :type beam_size: int
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -6814,38 +6818,42 @@ def img_conv3d_layer(input,
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param filter_size: The x dimension of a filter kernel. Or input a list.
+    :param filter_size: The dimensions of the filter kernel along three axises. If the parameter
+                        is set to one integer, the three dimensions will be same.
     :type filter_size: int | tuple | list
-    :param num_filters: Each filter group's number of filter
+    :param num_filters: The number of filters in each group.
+    :type num_filters: int
     :param act: Activation type. ReluActivation is the default.
     :type act: BaseActivation
-    :param groups: Group size of filters.
+    :param groups: The number of the filter groups.
     :type groups: int
-    :param stride: The x dimension of the stride. Or input a tuple for two image
-                   dimension.
+    :param stride: The strides of the convolution along three axises. If the parameter
+                   is set to one integer, the three strides will be same.
     :type stride: int | tuple | list
-    :param padding: The x dimension of the padding. Or input a tuple for two
-                    image dimension
+    :param padding: The numbers of padding along three axises. If the parameter is set to
+                    one integer, they will be same.
     :type padding: int | tuple | list
-    :param bias_attr: Convolution bias attribute. None means default bias.
-                      False means no bias.
+    :param bias_attr: The Bias Attribute. If the parameter is set to
+                      False or something not type of ParameterAttribute,
+                      no bias is defined. If the parameter is set to
+                      True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param num_channels: number of input channels. If None will be set
-                        automatically from previous output.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None,  its actual value will be automatically set to
+                         the channels number of the input .
     :type num_channels: int
-    :param param_attr: Convolution param attribute. None means default attribute
+    :param param_attr: The parameter attribute of the convolution.
     :type param_attr: ParameterAttribute
-    :param shared_biases: Is biases will be shared between filters or not.
+    :param shared_biases: Whether biases will be shared between filters or not.
     :type shared_biases: bool
-    :param layer_attr: Layer Extra Attribute.
+    :param layer_attr: Extra layer attributes.
     :type layer_attr: ExtraLayerAttribute
-    :param trans: true if it is a convTransLayer, false if it is a convLayer
+    :param trans: True if it is a convTransLayer, False if it is a convLayer
     :type trans: bool
-    :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt" or "cudnn_convt",
-                       otherwise layer_type has to be either "exconv" or
-                       "cudnn_conv"
-    :type layer_type: String
+    :param layer_type: Specify the layer_type. If the parameter is set, it must be "deconv3d"
+                       when trans=True. If not set, it will be automatically set to "deconv3d"
+                       when trans=True and "conv3d" when trans=False.
+    :type layer_type: basestring
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -6927,7 +6935,7 @@ def img_conv3d_layer(input,
 def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
     """
     A layer applies a linear transformation to each element in each row of
-    the input matrix. For each element, the layer first re-scale it and then
+    the input matrix. For each element, the layer first re-scales it and then
     adds a bias to it.
 
     This layer is very like the SlopeInterceptLayer, except the scale and
@@ -7001,12 +7009,12 @@ def sub_seq_layer(input, offsets, sizes, act=None, bias_attr=None, name=None):
     :type name: basestring
     :param input: The input of this layer, which should be sequence.
     :type input: LayerOutput
-    :param offsets: offset indices to slice the input sequence, which should be
-                    sequence type.
+    :param offsets: The offset indices to slice the input sequence, which should
+                    be sequence type.
     :type offsets: LayerOutput
-    :param sizes: sizes of the sub-sequences, which should be sequence type.
+    :param sizes: The sizes of the sub-sequences, which should be sequence type.
     :type sizes: LayerOutput
-    :param act: Layer activation, default is LinearActivation
+    :param act: Activation type, LinearActivation is the default.
     :type act: BaseActivation.
     :param bias_attr: The Bias Attribute. If the parameter is set to
                       False or something not type of ParameterAttribute,
@@ -7037,3 +7045,54 @@ def sub_seq_layer(input, offsets, sizes, act=None, bias_attr=None, name=None):
         LayerType.SUB_SEQ_LAYER,
         parents=[input, offsets, sizes],
         size=input.size)
+
+
+@wrap_name_default('scale_sub_region')
+def scale_sub_region_layer(input, indices, value, name=None):
+    """
+    Given an image or feature map with CHW information, scale_sub_region_layer
+    can be used to multiply a real value to values of a sub continuous region.
+    You can provide start and end indices of CHW for each instance.
+    Please notice that all start indices are counting from 1.
+    The shape of indices should be [batch_size, 6] and the layout for each row
+    is [C_Start, C_End, H_Start, H_End, W_Start, W_End].
+
+    .. code-block:: python
+
+        scale_sub_region = scale_sub_region_layer(input=input,
+                                                  indices=indices,
+                                                  value=value)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer which should contains CHW information.
+    :type input: LayerOutput
+    :param indices: Start index and end index for C H W, the input value should
+                    be a 2-D matrix with shape [batch_size, 6].
+    :type indices: LayerOutput.
+    :param value: value to multiply.
+    :type value: float
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of scale_sub_region_layer, '
+        'must be a PaddlePaddle layer.')
+    assert isinstance(indices, LayerOutput), (
+        'The start and end indices for CHW, must be a PaddlePaddle layer.')
+    assert isinstance(value, float), (
+        'The value to multiply, must be a real value.')
+
+    Layer(
+        name=name,
+        type=LayerType.SCALE_SUB_REGION_LAYER,
+        inputs=[input.name, indices.name],
+        value=value)
+
+    return LayerOutput(
+        name,
+        LayerType.SCALE_SUB_REGION_LAYER,
+        parents=[input, indices],
+        num_filters=input.num_filters,
+        size=input.size)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 6a4550c209762362d40f8a2afaf526a1fe53ca6b..42aaed7a6469342086b8273eb5b80eaea905f851 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -10,6 +10,6 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
 test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
 test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer
-test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer)
+test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..d20133a10ec605654bd3744297673068a77020b8
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr
@@ -0,0 +1,51 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 2016
+  active_type: ""
+  height: 48
+  width: 42
+}
+layers {
+  name: "indices"
+  type: "data"
+  size: 6
+  active_type: ""
+}
+layers {
+  name: "__scale_sub_region_0__"
+  type: "scale_sub_region"
+  size: 2016
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    scale_sub_region_conf {
+      image_conf {
+        channels: 1
+        img_size: 42
+        img_size_y: 48
+      }
+      value: 0.0
+    }
+  }
+  inputs {
+    input_layer_name: "indices"
+  }
+  height: 48
+  width: 42
+}
+input_layer_names: "data"
+input_layer_names: "indices"
+output_layer_names: "__scale_sub_region_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "indices"
+  layer_names: "__scale_sub_region_0__"
+  input_layer_names: "data"
+  input_layer_names: "indices"
+  output_layer_names: "__scale_sub_region_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d4bf28bf1eaf58e1fd0eb62fd10efe998587edd
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
@@ -0,0 +1,11 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+data = data_layer(name='data', size=2016, height=48, width=42)
+indices = data_layer(name='indices', size=6)
+
+scale_sub_region = scale_sub_region_layer(
+    input=data, indices=indices, value=0.0)
+
+outputs(scale_sub_region)
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index ce60aa21c2ad1fb8f089d19d548b59a8c806d1ee..98b97c75ca72f11c105535e0f2a5fa0201db5d42 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -22,6 +22,7 @@ parse training set and test set into paddle reader creators.
 import numpy as np
 import os
 import paddle.v2.dataset.common
+from paddle.v2.parameters import Parameters
 
 __all__ = ['train', 'test']
 
@@ -34,7 +35,8 @@ feature_names = [
 
 UCI_TRAIN_DATA = None
 UCI_TEST_DATA = None
-
+URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar'
+MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'
 
 def feature_range(maximums, minimums):
     import matplotlib
@@ -111,6 +113,13 @@ def test():
     return reader
 
 
+def model():
+    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', MD5_MODEL)
+    with open(tar_file, 'r') as f:
+        parameters = Parameters.from_tar(f)
+    return parameters
+
+
 def fetch():
     paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)
 
diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py
index 917d3d938862640145591c5084c9f0bc4d7c23bf..4c6703cd8b1e7a959ae70ec76d7578eda3a81f25 100644
--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/framework/layers.py
@@ -87,7 +87,8 @@ def data(name,
          type=core.VarDesc.VarType.LOD_TENSOR,
          append_batch_size=True,
          main_program=None,
-         startup_program=None):
+         startup_program=None,
+         stop_gradient=True):
     helper = LayerHelper('data', **locals())
     shape = list(shape)
     for i in xrange(len(shape)):
@@ -101,7 +102,11 @@ def data(name,
         shape = [-1] + shape  # append batch size as -1
 
     return helper.create_global_variable(
-        name=name, shape=shape, dtype=data_type, type=type, stop_gradient=True)
+        name=name,
+        shape=shape,
+        dtype=data_type,
+        type=type,
+        stop_gradient=stop_gradient)
 
 
 def _convert_(name):
@@ -134,9 +139,7 @@ def _create_op_func_(op_type):
     o_name = not_intermediate_outputs[0].name
     intermediate_output_names = [output.name for output in intermediate_outputs]
 
-    def func(**kwargs):
-        helper = LayerHelper(op_type, **kwargs)
-        inputs = dict()
+    def infer_and_check_data_type(op_proto, **kwargs):
         dtype = None
         for ipt in op_proto.inputs:
             name = _convert_(ipt.name)
@@ -153,6 +156,20 @@ def _create_op_func_(op_type):
                 elif dtype != each.data_type:
                     raise ValueError(
                         "operator {0} must input same dtype".format(op_type))
+
+        return dtype
+
+    def func(**kwargs):
+        helper = LayerHelper(op_type, **kwargs)
+
+        dtype = infer_and_check_data_type(op_proto, **kwargs)
+
+        inputs = dict()
+        for ipt in op_proto.inputs:
+            name = _convert_(ipt.name)
+            val = kwargs.pop(name, [])
+            if not isinstance(val, list) and not isinstance(val, tuple):
+                val = [val]
             inputs[ipt.name] = val
 
         outputs = dict()
@@ -178,6 +195,20 @@ _create_op_func_('reshape')
 _create_op_func_('elementwise_add')
 _create_op_func_('sigmoid')
 _create_op_func_('scale')
+_create_op_func_('reshape')
+_create_op_func_('transpose')
+
+
+def fill_constant(data_type, shape, value=None, program=None):
+    helper = LayerHelper('fill_constant', **locals())
+    out = helper.create_tmp_variable(dtype=data_type)
+    helper.append_op(
+        type='fill_constant',
+        outputs={'Out': [out]},
+        attrs={'data_type': data_type,
+               'shape': shape,
+               'value': value})
+    return out
 
 
 def cast(x, data_type, main_program=None):
@@ -414,9 +445,9 @@ def pool2d(input,
         inputs={"X": input},
         outputs={"Out": pool_out},
         attrs={
-            "poolingType": pool_type,
+            "pooling_type": pool_type,
             "ksize": pool_size,
-            "globalPooling": global_pooling,
+            "global_pooling": global_pooling,
             "strides": pool_stride,
             "paddings": pool_padding
         })
@@ -762,6 +793,46 @@ class StaticRNN(object):
             })
 
 
+def lstm(x,
+         c_pre_init,
+         hidden_dim,
+         forget_bias=None,
+         main_program=None,
+         startup_program=None):
+    helper = LayerHelper('lstm_unit', **locals())
+    rnn = StaticRNN()
+    with rnn.step():
+        c_pre = rnn.memory(init=c_pre_init)
+        x_t = rnn.step_input(x)
+
+        before_fc = concat(
+            input=[x_t, c_pre],
+            axis=1,
+            main_program=main_program,
+            startup_program=startup_program)
+        after_fc = fc(input=before_fc,
+                      size=hidden_dim * 4,
+                      main_program=main_program,
+                      startup_program=startup_program)
+
+        data_type = x.data_type
+        c = helper.create_tmp_variable(data_type)
+        h = helper.create_tmp_variable(data_type)
+
+        helper.append_op(
+            type='lstm_unit',
+            inputs={"X": after_fc,
+                    "C_prev": c_pre},
+            outputs={"C": c,
+                     "H": h},
+            attrs={"forget_bias": forget_bias})
+
+        rnn.update_memory(c_pre, c)
+        rnn.output(h)
+
+    return rnn()
+
+
 def lod_rank_table(x, level=0, main_program=None):
     helper = LayerHelper("lod_rank_table", **locals())
     table = helper.create_variable(
@@ -775,6 +846,31 @@ def lod_rank_table(x, level=0, main_program=None):
     return table
 
 
+def lod_tensor_to_array(x, table, main_program=None):
+    helper = LayerHelper("lod_tensor_to_array", **locals())
+    array = helper.create_variable(
+        name=unique_name("lod_tensor_to_array"),
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype=x.data_type)
+    helper.append_op(
+        type='lod_tensor_to_array',
+        inputs={'X': x,
+                'RankTable': table},
+        outputs={'Out': array})
+    return array
+
+
+def array_to_lod_tensor(x, table, main_program=None):
+    helper = LayerHelper("array_to_lod_tensor", **locals())
+    tmp = helper.create_tmp_variable(dtype=x.data_type)
+    helper.append_op(
+        type="array_to_lod_tensor",
+        inputs={'X': x,
+                'RankTable': table},
+        outputs={'Out': tmp})
+    return tmp
+
+
 def fill_constant(shape, dtype, value, main_program=None):
     helper = LayerHelper("ones", **locals())
     out = helper.create_tmp_variable(dtype=dtype)
@@ -801,13 +897,13 @@ def zeros(shape, dtype, main_program=None):
 
 def increment(x, value=1.0, main_program=None):
     helper = LayerHelper("increment", **locals())
-    tmp = helper.create_tmp_variable(dtype=x.data_type)
+    out = helper.create_tmp_variable(dtype=x.data_type)
     helper.append_op(
         type='increment',
         inputs={'X': [x]},
-        outputs={'Out': [tmp]},
+        outputs={'Out': [out]},
         attrs={'step': value})
-    return tmp
+    return out
 
 
 def array_write(x, i, array=None, main_program=None):
@@ -838,3 +934,16 @@ def array_read(array, i, main_program=None):
                 'I': [i]},
         outputs={'Out': [out]})
     return out
+
+
+def shrink_memory(x, i, table, main_program=None):
+    helper = LayerHelper('shrink_memory', **locals())
+    out = helper.create_tmp_variable(dtype=x.data_type)
+    helper.append_op(
+        type='shrink_rnn_memory',
+        inputs={'X': [x],
+                'I': [i],
+                'RankTable': [table]},
+        outputs={'Out': [out]},
+        attrs={})
+    return out
diff --git a/python/paddle/v2/framework/tests/test_clip_by_norm_op.py b/python/paddle/v2/framework/tests/test_clip_by_norm_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..02f6108a3a661b0e32cd2e7ed65cb4b8cb50c067
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_clip_by_norm_op.py
@@ -0,0 +1,50 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestClipByNormOp(OpTest):
+    def setUp(self):
+        self.max_relative_error = 0.006
+        self.initTestCase()
+        input = np.random.random(self.shape).astype("float32")
+        input[np.abs(input) < self.max_relative_error] = 0.5
+        self.op_type = "clip_by_norm"
+        self.inputs = {'X': input, }
+        self.attrs = {}
+        self.attrs['max_norm'] = self.max_norm
+        norm = np.sqrt(np.sum(np.square(input)))
+        if norm > self.max_norm:
+            output = self.max_norm * input / norm
+        else:
+            output = input
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1.0
+
+
+class TestCase1(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1e20
+
+
+class TestCase2(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (16, 16)
+        self.max_norm = 0.1
+
+
+class TestCase3(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max_norm = 1.0
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_compare_op.py b/python/paddle/v2/framework/tests/test_compare_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb0256694d77323f12c50856533e93b090dc6198
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_compare_op.py
@@ -0,0 +1,29 @@
+import op_test
+import unittest
+import numpy
+
+
+def create_test_class(op_type, typename, callback):
+    class Cls(op_test.OpTest):
+        def setUp(self):
+            a = numpy.random.random(size=(10, 7)).astype(typename)
+            b = numpy.random.random(size=(10, 7)).astype(typename)
+            c = callback(a, b)
+            self.inputs = {'X': a, 'Y': b}
+            self.outputs = {'Out': c}
+            self.op_type = op_type
+
+        def test_output(self):
+            self.check_output()
+
+    cls_name = "{0}_{1}".format(op_type, typename)
+    Cls.__name__ = cls_name
+    globals()[cls_name] = Cls
+
+
+for _type_name in {'float32', 'float64', 'int32', 'int64'}:
+    create_test_class('less_than', _type_name, lambda _a, _b: _a < _b)
+    create_test_class('equal', _type_name, lambda _a, _b: _a == _b)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_lod_rank_table.py b/python/paddle/v2/framework/tests/test_lod_rank_table.py
index 2242d4391dc7c12e59f4f157b34b0525efcbc9a1..408145c10f46e24e8a54b05b4f3afa9231b6ffd6 100644
--- a/python/paddle/v2/framework/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/framework/tests/test_lod_rank_table.py
@@ -18,7 +18,6 @@ class TestLoDRankTable(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(numpy.random.random(size=(17, 100)), cpu)
         tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
-
         exe.run(g_main_program, scope=scope, feed={'x': tensor})
         var = scope.find_var(rank_table.name)
         table = var.get_lod_rank_table()
diff --git a/python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9713666b3f64d7a39afadab7da6b22f149b8cf8
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py
@@ -0,0 +1,165 @@
+import unittest
+import paddle.v2.framework.core as core
+import numpy
+import paddle.v2.framework.layers as layers
+from paddle.v2.framework.framework import Program
+from paddle.v2.framework.executor import Executor
+from paddle.v2.framework.backward import append_backward_ops
+
+
+class TestCPULoDTensorArrayOps(unittest.TestCase):
+    def place(self):
+        return core.CPUPlace()
+
+    def test_lod_tensor_to_array_level_0(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 3, 9, 10]])
+        expect = map(lambda x: numpy.array(x).astype('int32'),
+                     [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
+        self.main(tensor=tensor, expect_array=expect, expect_lod=[] * 6)
+
+    def test_lod_tensor_to_array_level_0_empty_seq(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 3, 9, 9, 10]])
+        expect = map(lambda x: numpy.array(x).astype('int32'),
+                     [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
+        self.main(tensor=tensor, expect_array=expect, expect_lod=[] * 6)
+
+    def test_lod_tensor_to_array_level_1(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(20).reshape(20, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]])
+
+        expect = [
+            numpy.array(
+                [9, 10, 0, 1, 2], dtype='int32'), numpy.array(
+                    [11, 12, 13, 14, 15, 16, 3, 4, 5, 6, 7, 8], dtype='int32'),
+            numpy.array(
+                [17, 18, 19], dtype='int32')
+        ]
+
+        lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]]
+        self.main(tensor=tensor, expect_array=expect, expect_lod=lod)
+
+    def test_lod_tensor_to_array_level_1_empty_seq(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(31).reshape(31, 1).astype('int32'), self.place())
+
+        tensor.set_lod([[0, 3, 5, 9, 11],
+                        [0, 3, 7, 11, 11, 12, 17, 19, 21, 23, 30, 31]])
+
+        expect = [
+            numpy.array(
+                item, dtype='int32')
+            for item in [[
+                12, 13, 14, 15, 16, 0, 1, 2, 23, 24, 25, 26, 27, 28, 29
+            ], [17, 18, 3, 4, 5, 6, 11, 30], [19, 20, 7, 8, 9, 10], [21, 22]]
+        ]
+
+        lod = [[[0, 5, 8, 8, 15]], [[0, 2, 6, 7, 8]], [[0, 2, 6]], [[0, 2]]]
+        self.main(tensor=tensor, expect_array=expect, expect_lod=lod)
+
+    def test_lod_tensor_to_array_level_2(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
+                        [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
+
+        expect = [
+            numpy.array(
+                item, dtype='int32')
+            for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], range(
+                22, 39) + range(7, 21), range(39, 46)]
+        ]
+        lod = [[[0, 1, 3, 4], [0, 1, 4, 8, 12]],
+               [[0, 4, 7], [0, 1, 5, 9, 17, 21, 27, 31]], [[0, 2], [0, 6, 7]]]
+        self.main(tensor=tensor, expect_array=expect, expect_lod=lod)
+
+    def test_lod_tensor_to_array_level_2_skip_level(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
+                        [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
+        self.main(tensor=tensor, expect_array=None, expect_lod=None, level=1)
+
+    def main(self, tensor, expect_array, expect_lod, level=0):
+        place = self.place()
+        program = Program()
+        x = layers.data(name='x', shape=[10], main_program=program)
+        x.persistable = True
+        table = layers.lod_rank_table(x, level=level, main_program=program)
+        array = layers.lod_tensor_to_array(x, table, main_program=program)
+        array.persistable = True
+
+        result = layers.array_to_lod_tensor(array, table, main_program=program)
+        result.persistable = True
+        exe = Executor(place)
+        scope = core.Scope()
+        exe.run(program, feed={'x': tensor}, scope=scope)
+        var = scope.find_var(array.name)
+        array = var.get_lod_tensor_array()
+        if expect_array is not None and expect_lod is not None:
+            self.check_array_same(array, expect_array, expect_lod)
+        self.check_tensor_same(scope.find_var(result.name).get_tensor(), tensor)
+
+    def check_array_same(self, array, expect_tensor, expect_lod):
+        self.assertEqual(len(expect_tensor), len(array))
+        for i, exp in enumerate(zip(expect_tensor, expect_lod)):
+            exp_tensor, exp_lod = exp
+            exp_tensor = numpy.expand_dims(exp_tensor, axis=1)
+            self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i])))
+            self.assertEqual(exp_lod, array[i].lod())
+
+    def check_tensor_same(self, actual, expect):
+        self.assertTrue(
+            numpy.allclose(numpy.array(actual), numpy.array(expect)))
+        self.assertEqual(actual.lod(), expect.lod())
+
+
+class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
+    def test_grad(self):
+        place = core.CPUPlace()
+        program = Program()
+
+        x = layers.data(
+            name='x',
+            shape=[1],
+            data_type='float32',
+            main_program=program,
+            stop_gradient=False)
+        table = layers.lod_rank_table(x, level=0, main_program=program)
+        array = layers.lod_tensor_to_array(x, table, main_program=program)
+        result = layers.array_to_lod_tensor(array, table, main_program=program)
+
+        mean = layers.mean(x=result, main_program=program)
+
+        append_backward_ops(mean)
+
+        tensor = core.LoDTensor()
+        tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
+        tensor.set_lod([[0, 3, 9, 10]])
+
+        g_vars = program.global_block().var(x.name + "@GRAD")
+
+        exe = Executor(place)
+        g_out = [
+            item.sum()
+            for item in map(
+                numpy.array,
+                exe.run(program, feed={'x': tensor}, fetch_list=[g_vars]))
+        ]
+        g_out_sum = numpy.array(g_out).sum()
+
+        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_pool2d_op.py b/python/paddle/v2/framework/tests/test_pool2d_op.py
index c93469e11994c44ee6fbd1a8828074c1558c08fa..ac3fa6aa87835b3cd6fb9bbf6fe66b1d0c577ca2 100644
--- a/python/paddle/v2/framework/tests/test_pool2d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool2d_op.py
@@ -61,8 +61,8 @@ class TestPool2d_Op(OpTest):
             'strides': self.strides,
             'paddings': self.paddings,
             'ksize': self.ksize,
-            'poolingType': self.pool_type,
-            'globalPooling': self.global_pool,
+            'pooling_type': self.pool_type,
+            'global_pooling': self.global_pool,
         }
 
         self.outputs = {'Out': output.astype('float32')}
diff --git a/python/paddle/v2/framework/tests/test_pool3d_op.py b/python/paddle/v2/framework/tests/test_pool3d_op.py
index 416f0df7cd27f58c4c99fb776b84e44005f31639..87483ae5e568c01141ff789f37e84069cb8e827d 100644
--- a/python/paddle/v2/framework/tests/test_pool3d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool3d_op.py
@@ -67,8 +67,8 @@ class TestPool3d_Op(OpTest):
             'strides': self.strides,
             'paddings': self.paddings,
             'ksize': self.ksize,
-            'poolingType': self.pool_type,
-            'globalPooling': self.global_pool,
+            'pooling_type': self.pool_type,
+            'global_pooling': self.global_pool,
         }
 
         self.outputs = {'Out': output.astype('float32')}
diff --git a/python/paddle/v2/framework/tests/test_pool_max_op.py b/python/paddle/v2/framework/tests/test_pool_max_op.py
index cc1a867761142edea506a24e84ad31bfe6858fb0..04843a28ac19e076e097d1aa1034bcf9378aa495 100644
--- a/python/paddle/v2/framework/tests/test_pool_max_op.py
+++ b/python/paddle/v2/framework/tests/test_pool_max_op.py
@@ -86,7 +86,7 @@ class TestMaxPoolWithIndex_Op(OpTest):
             'strides': self.strides,
             'paddings': self.paddings,
             'ksize': self.ksize,
-            'globalPooling': self.global_pool,
+            'global_pooling': self.global_pool,
         }
 
         self.inputs = {'X': input}
diff --git a/python/paddle/v2/framework/tests/test_shrink_rnn_memory.py b/python/paddle/v2/framework/tests/test_shrink_rnn_memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..2090455b969806685b525f1e588b6570e3072430
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_shrink_rnn_memory.py
@@ -0,0 +1,47 @@
+import unittest
+import paddle.v2.framework.core as core
+from paddle.v2.framework.executor import Executor
+import paddle.v2.framework.layers as layers
+from paddle.v2.framework.backward import append_backward_ops
+from paddle.v2.framework.framework import g_main_program
+import numpy
+
+
+class TestShrinkRNNMemory(unittest.TestCase):
+    def test_shrink_rnn_memory(self):
+        x = layers.data('x', shape=[100], data_type='float32')
+        x.stop_gradient = False
+        table = layers.lod_rank_table(x=x)
+        i = layers.zeros(dtype='int64', shape=[1])
+        mem1 = layers.shrink_memory(x=x, i=i, table=table)
+        i = layers.increment(x=i)
+        i.stop_gradient = True
+        mem2 = layers.shrink_memory(x=mem1, i=i, table=table)
+        i = layers.increment(x=i)
+        i.stop_gradient = True
+        mem3 = layers.shrink_memory(x=mem2, i=i, table=table)
+
+        cpu = core.CPUPlace()
+        tensor = core.LoDTensor()
+        tensor.set_lod([[0, 2, 5, 6]])
+        tensor_np = numpy.random.random(size=(3, 100)).astype('float32')
+        tensor.set(tensor_np, cpu)
+        exe = Executor(cpu)
+        outs = map(numpy.array,
+                   exe.run(feed={'x': tensor}, fetch_list=[mem1, mem2, mem3]))
+        self.assertTrue(numpy.allclose(tensor_np[0:3], outs[0]))
+        self.assertTrue(numpy.allclose(tensor_np[0:2], outs[1]))
+        self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2]))
+
+        mem3_mean = layers.mean(x=mem3)
+        append_backward_ops(loss=mem3_mean)
+        x_grad = map(numpy.array,
+                     exe.run(feed={'x': tensor},
+                             fetch_list=[
+                                 g_main_program.global_block().var('x@GRAD')
+                             ]))[0]
+        self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py b/python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..26cbd01bc04916e53554e6f70bee7bcf25d6371c
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py
@@ -0,0 +1,107 @@
+import paddle.v2 as paddle
+import paddle.v2.framework.layers as layers
+import paddle.v2.framework.core as core
+import paddle.v2.framework.optimizer as optimizer
+
+from paddle.v2.framework.framework import g_main_program, g_startup_program
+from paddle.v2.framework.executor import Executor
+
+import numpy as np
+
+
+def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
+    data = layers.data(
+        name="words",
+        shape=[seq_len * batch_size, 1],
+        append_batch_size=False,
+        data_type="int64")
+    label = layers.data(
+        name="label",
+        shape=[batch_size, 1],
+        append_batch_size=False,
+        data_type="int64")
+
+    emb = layers.embedding(input=data, size=[dict_dim, emb_dim])
+    emb = layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim])
+    emb = layers.transpose(x=emb, axis=[1, 0, 2])
+
+    c_pre_init = layers.fill_constant(
+        dtype=emb.data_type, shape=[batch_size, emb_dim], value=0.0)
+    layer_1_out = layers.lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
+    layer_1_out = layers.transpose(x=layer_1_out, axis=[1, 0, 2])
+
+    prediction = layers.fc(input=layer_1_out, size=class_dim, act="softmax")
+    cost = layers.cross_entropy(input=prediction, label=label)
+
+    avg_cost = layers.mean(x=cost)
+    adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
+    opts = adam_optimizer.minimize(avg_cost)
+    acc = layers.accuracy(input=prediction, label=label)
+
+    return avg_cost, acc
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def chop_data(data, chop_len=80, batch_len=50):
+    data = [(x[0][:chop_len], x[1]) for x in data if len(x[0]) >= chop_len]
+
+    return data[:batch_len]
+
+
+def prepare_feed_data(data, place):
+    tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
+
+    label = np.array(map(lambda x: x[1], data)).astype("int64")
+    label = label.reshape([50, 1])
+    tensor_label = core.LoDTensor()
+    tensor_label.set(label, place)
+
+    return tensor_words, tensor_label
+
+
+def main():
+    word_dict = paddle.dataset.imdb.word_dict()
+    cost, acc = lstm_net(dict_dim=len(word_dict), class_dim=2)
+
+    batch_size = 100
+    train_data = paddle.batch(
+        paddle.reader.buffered(
+            paddle.dataset.imdb.train(word_dict), size=batch_size * 10),
+        batch_size=batch_size)
+
+    data = chop_data(next(train_data()))
+
+    place = core.CPUPlace()
+    tensor_words, tensor_label = prepare_feed_data(data, place)
+    exe = Executor(place)
+    exe.run(g_startup_program)
+
+    while True:
+        outs = exe.run(g_main_program,
+                       feed={"words": tensor_words,
+                             "label": tensor_label},
+                       fetch_list=[cost, acc])
+        cost_val = np.array(outs[0])
+        acc_val = np.array(outs[1])
+
+        print("cost=" + str(cost_val) + " acc=" + str(acc_val))
+        if acc_val > 0.9:
+            break
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/framework/tests/test_word2vec.py b/python/paddle/v2/framework/tests/test_word2vec.py
index 6c3a448ec79717da6b81f51d9a4dccf038539b79..116854c97b3731e8dce2adac944df47bb8c5da7e 100644
--- a/python/paddle/v2/framework/tests/test_word2vec.py
+++ b/python/paddle/v2/framework/tests/test_word2vec.py
@@ -118,6 +118,10 @@ train_reader = paddle.batch(
 place = core.CPUPlace()
 exe = Executor(place)
 
+# fix https://github.com/PaddlePaddle/Paddle/issues/5434 then remove
+# below exit line.
+exit(0)
+
 exe.run(startup_program, feed={}, fetch_list=[])
 PASS_NUM = 100
 for pass_id in range(PASS_NUM):
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
index 965d965335a56a97448bd8c738b03eceaee550e2..7408ea8ef611ddfa74dc5bb6ef45d4e0ccb9d141 100644
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
@@ -1,33 +1,35 @@
-import numpy as np
-try:
-    import cv2
-except ImportError:
-    cv2 = None
-import os
-import tarfile
-import cPickle
-
-__all__ = [
-    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
-    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
-    "batch_images_from_tar"
-]
 """
 This file contains some common interfaces for image preprocess.
 Many users are confused about the image layout. We introduce
 the image layout as follows.
 
 - CHW Layout
+
   - The abbreviations: C=channel, H=Height, W=Width
   - The default layout of image opened by cv2 or PIL is HWC.
     PaddlePaddle only supports the CHW layout. And CHW is simply
     a transpose of HWC. It must transpose the input image.
 
 - Color format: RGB or BGR
+
   OpenCV use BGR color format. PIL use RGB color format. Both
   formats can be used for training. Noted that, the format should
   be keep consistent between the training and inference peroid.
 """
+import numpy as np
+try:
+    import cv2
+except ImportError:
+    cv2 = None
+import os
+import tarfile
+import cPickle
+
+__all__ = [
+    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
+    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
+    "batch_images_from_tar"
+]
 
 
 def batch_images_from_tar(data_file,
@@ -36,17 +38,18 @@ def batch_images_from_tar(data_file,
                           num_per_batch=1024):
     """
     Read images from tar file and batch them into batch file.
-    param data_file: path of image tar file
-    type data_file: string
-    param dataset_name: 'train','test' or 'valid'
-    type dataset_name: string
-    param img2label: a dic with image file name as key 
+
+    :param data_file: path of image tar file
+    :type data_file: string
+    :param dataset_name: 'train','test' or 'valid'
+    :type dataset_name: string
+    :param img2label: a dic with image file name as key 
                     and image's label as value
-    type img2label: dic
-    param num_per_batch: image number per batch file
-    type num_per_batch: int
-    return: path of list file containing paths of batch file
-    rtype: string
+    :type img2label: dic
+    :param num_per_batch: image number per batch file
+    :type num_per_batch: int
+    :return: path of list file containing paths of batch file
+    :rtype: string
     """
     batch_dir = data_file + "_batch"
     out_path = "%s/%s" % (batch_dir, dataset_name)
@@ -99,14 +102,16 @@ def load_image_bytes(bytes, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         with open('cat.jpg') as f:
             im = load_image_bytes(f.read())
 
     :param bytes: the input image bytes array.
-    :type file: str
+    :type bytes: str
     :param is_color: If set is_color True, it will load and
                      return a color image. Otherwise, it will
                      load and return a gray image.
+    :type is_color: bool
     """
     flag = 1 if is_color else 0
     file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
@@ -121,6 +126,7 @@ def load_image(file, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         im = load_image('cat.jpg')
 
     :param file: the input image path.
@@ -128,6 +134,7 @@ def load_image(file, is_color=True):
     :param is_color: If set is_color True, it will load and
                      return a color image. Otherwise, it will
                      load and return a gray image.
+    :type is_color: bool
     """
     # cv2.IMAGE_COLOR for OpenCV3
     # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
@@ -147,6 +154,7 @@ def resize_short(im, size):
     Example usage:
     
     .. code-block:: python
+
         im = load_image('cat.jpg')
         im = resize_short(im, 256)
     
@@ -175,6 +183,7 @@ def to_chw(im, order=(2, 0, 1)):
     Example usage:
     
     .. code-block:: python
+
         im = load_image('cat.jpg')
         im = resize_short(im, 256)
         im = to_chw(im)
@@ -196,6 +205,7 @@ def center_crop(im, size, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         im = center_crop(im, 224)
     
     :param im: the input image with HWC layout.
@@ -223,6 +233,7 @@ def random_crop(im, size, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         im = random_crop(im, 224)
     
     :param im: the input image with HWC layout.
@@ -251,6 +262,7 @@ def left_right_flip(im):
     Example usage:
     
     .. code-block:: python
+
         im = left_right_flip(im)
     
     :paam im: input image with HWC layout
@@ -275,6 +287,7 @@ def simple_transform(im,
     Example usage:
     
     .. code-block:: python
+
         im = simple_transform(im, 256, 224, True)
 
     :param im: The input image with HWC layout.
@@ -285,6 +298,11 @@ def simple_transform(im,
     :type crop_size: int
     :param is_train: Whether it is training or not.
     :type is_train: bool
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :param mean: the mean values, which can be element-wise mean values or 
+                 mean values per channel.
+    :type mean: numpy array | list
     """
     im = resize_short(im, resize_size)
     if is_train:
@@ -324,6 +342,7 @@ def load_and_transform(filename,
     Example usage:
     
     .. code-block:: python
+
         im = load_and_transform('cat.jpg', 256, 224, True)
 
     :param filename: The file name of input image.
@@ -334,6 +353,11 @@ def load_and_transform(filename,
     :type crop_size: int
     :param is_train: Whether it is training or not.
     :type is_train: bool
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :param mean: the mean values, which can be element-wise mean values or 
+                 mean values per channel.
+    :type mean: numpy array | list
     """
     im = load_image(filename)
     im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)