Commit c8c4b6e4 authored by wwhu

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into clip_by_norm

update
@@ -30,6 +30,7 @@ addons:
      - automake
      - libtool
      - ccache
  ssh_known_hosts: 52.76.173.135
before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
@@ -42,6 +43,14 @@ script:
  - |
    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
  - |
    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
    if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
    export DOCS_DIR=`pwd`
    cd ..
    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc
notifications:
  email:
    on_success: change
...
@@ -126,7 +126,7 @@ include(external/swig)    # download, build, install swig
include(external/warpctc)  # download, build, install warpctc
include(external/any)      # download libn::any
include(external/eigen)    # download eigen3
include(external/pybind11) # download pybind11
include(external/nccl)
include(cudnn)             # set cudnn libraries, must before configure
...
@@ -23,7 +23,7 @@ On each machine, we will test and compare the performance of training on single
## Benchmark Model
### Server
Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
Input image size - 3 * 224 * 224, Time: images/second
...
#!/usr/bin/env python
from paddle.trainer_config_helpers import *

height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg("layer_num", int, 50)
is_test = get_config_arg("is_test", bool, False)

args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
    "train.list", None, module="provider", obj="process", args=args)

settings(
    batch_size=batch_size,
    learning_rate=0.01 / batch_size,
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * batch_size))

####################### Network Configuration #############


def conv_bn_layer(name,
                  input,
                  filter_size,
                  num_filters,
                  stride,
                  padding,
                  channels=None,
                  active_type=ReluActivation()):
    """
    A wrapper for a conv layer with batch normalization layers.
    Note:
    The conv layer itself has no activation; the activation is applied
    by the batch norm layer.
    """
    tmp = img_conv_layer(
        name=name + "_conv",
        input=input,
        filter_size=filter_size,
        num_channels=channels,
        num_filters=num_filters,
        stride=stride,
        padding=padding,
        act=LinearActivation(),
        bias_attr=False)
    return batch_norm_layer(
        name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)


def bottleneck_block(name, input, num_filters1, num_filters2):
    """
    A wrapper for the bottleneck building block in ResNet.
    The last conv_bn_layer has no activation.
    The addto layer uses ReLU activation.
    """
    last_name = conv_bn_layer(
        name=name + '_branch2a',
        input=input,
        filter_size=1,
        num_filters=num_filters1,
        stride=1,
        padding=0)
    last_name = conv_bn_layer(
        name=name + '_branch2b',
        input=last_name,
        filter_size=3,
        num_filters=num_filters1,
        stride=1,
        padding=1)
    last_name = conv_bn_layer(
        name=name + '_branch2c',
        input=last_name,
        filter_size=1,
        num_filters=num_filters2,
        stride=1,
        padding=0,
        active_type=LinearActivation())
    return addto_layer(
        name=name + "_addto", input=[input, last_name], act=ReluActivation())


def mid_projection(name, input, num_filters1, num_filters2, stride=2):
    """
    A wrapper for the middle projection in ResNet.
    Projection shortcuts are used for increasing dimensions,
    while the other shortcuts are identity.
    branch1: a projection shortcut used for increasing dimensions,
    with no activation.
    branch2x: a bottleneck building block whose shortcut is identity.
    """
    # stride = 2
    branch1 = conv_bn_layer(
        name=name + '_branch1',
        input=input,
        filter_size=1,
        num_filters=num_filters2,
        stride=stride,
        padding=0,
        active_type=LinearActivation())
    last_name = conv_bn_layer(
        name=name + '_branch2a',
        input=input,
        filter_size=1,
        num_filters=num_filters1,
        stride=stride,
        padding=0)
    last_name = conv_bn_layer(
        name=name + '_branch2b',
        input=last_name,
        filter_size=3,
        num_filters=num_filters1,
        stride=1,
        padding=1)
    last_name = conv_bn_layer(
        name=name + '_branch2c',
        input=last_name,
        filter_size=1,
        num_filters=num_filters2,
        stride=1,
        padding=0,
        active_type=LinearActivation())
    return addto_layer(
        name=name + "_addto", input=[branch1, last_name], act=ReluActivation())


img = data_layer(name='image', size=height * width * 3)


def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
    """
    A wrapper for the 50-, 101-, and 152-layer versions of ResNet.
    res2_num: number of blocks stacked in conv2_x
    res3_num: number of blocks stacked in conv3_x
    res4_num: number of blocks stacked in conv4_x
    res5_num: number of blocks stacked in conv5_x
    """
    # For ImageNet
    # conv1: 112x112
    tmp = conv_bn_layer(
        "conv1",
        input=img,
        filter_size=7,
        channels=3,
        num_filters=64,
        stride=2,
        padding=3)
    tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)

    # conv2_x: 56x56
    tmp = mid_projection(
        name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
    for i in xrange(2, res2_num + 1, 1):
        tmp = bottleneck_block(
            name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)

    # conv3_x: 28x28
    tmp = mid_projection(
        name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
    for i in xrange(2, res3_num + 1, 1):
        tmp = bottleneck_block(
            name="res3_" + str(i),
            input=tmp,
            num_filters1=128,
            num_filters2=512)

    # conv4_x: 14x14
    tmp = mid_projection(
        name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
    for i in xrange(2, res4_num + 1, 1):
        tmp = bottleneck_block(
            name="res4_" + str(i),
            input=tmp,
            num_filters1=256,
            num_filters2=1024)

    # conv5_x: 7x7
    tmp = mid_projection(
        name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
    for i in xrange(2, res5_num + 1, 1):
        tmp = bottleneck_block(
            name="res5_" + str(i),
            input=tmp,
            num_filters1=512,
            num_filters2=2048)

    tmp = img_pool_layer(
        name='avgpool',
        input=tmp,
        pool_size=7,
        stride=1,
        pool_type=AvgPooling())

    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())


if layer_num == 50:
    resnet = deep_res_net(3, 4, 6, 3)
elif layer_num == 101:
    resnet = deep_res_net(3, 4, 23, 3)
elif layer_num == 152:
    resnet = deep_res_net(3, 8, 36, 3)
else:
    raise ValueError("Wrong layer number: %d" % layer_num)

lbl = data_layer(name="label", size=num_class)
loss = cross_entropy(name='loss', input=resnet, label=lbl)
inputs(img, lbl)
outputs(loss)
@@ -5,22 +5,23 @@ function train() {
  export OMP_DYNAMIC="FALSE"
  export KMP_AFFINITY="granularity=fine,compact,0,0"
  topology=$1
  layer_num=$2
  bs=$3
  use_mkldnn=$4
  if [ $4 == "True" ]; then
    thread=1
    log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
  elif [ $4 == "False" ]; then
    thread=`nproc`
    # each trainer_count use only 1 core to avoid conflict
    export OMP_NUM_THREADS=1
    export MKL_NUM_THREADS=1
    log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
  else
    echo "Wrong input $4, use True or False."
    exit 0
  fi
  args="batch_size=${bs},layer_num=${layer_num}"
  config="${topology}.py"
  paddle train --job=time \
    --config=$config \
@@ -40,12 +41,9 @@ if [ ! -d "logs" ]; then
  mkdir logs
fi

for use_mkldnn in True False; do
  for batchsize in 64 128 256; do
    train vgg 19 $batchsize $use_mkldnn
    train resnet 50 $batchsize $use_mkldnn
  done
done
@@ -79,9 +79,8 @@ if(NOT DEFINED IOS_ARCH)
    # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
    set(IOS_ARCH "arm64")
  elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
    # FIXME(liuyiqun): support "i386;x86_64" future
    set(IOS_ARCH "x86_64")
  endif()
endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
...
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if(NOT WITH_GPU)
  return()
endif()

include(ExternalProject)

set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
...
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if(NOT WITH_PYTHON)
  return()
endif()

include(ExternalProject)

set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)

include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)

ExternalProject_Add(
  extern_pybind
@@ -17,14 +35,12 @@ ExternalProject_Add(
  TEST_COMMAND ""
)

if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
  file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
  add_library(pybind STATIC ${dummyfile})
else()
  add_library(pybind INTERFACE)
endif()

add_dependencies(pybind extern_pybind)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
# This file is used to check all supported levels of AVX on your machine
# so that PaddlePaddle can unleash the vectorization power of multicore.

include(CheckCXXSourceRuns)
include(CheckCXXSourceCompiles)

if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
  set(MMX_FLAG "-mmmx")
  set(SSE2_FLAG "-msse2")
  set(SSE3_FLAG "-msse3")
  set(AVX_FLAG "-mavx")
  set(AVX2_FLAG "-mavx2")
elseif(MSVC)
  set(MMX_FLAG "/arch:MMX")
  set(SSE2_FLAG "/arch:SSE2")
  set(SSE3_FLAG "/arch:SSE3")
  set(AVX_FLAG "/arch:AVX")
  set(AVX2_FLAG "/arch:AVX2")
endif()

set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})

# Check MMX
set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <mmintrin.h>
int main()
@@ -32,6 +33,7 @@ int main()
# Check SSE2
set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <emmintrin.h>
int main()
@@ -42,6 +44,7 @@ int main()
# Check SSE3
set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <pmmintrin.h>
int main()
@@ -55,6 +58,7 @@ int main()
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
@@ -67,6 +71,7 @@ int main()
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
...
# Design Doc: float16
## Why float16
Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of the traditional 32-bit single-precision format (float), with lower precision and a smaller range.
When high-precision computation is not required, using the float16 data type could potentially
- reduce storage space, memory bandwidth, and power usage;
- increase the chance of data fitting into a smaller cache of lower latency;
- provide an arithmetic speed-up if supported by hardware.
## Survey of current float16 support
A brief survey of float16 support on different compilers, hardware, and libraries is given below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more information.
The goal of the float16 class is to serve as a key with which the executor can find and run the version of an operator kernel's compute method specialized for float16. To make writing customized float16 kernels easier, it should be compatible with the various natively supported float16 implementations, including `__half` for CUDA, `float16_t` for ARM, and `Eigen::half` for Eigen.
### Compiler
- nvcc supports the `__half` data type since CUDA 7.5.
- `__fp16` or `float16_t` is supported as a storage type for gcc >= 6.1 and clang >= 3.4.
- `__fp16` or `float16_t` is supported as an arithmetic type for gcc >= 7.1 and clang >= 3.9.
### Hardware
- `__half` is supported on GPUs with compute capability >= 5.3.
- `__fp16` is supported as a storage type for ARMv7-A, ARMv8-A, and above.
- `__fp16` is supported as an arithmetic type from ARMv8.2-A onward. (Currently, the only microarchitecture implementing ARMv8.2-A is the ARM Cortex-A75, announced in May 2017. There seem to be no application processors on the market yet that adopt this architecture; the Qualcomm Snapdragon 845 is reported to use the Cortex-A75 design and to ship in mobile devices in early 2018.)
### Libraries
- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of its overloaded arithmetic operators that use CUDA intrinsics. On the CPU it falls back to software emulation for calculation, and there is no special treatment for ARM processors.
- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
## Implementation
The float16 class internally holds its data as a single 16-bit `uint16_t`:
```cpp
struct float16 {
  uint16_t x;
};
```
float16 supports the following features:
- constructors / assignment operators that take input from primitive data types including bool, integers of various lengths, float, and double.
- constructors / assignment operators that take input from `__half` on CUDA, `float16_t` on ARM, and `Eigen::half` on Eigen.
- conversion operators to primitive data types and to the half-precision types on CUDA, ARM, and Eigen.
- overloaded arithmetic operators for CUDA, ARM, and non-ARM CPU, respectively. These operators take advantage of the CUDA and ARM intrinsics on the corresponding hardware.
To support the above features, two fundamental conversion functions are provided:
```cpp
float16 float_to_half_rn(float f);  // convert to half precision in round-to-nearest-even mode
float half_to_float(float16 h);
```
which provide one-to-one conversion between float32 and float16. These two functions dispatch to different conversion routines based on the current hardware: CUDA/ARM intrinsics are used when the corresponding hardware is available, and when the hardware or compiler does not support float32-to-float16 conversion, software emulation performs the conversion.
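As an illustration of the emulation path, below is a minimal bit-level sketch of the two functions, assuming no CUDA/ARM intrinsics are available. It shows the round-to-nearest-even logic only and is not the actual PaddlePaddle implementation.
```cpp
// Software-emulation sketch of the float <-> float16 conversions (assumed
// fallback path; illustration only).
#include <cstdint>
#include <cstring>

struct float16 {  // restated from above so the sketch is self-contained
  uint16_t x;
};

float16 float_to_half_rn(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  uint16_t sign = static_cast<uint16_t>((bits >> 16) & 0x8000u);
  uint32_t absx = bits & 0x7fffffffu;
  uint16_t h;
  if (absx >= 0x7f800000u) {         // inf stays inf, NaN stays NaN
    h = absx > 0x7f800000u ? 0x7e00u : 0x7c00u;
  } else if (absx >= 0x47800000u) {  // |f| >= 65536 overflows to inf
    h = 0x7c00u;
  } else if (absx >= 0x38800000u) {  // normal half range
    // Rebias the exponent and round the mantissa to nearest even.
    uint32_t rounded = absx + 0xfffu + ((absx >> 13) & 1u);
    h = static_cast<uint16_t>((rounded >> 13) - (112u << 10));
  } else {                           // subnormal half or zero
    uint32_t mant = (absx & 0x7fffffu) | 0x800000u;  // implicit leading 1
    uint32_t shift = 126u - (absx >> 23);
    if (shift > 24u) {
      h = 0;                         // too small: rounds to +/- zero
    } else {
      uint32_t round = ((1u << shift) >> 1) - 1u + ((mant >> shift) & 1u);
      h = static_cast<uint16_t>((mant + round) >> shift);
    }
  }
  return float16{static_cast<uint16_t>(sign | h)};
}

float half_to_float(float16 h) {
  uint32_t sign = static_cast<uint32_t>(h.x & 0x8000u) << 16;
  int exp = (h.x >> 10) & 0x1f;
  uint32_t mant = h.x & 0x3ffu;
  uint32_t bits;
  if (exp == 0 && mant == 0) {  // +/- zero
    bits = sign;
  } else if (exp == 0) {        // subnormal half: renormalize
    exp = 1;
    while ((mant & 0x400u) == 0) {
      mant <<= 1;
      --exp;
    }
    mant &= 0x3ffu;             // drop the implicit leading 1
    bits = sign | (static_cast<uint32_t>(exp + 112) << 23) | (mant << 13);
  } else if (exp == 31) {       // inf or NaN
    bits = sign | 0x7f800000u | (mant << 13);
  } else {                      // normal: rebias exponent 15 -> 127
    bits = sign | (static_cast<uint32_t>(exp + 112) << 23) | (mant << 13);
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
```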
## To do
After the float16 class is available, future work items include:
- Update pybind/tensor_py.h to bind the C++ float16 with numpy float16.
- Modify the `GetKernelType()` method in `framework/operator.h` to make it compatible with float16.
- Create a type-casting operator that can convert the data type in a tensor between float16 and other types.
# Parameter Averaging in PaddlePaddle
## Why Averaging
In a large-scale machine learning setup where the training data is huge, it can take many iterations over the training data before we reach the optimal values of the model parameters. It is therefore desirable to obtain those optimal values in as few passes over the data as possible.
Polyak and Juditsky (1992) showed that the test performance of a simple average of the parameters produced by Stochastic Gradient Descent (SGD) is as good as that of parameters obtained by repeatedly training the model over the training dataset.
Hence, to accelerate Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed by Polyak and Juditsky (1992). In ASGD, the running average of the parameters produced by SGD is used as the estimator for <img src="./images/theta_star.gif"/><br/> . The averaging is done as follows:
<img src="./images/asgd.gif" align="center"/><br/>
We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
### How to perform Parameter Averaging in PaddlePaddle
Parameter averaging in PaddlePaddle works in the following way during training:
1. It takes an instance of an ordinary optimizer as input, e.g. RMSPropOptimizer.
2. The optimizer itself is responsible for updating the parameters.
3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
  1. Conceptually, the values of this copy are the average of the parameter values over the most recent N batches.
  2. However, keeping all N instances of the parameters in memory is not feasible.
  3. Therefore, an approximation algorithm is used.
Hence, overall we have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former is used in back propagation, while the latter is used during testing and should be saved.
During the testing/model-saving phase, we perform the following steps:
1. Perform the delayed operations.
2. Save current values of the parameters to a temporary variable.
3. Replace the values of the parameters with the averaged values.
4. Perform testing and/or save the parameters.
5. Restore the values of the parameters once done.
### How to implement Parameter Averaging in PaddlePaddle
We can add the ParameterAverageOptimizer op to the graph through the Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to it during training.
**Advantages**:
- Gives users of PaddlePaddle greater flexibility: they can plug different optimizers into ParameterAverageOptimizer by passing the optimizer to the op.
- Makes it easy for the users to customize and extend the framework.
**Disadvantages**:
- Implementation requires re-writing the averaging methodology in Python.
### Low-Level implementation
In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input:
- the optimizer
- the window_size to keep the updates
The ParameterAverageOptimizer op can be like any other operator, with its own CPU/GPU implementation, either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen, following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case where the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
### Python API implementation for ParameterAverageOptimizer
Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following:
- Any optimizer (RMSProp, AdaGrad, etc.)
- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes their average. When the window is full, the averaged value is moved to a buffer to avoid loss of precision (a sketch of this scheme follows below).
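The following is a sketch of that windowed scheme. The class name and interface are hypothetical, purely to make the accumulate-then-fold idea concrete; the real op would operate on tensors inside the framework.
```cpp
#include <cstddef>
#include <vector>

// Hypothetical sketch of windowed parameter averaging; not the actual
// PaddlePaddle operator.
class WindowedAverage {
 public:
  WindowedAverage(size_t dim, size_t window_size)
      : window_size_(window_size), sum_(dim, 0.0), buffer_(dim, 0.0) {}

  // Fold one freshly updated parameter vector into the running statistics.
  void Update(const std::vector<double>& param) {
    for (size_t i = 0; i < param.size(); ++i) sum_[i] += param[i];
    if (++num_updates_ == window_size_) {
      // Move the completed window into the buffer to limit precision loss.
      for (size_t i = 0; i < sum_.size(); ++i) {
        buffer_[i] += sum_[i] / window_size_;
        sum_[i] = 0.0;
      }
      ++num_windows_;
      num_updates_ = 0;
    }
  }

  // Average over all updates seen so far: full windows plus the partial one.
  std::vector<double> Average() const {
    size_t total = num_windows_ * window_size_ + num_updates_;
    std::vector<double> avg(buffer_.size(), 0.0);
    if (total == 0) return avg;
    for (size_t i = 0; i < avg.size(); ++i) {
      avg[i] = (buffer_[i] * window_size_ + sum_[i]) / total;
    }
    return avg;
  }

 private:
  size_t window_size_;
  size_t num_updates_ = 0;  // updates in the current, partial window
  size_t num_windows_ = 0;  // completed windows folded into buffer_
  std::vector<double> sum_;
  std::vector<double> buffer_;
};
```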
Using the ParameterAverageOptimizer op, any user can add the operation to their computation graph. However, this would require many lines of code, so we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters, and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
We will have a wrapper written in Python that supports this functionality and implements the actual core computation in the C++ core, as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc).
#### Creation of the ParameterAverageOptimizer operator
There are two ways to create the ParameterAverageOptimizer op:
1. We create the op immediately while building the computation graph.
2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
The proposal is to add the op immediately while building the computation graph.
#### High-level API
In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
@@ -75,7 +75,7 @@ PaddlePaddle currently supports 8 kinds of learning_rate_schedule; these 8 kinds of learning_rate_sche
optimizer = paddle.optimizer.Adam(
    learning_rate=1e-3,
    learning_rate_schedule="pass_manual",
    learning_rate_args="1:1.0,2:0.9,3:0.8",)
In this example, when the number of trained passes is at most 1, the learning rate is :code:`1e-3 * 1.0`; when it is greater than 1 and at most 2, the learning rate is :code:`1e-3 * 0.9`; and when it is greater than 2, the learning rate is :code:`1e-3 * 0.8`.
...
@@ -145,7 +145,7 @@ Whenever PaddlePaddle releases a new version, a corresponding production image is also released
Jupyter Notebook is an open-source web application for creating and sharing interactive documents containing code, formulas, charts, and text; the documents can be read in a web browser.
PaddlePaddle Book is an interactive Jupyter Notebook made for users and developers.
If you want a deeper understanding of deep learning, PaddlePaddle Book is definitely your best choice.
We provide a Docker image that can run the PaddlePaddle Book directly; simply run:
...
@@ -63,7 +63,7 @@
</tr>
<tr>
<td class="left" rowspan="14">Training</td><td class="left">dot_period</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
...
@@ -8,3 +8,4 @@ PaddlePaddle Documentation
  howto/index_cn.rst
  api/index_cn.rst
  faq/index_cn.rst
  mobile/index_cn.rst
@@ -7,3 +7,4 @@ PaddlePaddle Documentation
  getstarted/index_en.rst
  howto/index_en.rst
  api/index_en.rst
  mobile/index_en.rst
# Building the PaddlePaddle Library for Android
Users can cross-compile a PaddlePaddle library suitable for Android in either of two ways:
- a Docker-container-based build
- a build in a Linux cross-compiling environment
## Docker-Container-Based Build
@@ -20,20 +20,42 @@ $ docker build -t username/paddle-android:dev . -f Dockerfile.android
Once the development image is built, it can be used to compile the Android version of the PaddlePaddle C-API library.
The Android Docker development image exposes two configurable arguments:
<table class="docutils">
<colgroup>
<col width="25%" />
<col width="50%" />
<col width="25%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd">
<th class="head">Argument</th>
<th class="head">Optional Values</th>
<th class="head">Default</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even">
<td>ANDROID_ABI</td>
<td>armeabi-v7a, arm64-v8a</td>
<td>armeabi-v7a</td>
</tr>
<tr class="row-odd">
<td>ANDROID_API</td>
<td>>= 21</td>
<td>21</td>
</tr>
</tbody>
</table>
- To build the PaddlePaddle library for `armeabi-v7a` and `Android API 21`:
  ```bash
  $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
  ```
- To build the PaddlePaddle library for `arm64-v8a` and `Android API 21`:
  ```bash
  $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
  ```
When the above `docker run` command is executed, the container runs the [paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) script by default. The script records the CMake configurations commonly used for cross-compiling the Android PaddlePaddle library, automatically builds a standalone toolchain according to `ANDROID_ABI` and `ANDROID_API`, and then compiles and installs. Since the arm64 architecture requires an Android API of at least 21, when `ANDROID_ABI=arm64-v8a` and `ANDROID_API<21`, the Docker container defaults to the `Android API 21` toolchain. Users can refer to the **Configuring Cross-Compiling Arguments** section below to customize the script executed by the Docker container. After compilation and installation finish, the PaddlePaddle C-API library is installed to `$PWD/install_android`, and the third-party libraries it depends on are installed to `$PWD/install_android/third_party`.
@@ -82,16 +104,16 @@ CMake supports cross-compiling; see [cmake-toolchains](https://cmake.org/cm
Optional configuration arguments for the Android platform:
- `ANDROID_STANDALONE_TOOLCHAIN`: the absolute path of the standalone toolchain, or its path relative to the build directory. PaddlePaddle's CMake system automatically derives the cross-compiler, sysroot, and Android API level to use from this value; otherwise, the user has to set these values manually when running cmake. No default value.
- `ANDROID_TOOLCHAIN`: the target toolchain. Can be set to `gcc`/`clang`; the default is `clang`.
  - With CMake 3.7 and above, the `clang` toolchain is always used; below CMake 3.7, `ANDROID_TOOLCHAIN=gcc` can be set to use the `gcc` toolchain.
  - The official Android `clang` compiler requires a system with `GLIBC 2.15` or above.
- `ANDROID_ABI`: the target ABI. Currently `armeabi-v7a` and `arm64-v8a` are supported; the default is `armeabi-v7a`.
- `ANDROID_NATIVE_API_LEVEL`: the Android API level of the toolchain. If not set explicitly, PaddlePaddle derives it automatically from the value of `ANDROID_STANDALONE_TOOLCHAIN`.
- `ANDROID_ARM_MODE`: whether to use ARM mode.
  - When `ANDROID_ABI=armeabi-v7a`, it can be set to `ON`/`OFF`; the default is `ON`.
  - When `ANDROID_ABI=arm64-v8a`, it does not need to be set.
- `ANDROID_ARM_NEON`: whether to use NEON instructions.
  - When `ANDROID_ABI=armeabi-v7a`, it can be set to `ON`/`OFF`; the default is `ON`.
  - When `ANDROID_ABI=arm64-v8a`, it does not need to be set.
Other configuration arguments:
@@ -119,7 +141,7 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
      -DANDROID_ABI=arm64-v8a \
      -DUSE_EIGEN_FOR_BLAS=OFF \
      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
      -DWITH_C_API=ON \
      -DWITH_SWIG_PY=OFF \
      ..
@@ -128,8 +150,8 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
Users can also set other build arguments per their own needs. For example, to minimize the size of the generated library, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; for the fastest execution speed, set `CMAKE_BUILD_TYPE` to `Release`. The PaddlePaddle build can also be influenced by manually setting `CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`.
**Performance TIPS.** To achieve the fastest computation speed, the following CMake configuration is recommended:
- set `CMAKE_BUILD_TYPE` to `Release`
- use the `clang` toolchain
- for `armeabi-v7a`, set `USE_EIGEN_FOR_BLAS=ON` to use Eigen for matrix computation; for `arm64-v8a`, set `USE_EIGEN_FOR_BLAS=OFF` to use OpenBLAS
### Build and Install
...
# Build PaddlePaddle for Android
There are two approaches to build PaddlePaddle for Android: using Docker and on Linux without Docker.
## Cross-Compiling Using Docker
Docker-based cross-compiling is the recommended approach because Docker runs on all major operating systems, including Linux, Mac OS X, and Windows.
### Build the Docker Image
The following steps pack all the tools that we need to build PaddlePaddle into a Docker image.
```bash
$ git clone https://github.com/PaddlePaddle/Paddle.git
$ cd Paddle
$ docker build -t paddle:dev-android . -f Dockerfile.android
```
### Build the Inference Library
We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android
```
The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
<table class="docutils">
<colgroup>
<col width="25%" />
<col width="50%" />
<col width="25%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd">
<th class="head">Argument</th>
<th class="head">Optional Values</th>
<th class="head">Default</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even">
<td>ANDROID_ABI</td>
<td>armeabi-v7a, arm64-v8a</td>
<td>armeabi-v7a</td>
</tr>
<tr class="row-odd">
<td>ANDROID_API</td>
<td>>= 21</td>
<td>21</td>
</tr>
</tbody>
</table>
The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh), generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the arguments `ANDROID_ABI` and `ANDROID_API`. For information about other configuration arguments, please continue reading.
The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`.
## Cross-Compiling on Linux
The Linux-based approach to cross-compiling is to run the steps in `Dockerfile.android` manually on a Linux x64 computer.
### Setup the Environment
To build for Android, we need the [Android NDK](
https://developer.android.com/ndk/downloads/index.html):
```bash
wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
unzip -q android-ndk-r14b-linux-x86_64.zip
```
Android NDK includes everything we need to build the [*standalone toolchain*](https://developer.android.com/ndk/guides/standalone_toolchain.html), which is then used to build PaddlePaddle for Android. (We plan to remove the intermediate step of building the standalone toolchain in the near future.)
- To build the standalone toolchain for `armeabi-v7a` and Android API level 21:
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
```
The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`.
- To build the standalone toolchain for `arm64-v8a` and Android API level 21:
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
```
The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
**Please be aware that the minimum level of Android API required by PaddlePaddle is 21.**
### Cross-Compiling Arguments
CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which supports Android cross-compiling natively. PaddlePaddle detects the CMake version; for versions newer than 3.7, it uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling).
Some other CMake arguments you need to know:
- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, and `WITH_RDMA=OFF`.
- `WITH_C_API` must be `ON`, to build the C-based inference library for Android.
- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API.
Some Android-specific arguments:
- `ANDROID_STANDALONE_TOOLCHAIN`: the absolute path of the Android standalone toolchain, or the path relative to the CMake build directory. PaddlePaddle's CMake extensions would derive the cross-compiler, sysroot and Android API level from this argument.
- `ANDROID_TOOLCHAIN`: could be `gcc` or `clang`. The default value is `clang`.
- For CMake >= 3.7, it should anyway be `clang`. For older versions, it could be `gcc`.
- Android's official `clang` requires `glibc` >= 2.15.
- `ANDROID_ABI`: could be `armeabi-v7a` or `arm64-v8a`. The default value is `armeabi-v7a`.
- `ANDROID_NATIVE_API_LEVEL`: could be derived from the value of `ANDROID_STANDALONE_TOOLCHAIN`.
- `ANDROID_ARM_MODE`:
- could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`;
- no need to specify when `ANDROID_ABI=arm64-v8a`.
- `ANDROID_ARM_NEON`: indicates whether to use NEON instructions.
- could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`;
- no need to specify when `ANDROID_ABI=arm64-v8a`.
Other useful arguments:
- `USE_EIGEN_FOR_BLAS`: indicates whether to use Eigen for BLAS. Could be `ON` or `OFF`; defaults to `OFF`.
- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and the target-specific OpenBLAS. It defaults to the value of the environment variable `CC`, or `cc`.
Some frequent configurations for your reference:
```bash
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
-DANDROID_ABI=armeabi-v7a \
-DANDROID_ARM_NEON=ON \
-DANDROID_ARM_MODE=ON \
-DUSE_EIGEN_FOR_BLAS=ON \
-DCMAKE_INSTALL_PREFIX=your/path/to/install \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
```
```bash
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
-DANDROID_ABI=arm64-v8a \
-DUSE_EIGEN_FOR_BLAS=OFF \
-DCMAKE_INSTALL_PREFIX=your/path/to/install \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
```
There are some other arguments you might want to configure.
- `CMAKE_BUILD_TYPE=MinSizeRel` minimizes the size of the library.
- `CMAKE_BUILD_TYPE=Release` optimizes the runtime performance.
Our tips for performance optimization are to use clang, and Eigen or OpenBLAS:
- `CMAKE_BUILD_TYPE=Release`
- `ANDROID_TOOLCHAIN=clang`
- `USE_EIGEN_FOR_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`.
### Build and Install
After running `cmake`, run `make; make install` to build and install.
Before building, you might want to remove the `third_party` and `build` directories, which may contain libraries pre-built for other architectures.
After building, in the directory `CMAKE_INSTALL_PREFIX`, you will find three sub-directories:
- `include`: the header file of the inference library,
- `lib`: the inference library built for various Android ABIs,
- `third_party`: dependent third-party libraries built for Android.
@@ -27,10 +27,28 @@ Optional configuration arguments for the iOS platform:
  - `SIMULATOR`: build for the simulator platform with the `x86` architecture.
- `IOS_ARCH`: the target architecture. The target architectures that can be set for each `IOS_PLATFORM` are listed in the table below:
<table class="docutils">
<colgroup>
<col width="35%" />
<col width="65%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd">
<th class="head">IOS_PLATFORM</th>
<th class="head">IOS_ARCH</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even">
<td>OS</td>
<td>armv7, armv7s, arm64 (default)</td>
</tr>
<tr class="row-odd">
<td>SIMULATOR</td>
<td>i386, x86_64 (default)</td>
</tr>
</tbody>
</table>
- `IOS_DEPLOYMENT_TARGET`: the minimum iOS deployment version; the default is `7.0`.
- `IOS_ENABLE_BITCODE`: whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3). Can be set to `ON`/`OFF`; the default is `ON`.
...
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_cn.md
cross_compiling_for_ios_cn.md
cross_compiling_for_raspberry_cn.md
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_en.md
cross_compiling_for_raspberry_en.md
@@ -29,32 +29,32 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
add_dependencies(paddle_capi paddle_proto)

# TODO: paddle_capi_whole will be removed.
set(PADDLE_CAPI_LAYERS_LIBS
    paddle_function
    paddle_gserver)
if(MOBILE_INFERENCE)
  set(PADDLE_CAPI_ENGINE_LIBS
      paddle_utils
      paddle_parameter
      paddle_math
      paddle_cuda
      paddle_proto)
else()
  set(PADDLE_CAPI_ENGINE_LIBS
      paddle_utils
      paddle_parameter
      paddle_math
      paddle_cuda
      paddle_proto
      paddle_pserver
      paddle_network)
endif()
set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})

# Link the static library for inference
cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})

# Link the shared library for inference
if(NOT IOS)
...
@@ -300,4 +300,12 @@ extern void hl_matrix_col2Vol(real* dataDst,
                              real alpha,
                              real beta);

/**
 * @brief Cast a real vector to an int vector, element by element.
 *
 * @param[out] out   output int vector.
 * @param[in]  vec   input real vector.
 * @param[in]  size  number of elements in the vector.
 */
extern void hl_vector_cast2int(int* out, real* vec, int size);

#endif /* HL_MATRIX_H_ */
@@ -133,4 +133,6 @@ inline void hl_matrix_col2Vol(real* dataDst,
                              real alpha,
                              real beta) {}

inline void hl_vector_cast2int(int* out, real* vec, int size) {}

#endif  // HL_MATRIX_STUB_H_
@@ -793,3 +793,14 @@ void hl_matrix_col2Vol(real* dataDst,
  CHECK_SYNC("hl_matrix_col2Vol failed");
}
// Cast each element of a real vector to int. The kernel is launched with a
// single thread block, which strides over the whole vector.
__global__ void keVectorCast2Int(int* out, real* vec, int size) {
  for (int i = threadIdx.x; i < (size); i += blockDim.x) {
    out[i] = int(vec[i]);
  }
}

void hl_vector_cast2int(int* out, real* vec, int size) {
  keVectorCast2Int<<<1, 512, 0, STREAM_DEFAULT>>>(out, vec, size);
  CHECK_SYNC("hl_vector_cast2int failed");
}
@@ -20,7 +20,8 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_library(attribute SRCS attribute.cc DEPS framework_proto)
cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
    device_context)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
@@ -44,8 +45,9 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
...
@@ -117,7 +117,7 @@ int64_t DDim::operator[](int idx) const {
  return boost::apply_visitor(DynamicConstIndexer(idx), var);
}

int DDim::size() const { return arity(*this); }

bool DDim::operator==(DDim d) const {
  if (var.which() != d.getVar().which()) {
...
@@ -71,7 +71,7 @@ struct DDim {
  DDim operator*(DDim d) const;

  int size() const;
};

/**
...
@@ -21,7 +21,9 @@ limitations under the License. */
#include <vector>

#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/scope.h"
@@ -70,10 +72,14 @@ static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == VarDesc::STEP_SCOPES) {
    var->GetMutable<std::vector<framework::Scope>>();
  } else if (var_type == VarDesc::LOD_RANK_TABLE) {
    var->GetMutable<LoDRankTable>();
  } else if (var_type == VarDesc::LOD_TENSOR_ARRAY) {
    var->GetMutable<LoDTensorArray>();
  } else {
    PADDLE_THROW(
        "Variable type %d is not in "
        "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE]",
        var_type);
  }
}
@@ -83,7 +89,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
  // TODO(tonyyang-svail):
  // - only runs on the first device (i.e. no interdevice communication)
  // - will change to use multiple blocks for RNN op and Cond Op
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size());
  auto& block = pdesc.Block(block_id);
  auto& device = device_contexts_[0];
...
@@ -109,6 +109,11 @@ message LoDTensorDesc {
  optional int32 lod_level = 2 [ default = 0 ];
}

message LoDTensorArrayDesc {
  required TensorDesc tensor = 1;
  optional int32 lod_level = 2 [ default = 0 ];
}

message VarDesc {
  enum VarType {
    LOD_TENSOR = 1;
@@ -116,11 +121,14 @@ message VarDesc {
    FEED_MINIBATCH = 3;
    FETCH_LIST = 4;
    STEP_SCOPES = 5;
    LOD_RANK_TABLE = 6;
    LOD_TENSOR_ARRAY = 7;
  }
  required string name = 1;
  required VarType type = 2;
  optional LoDTensorDesc lod_tensor = 3;
  optional TensorDesc selected_rows = 4;
  optional LoDTensorArrayDesc tensor_array = 6;
  optional bool persistable = 5 [ default = false ];
}
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/lod_rank_table.h"
namespace paddle {
namespace framework {

void LoDRankTable::Reset(const LoD& lod, size_t level) {
  this->coarse_lod_.clear();
  this->items_.clear();
  PADDLE_ENFORCE(level < lod.size(),
                 "Cannot rank lod since level %d is not less than the lod "
                 "size %d",
                 level, lod.size());
  coarse_lod_.reserve(level);
  for (size_t i = 0; i < level; ++i) {
    coarse_lod_.push_back(lod[i]);
  }
  auto& vec = lod[level];
  for (size_t i = 0; i < vec.size() - 1; ++i) {
    TableItem item;
    item.index = i;
    item.length = vec[i + 1] - vec[i];
    VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
    items_.emplace_back(item);
  }
  // NOTE(yuyang18):
  //
  // The time complexity of stable_sort is O(N*log(N)) if additional memory is
  // available. It is easy to debug and unit test when using `stable_sort`
  // instead of `sort`. Also, the items of a rank table will not be too large.
  std::stable_sort(items_.begin(), items_.end(),
                   [](const TableItem& a, const TableItem& b) {
                     return a.length > b.length;
                   });
}

}  // namespace framework
}  // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
namespace paddle {
namespace framework {
// LoD Rank Table stores the `level` of `lod` which is ordered by sequence
// length in descending order. It is useful when implementing dynamic RNN and
// is shared by the dynamic RNN memory, slice-input, and slice-output
// operators.
//
// Each table item contains two elements: the length of a sequence and the
// index of that sequence at the given level.
//
// LoDRankTable also stores coarse_lod, the LoD information above the input
// level, so that the output LoD information can be restored.
class LoDRankTable {
public:
struct TableItem {
size_t index;
size_t length;
};
LoDRankTable() {}
void Reset(const LoD& lod, size_t level);
const std::vector<TableItem>& items() const { return this->items_; }
const LoD& coarse_lod() const { return this->coarse_lod_; }
size_t level() const { return coarse_lod_.size(); }
private:
LoD coarse_lod_;
std::vector<TableItem> items_;
};
} // namespace framework
} // namespace paddle
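A minimal usage sketch of the class above (the numbers are ours): with level-0 offsets {0, 3, 4, 9} the sequence lengths are {3, 1, 5}, so `items()` comes back ordered (index 2, length 5), (index 0, length 3), (index 1, length 1):

```
#include <iostream>
#include "paddle/framework/lod_rank_table.h"

void RankTableExample() {
  paddle::framework::LoD lod;
  lod.push_back(std::vector<size_t>({0, 3, 4, 9}));

  paddle::framework::LoDRankTable table;
  table.Reset(lod, /*level=*/0);  // sorts items by length, descending

  for (auto& item : table.items()) {
    std::cout << item.index << " -> " << item.length << "\n";
  }
}
```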
...@@ -27,6 +27,20 @@ ...@@ -27,6 +27,20 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::ostream& operator<<(std::ostream& os, const LoD& lod) {
os << "{";
for (auto& v : lod) {
os << "{";
for (auto& i : v) {
os << i << ",";
}
os << "}";
}
os << "}";
return os;
}
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) { LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
LoD new_lod; LoD new_lod;
new_lod.reserve(level_end - level_begin); new_lod.reserve(level_end - level_begin);
...@@ -135,5 +149,41 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin, ...@@ -135,5 +149,41 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty."); PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty.");
ShareDataWith(Slice(begin, end)); ShareDataWith(Slice(begin, end));
} }
using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
size_t end_idx, size_t start_level) {
LoD sub_lod;
for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
PADDLE_ENFORCE_LE(start_idx, end_idx);
PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
std::vector<size_t> level_lens;
for (size_t i = start_idx; i < end_idx; ++i) {
level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
}
sub_lod.emplace_back(level_lens);
start_idx = lod[level_idx][start_idx];
end_idx = lod[level_idx][end_idx];
}
return LoDAndOffset{sub_lod, {start_idx, end_idx}};
}
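To make the level descent concrete with the LoD from `TEST(LoD, GetFineGrainedLoDLength)` below: slicing items [1, 2) at level 0 first records the level-0 length lod[0][2] - lod[0][1] = 4 - 2 = 2, then maps the index range down one level through the offsets (start becomes lod[0][1] = 2, end becomes lod[0][2] = 4), repeats the same step at each level, and finally returns the bottom-level absolute offsets {15, 26} together with the per-level lengths {2}, {2, 2}, {2, 3, 4, 2}.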
void AppendLoD(LoD* lod, const LoD& lod_length) {
PADDLE_ENFORCE(
lod->empty() || lod->size() == lod_length.size(),
"The lod_length should has the same size with the appended lod.");
if (lod->empty()) {
*lod = LoD(lod_length.size(), std::vector<size_t>({0}));
}
for (size_t i = 0; i < lod->size(); ++i) {
auto& level = (*lod)[i];
for (size_t len : lod_length[i]) {
level.push_back(level.back() + len);
}
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -56,6 +56,8 @@ using Vector = thrust::host_vector< ...@@ -56,6 +56,8 @@ using Vector = thrust::host_vector<
*/ */
using LoD = std::vector<Vector<size_t>>; using LoD = std::vector<Vector<size_t>>;
std::ostream& operator<<(std::ostream& os, const LoD& lod);
/* /*
* Slice levels from a LoD. * Slice levels from a LoD.
* NOTE the lowest level should always be the absolute offsets of the underlying * NOTE the lowest level should always be the absolute offsets of the underlying
...@@ -181,5 +183,10 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, ...@@ -181,5 +183,10 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
return tensor; return tensor;
} }
std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);
void AppendLoD(LoD* lod, const LoD& lod_length);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -140,19 +140,9 @@ Similarly, the lengths in the top level LoD ...@@ -140,19 +140,9 @@ Similarly, the lengths in the top level LoD
are transformed into offsets of elements/words as follows: are transformed into offsets of elements/words as follows:
``` ```
0 9 10 15 0 3 4 6
= = = = = =
3+2+4 1+9 2+3+10 3 3+1 4+2
```
so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.
The complete offset representation is as follows:
```
0 9 10 15
0 3 5 9 10 12 15
||| || |||| | || |||
``` ```
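To make the lengths-to-offsets transformation above concrete, here is a minimal standalone sketch; the function name is ours, not part of the framework:

```
#include <cstddef>
#include <vector>

// Convert per-sequence lengths, e.g. {3, 1, 2}, into the offset
// representation {0, 3, 4, 6} by accumulating a running sum.
std::vector<size_t> LengthsToOffsets(const std::vector<size_t>& lengths) {
  std::vector<size_t> offsets(1, 0);
  for (size_t len : lengths) {
    offsets.push_back(offsets.back() + len);
  }
  return offsets;
}
```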
## Slicing of LoD Tensors ## Slicing of LoD Tensors
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/framework/lod_tensor.h"
namespace paddle {
namespace framework {
using LoDTensorArray = std::vector<LoDTensor>;
}
} // namespace paddle
...@@ -144,5 +144,48 @@ TEST(LodExpand, test) { ...@@ -144,5 +144,48 @@ TEST(LodExpand, test) {
} }
} }
TEST(LoD, GetFineGrainedLoDLength) {
LoD lod;
lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
lod.push_back(
std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29}));
auto lod_and_offset =
paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0);
LoD lod_length = lod_and_offset.first;
size_t start_offset = lod_and_offset.second.first;
size_t end_offset = lod_and_offset.second.second;
LoD expected;
expected.push_back(std::vector<size_t>{2});
expected.push_back(std::vector<size_t>{2, 2});
expected.push_back(std::vector<size_t>{2, 3, 4, 2});
EXPECT_EQ(lod_length, expected);
EXPECT_EQ(start_offset, 15UL);
EXPECT_EQ(end_offset, 26UL);
}
TEST(LoD, AppendLoD) {
LoD lod_lens;
lod_lens.push_back(std::vector<size_t>({2}));
lod_lens.push_back(std::vector<size_t>({2, 2}));
lod_lens.push_back(std::vector<size_t>({2, 3, 4, 2}));
LoD origin;
origin.push_back(std::vector<size_t>({0, 2}));
origin.push_back(std::vector<size_t>({0, 1, 6}));
origin.push_back(std::vector<size_t>({0, 2, 5, 7, 10, 12, 15}));
paddle::framework::AppendLoD(&origin, lod_lens);
LoD expected;
expected.push_back(std::vector<size_t>({0, 2, 4}));
expected.push_back(std::vector<size_t>({0, 1, 6, 8, 10}));
expected.push_back(
std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26}));
EXPECT_EQ(origin, expected);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -67,8 +67,11 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -67,8 +67,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
out); out);
in_var->SetLoDLevel(out_var->GetLodLevel()); in_var->SetLoDLevel(out_var->GetLodLevel());
} }
bool IsRuntime() const override;
protected:
VarDesc::VarType GetVarType(const std::string &name) const override;
private:
DDim GetDim(const std::string &name) const override; DDim GetDim(const std::string &name) const override;
void SetDim(const std::string &name, const DDim &dim) override; void SetDim(const std::string &name, const DDim &dim) override;
...@@ -349,6 +352,9 @@ void OpDescBind::InferVarType(BlockDescBind *block) const { ...@@ -349,6 +352,9 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
info.infer_var_type_(*this, block); info.infer_var_type_(*this, block);
} else { } else {
// all output type is LoDTensor by default // all output type is LoDTensor by default
VLOG(10) << this->Type()
<< " has not registered InferVarType. Set output variables to "
"LOD_TENSOR";
for (auto &out_pair : this->outputs_) { for (auto &out_pair : this->outputs_) {
for (auto &out_var_name : out_pair.second) { for (auto &out_var_name : out_pair.second) {
block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR); block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
...@@ -448,6 +454,12 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name, ...@@ -448,6 +454,12 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
const DDim &dim) { const DDim &dim) {
block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim)); block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
} }
bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
const std::string &name) const {
return block_.FindVarRecursive(name)->GetType();
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -92,8 +92,7 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> { ...@@ -92,8 +92,7 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
void operator()(const char* op_type) const { void operator()(const char* op_type) const {
using T = typename KERNEL_TYPE::ELEMENT_TYPE; using T = typename KERNEL_TYPE::ELEMENT_TYPE;
OperatorWithKernel::OpKernelKey key(ToDataType(std::type_index(typeid(T))), OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType());
PlaceType());
OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value; constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
......
...@@ -15,7 +15,9 @@ limitations under the License. */ ...@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
#include <algorithm> #include <algorithm>
#include <atomic> #include <atomic>
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/shape_inference.h" #include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -252,8 +254,7 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>( ...@@ -252,8 +254,7 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
return res; return res;
} }
std::ostream& operator<<(std::ostream& os, std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) {
const OperatorWithKernel::OpKernelKey& kernel_key) {
os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_ os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_
<< "]"; << "]";
return os; return os;
...@@ -365,7 +366,9 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -365,7 +366,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
out_tensor->set_lod(in_tensor.lod()); out_tensor->set_lod(in_tensor.lod());
} }
private: bool IsRuntime() const override { return true; }
protected:
DDim GetDim(const std::string& name) const override { DDim GetDim(const std::string& name) const override {
Variable* var = scope_.FindVar(name); Variable* var = scope_.FindVar(name);
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
...@@ -388,6 +391,12 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -388,6 +391,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
} }
} }
VarDesc::VarType GetVarType(const std::string& name) const override {
auto* var = scope_.FindVar(name);
return ToVarType(var->Type());
}
private:
const OperatorBase& op_; const OperatorBase& op_;
const Scope& scope_; const Scope& scope_;
}; };
...@@ -422,7 +431,7 @@ void OperatorWithKernel::Run(const Scope& scope, ...@@ -422,7 +431,7 @@ void OperatorWithKernel::Run(const Scope& scope,
// check if op[type] have kernel for kernel_key // check if op[type] have kernel for kernel_key
OpKernelMap& kernels = kernels_iter->second; OpKernelMap& kernels = kernels_iter->second;
auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx); auto kernel_key = GetKernelType(ctx);
auto kernel_iter = kernels.find(kernel_key); auto kernel_iter = kernels.find(kernel_key);
if (kernel_iter == kernels.end()) { if (kernel_iter == kernels.end()) {
...@@ -430,6 +439,41 @@ void OperatorWithKernel::Run(const Scope& scope, ...@@ -430,6 +439,41 @@ void OperatorWithKernel::Run(const Scope& scope,
} }
kernel_iter->second->Compute(ctx); kernel_iter->second->Compute(ctx);
// throws pending device errors, if any.
dev_ctx.Finish();
}
OpKernelType OperatorWithKernel::GetKernelType(
const ExecutionContext& ctx) const {
return OpKernelType(IndicateDataType(ctx), ctx.device_context());
}
DataType OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {
auto& scope = ctx.scope();
int data_type = -1;
for (auto& input : this->inputs_) {
for (auto& ipt_name : input.second) {
auto* var = scope.FindVar(ipt_name);
if (var != nullptr) {
const Tensor* t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
} else if (var->IsType<SelectedRows>()) {
t = &(var->Get<SelectedRows>().value());
}
if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type()));
PADDLE_ENFORCE(tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same.", Type());
data_type = tmp;
}
}
}
}
PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
return static_cast<DataType>(data_type);
} }
} // namespace framework } // namespace framework
......
...@@ -298,11 +298,10 @@ class ExecutionContext { ...@@ -298,11 +298,10 @@ class ExecutionContext {
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
const platform::CUDADeviceContext& cuda_device_context() const { const inline platform::CUDADeviceContext& cuda_device_context() const {
PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
auto cuda_ctx = return *reinterpret_cast<const platform::CUDADeviceContext*>(
reinterpret_cast<const platform::CUDADeviceContext*>(&device_context_); &device_context_);
return *cuda_ctx;
} }
#endif #endif
...@@ -346,27 +345,10 @@ class OpKernel : public OpKernelBase { ...@@ -346,27 +345,10 @@ class OpKernel : public OpKernelBase {
using ELEMENT_TYPE = T; using ELEMENT_TYPE = T;
}; };
class OperatorWithKernel : public OperatorBase { struct OpKernelType {
public: struct Hash {
struct OpKernelKey {
platform::Place place_;
DataType data_type_;
OpKernelKey(DataType data_type, platform::Place place)
: place_(place), data_type_(data_type) {}
OpKernelKey(DataType data_type, const platform::DeviceContext& dev_ctx)
: place_(dev_ctx.GetPlace()), data_type_(data_type) {}
bool operator==(const OpKernelKey& o) const {
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_;
}
};
struct OpKernelHash {
std::hash<int> hash_; std::hash<int> hash_;
size_t operator()(const OpKernelKey& key) const { size_t operator()(const OpKernelType& key) const {
int place = key.place_.which(); int place = key.place_.which();
int data_type = static_cast<int>(key.data_type_); int data_type = static_cast<int>(key.data_type_);
int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT | int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
...@@ -375,9 +357,26 @@ class OperatorWithKernel : public OperatorBase { ...@@ -375,9 +357,26 @@ class OperatorWithKernel : public OperatorBase {
} }
}; };
platform::Place place_;
DataType data_type_;
OpKernelType(DataType data_type, platform::Place place)
: place_(place), data_type_(data_type) {}
OpKernelType(DataType data_type, const platform::DeviceContext& dev_ctx)
: place_(dev_ctx.GetPlace()), data_type_(data_type) {}
bool operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_;
}
};
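The hash above packs the place index into the low bits and the data type into the bits above them before hashing the combined int; a standalone illustration of the same packing, with a constant of ours standing in for `NUM_PLACE_TYPE_LIMIT_IN_BIT`:

```
#include <cstddef>
#include <functional>

constexpr int kPlaceBits = 4;  // stand-in for NUM_PLACE_TYPE_LIMIT_IN_BIT

size_t HashKernelKey(int data_type, int place) {
  // data_type and place occupy disjoint bit ranges of pre_hash.
  int pre_hash = (data_type << kPlaceBits) | place;
  return std::hash<int>()(pre_hash);
}
```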
class OperatorWithKernel : public OperatorBase {
public:
using OpKernelMap = using OpKernelMap =
std::unordered_map<OpKernelKey, std::unique_ptr<OpKernelBase>, std::unordered_map<OpKernelType, std::unique_ptr<OpKernelBase>,
OpKernelHash>; OpKernelType::Hash>;
OperatorWithKernel(const std::string& type, const VariableNameMap& inputs, OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs) const VariableNameMap& outputs, const AttributeMap& attrs)
...@@ -405,42 +404,15 @@ class OperatorWithKernel : public OperatorBase { ...@@ -405,42 +404,15 @@ class OperatorWithKernel : public OperatorBase {
} }
protected: protected:
virtual OpKernelType GetKernelType(const ExecutionContext& ctx) const;
private:
// indicate kernel DataType by input data. By default all input data must be // indicate kernel DataType by input data. By default all input data must be
// the same. // the same.
virtual DataType IndicateDataType(const ExecutionContext& ctx) const { DataType IndicateDataType(const ExecutionContext& ctx) const;
VLOG(3) << "Default IndicateDataType " << this->Type();
auto& scope = ctx.scope();
int data_type = -1;
for (auto& input : this->inputs_) {
for (auto& ipt_name : input.second) {
auto* var = scope.FindVar(ipt_name);
if (var != nullptr) {
const Tensor* t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
} else if (var->IsType<SelectedRows>()) {
t = &(var->Get<SelectedRows>().value());
}
if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type()));
VLOG(3) << "Input " << ipt_name << " with data_type " << tmp;
PADDLE_ENFORCE(tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same.",
Type());
data_type = tmp;
}
}
}
}
PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
return static_cast<DataType>(data_type);
}
}; };
std::ostream& operator<<(std::ostream& os, std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key);
const OperatorWithKernel::OpKernelKey& kernel_key);
extern bool OpSupportGPU(const std::string& op_type); extern bool OpSupportGPU(const std::string& op_type);
......
...@@ -114,8 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel { ...@@ -114,8 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContext* ctx) const override {} void InferShape(framework::InferShapeContext* ctx) const override {}
DataType IndicateDataType(const ExecutionContext& ctx) const override { OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
return DataType::FP32; return OpKernelType(DataType::FP32, ctx.device_context());
} }
}; };
......
...@@ -46,6 +46,23 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names, ...@@ -46,6 +46,23 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
SetDim(names[i], dims[i]); SetDim(names[i], dims[i]);
} }
} }
std::vector<VarDesc::VarType> InferShapeContext::GetInputsVarType(
const std::string &name) const {
return GetVarTypes(Inputs(name));
}
std::vector<VarDesc::VarType> InferShapeContext::GetOutputsVarType(
const std::string &name) const {
return GetVarTypes(Outputs(name));
}
std::vector<VarDesc::VarType> InferShapeContext::GetVarTypes(
const std::vector<std::string> &names) const {
std::vector<VarDesc::VarType> retv;
retv.resize(names.size());
std::transform(names.begin(), names.end(), retv.begin(),
std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
std::placeholders::_1));
return retv;
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/framework/attribute.h" #include "paddle/framework/attribute.h"
#include "paddle/framework/ddim.h" #include "paddle/framework/ddim.h"
#include "paddle/framework/framework.pb.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -26,6 +27,10 @@ class InferShapeContext { ...@@ -26,6 +27,10 @@ class InferShapeContext {
virtual bool HasInput(const std::string &name) const = 0; virtual bool HasInput(const std::string &name) const = 0;
virtual bool HasOutput(const std::string &name) const = 0; virtual bool HasOutput(const std::string &name) const = 0;
std::vector<VarDesc::VarType> GetInputsVarType(const std::string &name) const;
std::vector<VarDesc::VarType> GetOutputsVarType(
const std::string &name) const;
virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0;
...@@ -46,6 +51,8 @@ class InferShapeContext { ...@@ -46,6 +51,8 @@ class InferShapeContext {
virtual void ShareLoD(const std::string &in, const std::string &out, virtual void ShareLoD(const std::string &in, const std::string &out,
size_t i = 0, size_t j = 0) const = 0; size_t i = 0, size_t j = 0) const = 0;
virtual bool IsRuntime() const = 0;
protected: protected:
virtual framework::DDim GetDim(const std::string &name) const = 0; virtual framework::DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
...@@ -55,6 +62,11 @@ class InferShapeContext { ...@@ -55,6 +62,11 @@ class InferShapeContext {
void SetDims(const std::vector<std::string> &names, void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims); const std::vector<framework::DDim> &dims);
std::vector<VarDesc::VarType> GetVarTypes(
const std::vector<std::string> &names) const;
virtual VarDesc::VarType GetVarType(const std::string &name) const = 0;
}; };
} // namespace framework } // namespace framework
......
...@@ -52,7 +52,7 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> { ...@@ -52,7 +52,7 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
}; };
static inline size_t SizeOfType(std::type_index type) { static inline size_t SizeOfType(std::type_index type) {
SizeOfTypeFunctor<int, float, double, int16_t, int64_t> functor; SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool> functor;
size_t size = functor(type); size_t size = functor(type);
PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
return size; return size;
......
...@@ -37,13 +37,29 @@ std::vector<int64_t> VarDescBind::Shape() const { ...@@ -37,13 +37,29 @@ std::vector<int64_t> VarDescBind::Shape() const {
DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); } DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); }
void VarDescBind::SetLoDLevel(int32_t lod_level) { void VarDescBind::SetLoDLevel(int32_t lod_level) {
PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR); switch (desc_.type()) {
desc_.mutable_lod_tensor()->set_lod_level(lod_level); case VarDesc::LOD_TENSOR:
desc_.mutable_lod_tensor()->set_lod_level(lod_level);
break;
case VarDesc::LOD_TENSOR_ARRAY:
desc_.mutable_tensor_array()->set_lod_level(lod_level);
break;
default:
PADDLE_THROW("Tensor type=%d does not support LoDLevel",
desc_.tensor_array().lod_level());
}
} }
int32_t VarDescBind::GetLodLevel() const { int32_t VarDescBind::GetLodLevel() const {
PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR); switch (desc_.type()) {
return desc_.lod_tensor().lod_level(); case VarDesc::LOD_TENSOR:
return desc_.lod_tensor().lod_level();
case VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().lod_level();
default:
PADDLE_THROW("Tensor type=%d does not support LoDLevel",
desc_.tensor_array().lod_level());
}
} }
const TensorDesc &VarDescBind::tensor_desc() const { const TensorDesc &VarDescBind::tensor_desc() const {
...@@ -53,6 +69,8 @@ const TensorDesc &VarDescBind::tensor_desc() const { ...@@ -53,6 +69,8 @@ const TensorDesc &VarDescBind::tensor_desc() const {
return desc_.selected_rows(); return desc_.selected_rows();
case VarDesc::LOD_TENSOR: case VarDesc::LOD_TENSOR:
return desc_.lod_tensor().tensor(); return desc_.lod_tensor().tensor();
case VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().tensor();
default: default:
PADDLE_THROW("Unexpected branch."); PADDLE_THROW("Unexpected branch.");
} }
...@@ -66,6 +84,8 @@ TensorDesc *VarDescBind::mutable_tensor_desc() { ...@@ -66,6 +84,8 @@ TensorDesc *VarDescBind::mutable_tensor_desc() {
return desc_.mutable_selected_rows(); return desc_.mutable_selected_rows();
case VarDesc::LOD_TENSOR: case VarDesc::LOD_TENSOR:
return desc_.mutable_lod_tensor()->mutable_tensor(); return desc_.mutable_lod_tensor()->mutable_tensor();
case VarDesc::LOD_TENSOR_ARRAY:
return desc_.mutable_tensor_array()->mutable_tensor();
default: default:
PADDLE_THROW("Unexpected branch."); PADDLE_THROW("Unexpected branch.");
} }
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "glog/logging.h"
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
namespace paddle { namespace paddle {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
namespace paddle {
namespace framework {
inline VarDesc::VarType ToVarType(std::type_index type) {
if (type.hash_code() == typeid(LoDTensor).hash_code()) {
return VarDesc_VarType_LOD_TENSOR;
} else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
return VarDesc_VarType_LOD_RANK_TABLE;
} else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
return VarDesc_VarType_LOD_TENSOR_ARRAY;
} else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
}
}
} // namespace framework
} // namespace paddle
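Paired with the `Variable::Type()` accessor added below, `ToVarType` lets runtime code recover the proto-level tag from a variable, which is what `RuntimeInferShapeContext::GetVarType` above does; a minimal sketch:

```
#include "paddle/framework/var_type.h"
#include "paddle/framework/variable.h"

paddle::framework::VarDesc::VarType TagOf(
    const paddle::framework::Variable& var) {
  // Throws if var holds no value; maps e.g. LoDTensor -> VarDesc::LOD_TENSOR.
  return paddle::framework::ToVarType(var.Type());
}
```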
...@@ -48,6 +48,11 @@ class Variable { ...@@ -48,6 +48,11 @@ class Variable {
void Clear() { holder_.reset(); } void Clear() { holder_.reset(); }
std::type_index Type() const {
PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory");
return holder_->Type();
}
private: private:
struct Placeholder { struct Placeholder {
virtual ~Placeholder() {} virtual ~Placeholder() {}
......
...@@ -395,14 +395,24 @@ real AucEvaluator::evalImp(std::vector<Argument>& arguments) { ...@@ -395,14 +395,24 @@ real AucEvaluator::evalImp(std::vector<Argument>& arguments) {
CHECK_LE(arguments.size(), (size_t)3); CHECK_LE(arguments.size(), (size_t)3);
MatrixPtr output = arguments[0].value; MatrixPtr output = arguments[0].value;
IVectorPtr label = arguments[1].ids; IVectorPtr label = arguments[1].ids;
MatrixPtr labelval = arguments[1].value;
bool supportWeight = (3 == arguments.size()); bool supportWeight = (3 == arguments.size());
MatrixPtr weight = supportWeight ? arguments[2].value : nullptr; MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
if (nullptr == output || nullptr == label ||
(supportWeight && nullptr == weight)) { if (nullptr == output || (supportWeight && nullptr == weight)) {
return 0; return 0;
} }
size_t insNum = output->getHeight(); size_t insNum = output->getHeight();
size_t outputDim = output->getWidth(); size_t outputDim = output->getWidth();
// Copy label from value to a vector.
if (nullptr == label && nullptr != labelval) {
// label width is 1
CHECK_EQ(1U, labelval->getWidth());
VectorPtr vec =
Vector::create(labelval->getData(), insNum, output->useGpu());
label = vec->castToInt();
}
CHECK_EQ(insNum, label->getSize()); CHECK_EQ(insNum, label->getSize());
if (supportWeight) { if (supportWeight) {
CHECK_EQ(insNum, weight->getHeight()); CHECK_EQ(insNum, weight->getHeight());
...@@ -443,6 +453,7 @@ real AucEvaluator::evalImp(std::vector<Argument>& arguments) { ...@@ -443,6 +453,7 @@ real AucEvaluator::evalImp(std::vector<Argument>& arguments) {
int* labelD = label->getData(); int* labelD = label->getData();
real* weightD = supportWeight ? weight->getData() : nullptr; real* weightD = supportWeight ? weight->getData() : nullptr;
size_t pos = realColumnIdx_; size_t pos = realColumnIdx_;
for (size_t i = 0; i < insNum; ++i) { for (size_t i = 0; i < insNum; ++i) {
real value = outputD[pos]; real value = outputD[pos];
uint32_t binIdx = static_cast<uint32_t>(value * kBinNum_); uint32_t binIdx = static_cast<uint32_t>(value * kBinNum_);
......
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
ThreadLocalD<std::vector<MemoryHandle *>> ConvBaseProjection::convMem_; ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;
ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config, ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
ParameterPtr parameter, ParameterPtr parameter,
...@@ -175,18 +175,18 @@ void ConvBaseProjection::reshape(int batchSize) { ...@@ -175,18 +175,18 @@ void ConvBaseProjection::reshape(int batchSize) {
} }
void *ConvBaseProjection::getSpaceBytes(size_t size) { void *ConvBaseProjection::getSpaceBytes(size_t size) {
std::vector<MemoryHandle *> &convMem = *convMem_; std::vector<MemoryHandlePtr> &convMem = *convMem_;
if (convMem.empty()) { if (convMem.empty()) {
int numDevices = hl_get_device_count(); int numDevices = hl_get_device_count();
convMem.resize(numDevices); convMem.resize(numDevices);
} }
int devId = hl_get_device(); int devId = hl_get_device();
MemoryHandle **localMem = &(convMem[devId]); MemoryHandlePtr &localMem = convMem[devId];
if (NULL == *localMem || size > (*localMem)->getAllocSize()) { if (nullptr == localMem || size > localMem->getAllocSize()) {
*localMem = new GpuMemoryHandle(size); localMem = std::make_shared<GpuMemoryHandle>(size);
} }
return (*localMem)->getBuf(); return localMem->getBuf();
} }
ConvBaseProjection::~ConvBaseProjection() { ConvBaseProjection::~ConvBaseProjection() {
......
...@@ -105,7 +105,7 @@ protected: ...@@ -105,7 +105,7 @@ protected:
bool bias_; bool bias_;
std::unique_ptr<Weight> weight_; std::unique_ptr<Weight> weight_;
static ThreadLocalD<std::vector<MemoryHandle*>> convMem_; static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
}; };
} // namespace paddle } // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNAddtoLayer.h"
using namespace mkldnn; // NOLINT
namespace paddle {
REGISTER_LAYER(mkldnn_addto, MKLDNNAddtoLayer);
bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
return false;
}
layerSize_ = getSize();
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(layerSize_, inputLayers_[i]->getSize()) << "input sizes must be equal";
}
if (biasParameter_.get() != NULL) {
biases_ =
std::unique_ptr<Weight>(new Weight(1, layerSize_, biasParameter_, 0));
}
return true;
}
void MKLDNNAddtoLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
CHECK_EQ(layerSize_, getSize()) << "this layer size cannot be changed";
reshapeInput(bs, ih, iw);
ic = inputLayers_[0]->getSize() / ih / iw;
CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
}
oc = ic;
oh = ih;
ow = iw;
reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow);
printSizeInfo();
}
void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
if (biases_) {
LOG(FATAL) << "not implemented yet";
}
resetFwdBuffers(inVals_, out);
in = inVals_[0];
std::shared_ptr<sum::primitive_desc> fwdPD;
resetFwdPD(fwdPD, inVals_, out);
resetFwdPipeline(pipeline, fwdPD, inVals_, out);
}
void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetBwdBuffers(inGrads_, out);
in = inGrads_[0];
// backward only needs to share the output grad with the input grads
for (size_t i = 0; i < inGrads_.size(); i++) {
if (inGrads_[i] != nullptr) {
inGrads_[i] = out;
inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
}
}
}
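Since addto computes an element-wise sum of its inputs, the gradient with respect to every input equals the output gradient, which is why `resetBwd` can simply alias each input-grad buffer to the output grad instead of issuing copies.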
void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
if (biases_ && biases_->getWGrad()) {
biases_->getParameterPtr()->incUpdate(callback);
}
}
void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
resetInValue(inputs[i], nullptr, i);
CHECK(inputs[i]);
inputs[i]->downSpatial();
}
for (size_t i = 1; i < inputs.size(); i++) {
CHECK_PRIMITIVE_DESC_EQ(inputs[i], inputs[0]->getPrimitiveDesc());
}
resetOutValue(out, inputs[0]->getPrimitiveDesc());
}
void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out) {
std::vector<double> scales(inputs.size(), 1.0);
std::vector<memory::primitive_desc> srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
}
CHECK(out);
pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
}
void MKLDNNAddtoLayer::resetFwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::vector<primitive::at> srcs;
for (size_t i = 0; i < inputs.size(); i++) {
srcs.push_back(*(inputs[i]));
}
fwd_.reset(new sum(*pd, srcs, *out));
pipeline.push_back(*fwd_);
}
void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
CHECK(out);
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
}
}
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLDNNLayer.h"
#include "mkldnn.hpp"
namespace paddle {
/**
* @brief A subclass of MKLDNNLayer implementing the Addto layer.
*
* The config file api is mkldnn_addto
*/
class MKLDNNAddtoLayer : public MKLDNNLayer {
protected:
std::vector<MKLDNNMatrixPtr> inVals_;
std::vector<MKLDNNMatrixPtr> inGrads_;
// layer size == ic * ih * iw == oc * oh * ow, and cannot be changed
size_t layerSize_;
// TODO(TJ): this part has not been optimized by MKL-DNN
std::unique_ptr<Weight> biases_;
public:
explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
~MKLDNNAddtoLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateWeights(const UpdateCallback& callback) override;
void printValueFormat() override {
for (size_t i = 0; i < inVals_.size(); ++i) {
VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
}
if (outVal_) {
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
}
if (extOutVal_) {
VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
}
}
void printGradFormat() override {
if (extOutGrad_) {
VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
}
if (outGrad_) {
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
}
for (size_t i = 0; i < inGrads_.size(); ++i) {
VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
}
}
protected:
/**
* Forward functions: reset buffers(inputs, output, bias),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(inputs, output, bias)
*/
void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
};
} // namespace paddle
...@@ -60,18 +60,16 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() { ...@@ -60,18 +60,16 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
} }
CHECK(wgtVal_) << "should have been initialized"; CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims(); auto targetDim = wgtVal_->getDims();
auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo; auto srcFmt = targetDim.size() == 2 ? format::io : format::ihwo;
wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
hasInitedWgt_ = true; hasInitedWgt_ = true;
} }
void MKLDNNFcLayer::convertWeightsToPaddle() { void MKLDNNFcLayer::convertWeightsToPaddle() {
CHECK(wgtVal_) << "should have been initialized"; CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims(); auto targetDim = wgtVal_->getDims();
auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo; auto dstFmt = targetDim.size() == 2 ? format::io : format::ihwo;
wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
} }
......
...@@ -77,7 +77,7 @@ void MKLDNNLayer::forward(PassType passType) { ...@@ -77,7 +77,7 @@ void MKLDNNLayer::forward(PassType passType) {
needResetBwd_ = true; needResetBwd_ = true;
} }
if (inputLayers_[0]->getType() == "data") { if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
// Update input value data when input layer is "data" type, // Update input value data when input layer is "data" type,
// since the input value data address might be changed. // since the input value data address might be changed.
CHECK(extInVal_); CHECK(extInVal_);
...@@ -171,29 +171,27 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn, ...@@ -171,29 +171,27 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
} }
void MKLDNNLayer::resetInValue( void MKLDNNLayer::resetInValue(
MKLDNNMatrixPtr& in, const std::shared_ptr<memory::primitive_desc>& intPD) { MKLDNNMatrixPtr& in,
const std::shared_ptr<memory::primitive_desc>& intPD,
size_t inputIdx) {
cvtInVal_ = nullptr; cvtInVal_ = nullptr;
extInVal_ = nullptr; extInVal_ = nullptr;
in = nullptr; in = nullptr;
CHECK_GT(bs_ * ic_ * ih_ * iw_, 0); CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
auto extPD = MKLDNNMatrix::createPrimitiveDesc( auto extPD = MKLDNNMatrix::createPrimitiveDesc(
{bs_, ic_, ih_, iw_}, format::nchw, engine_); {bs_, ic_, ih_, iw_}, format::nchw, engine_);
const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); const MatrixPtr& inMat = inputLayers_[inputIdx]->getOutputValue();
in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat); extInVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr); CHECK_EQ(inputIsOnlyMKLDNN(), extInVal_ != nullptr);
if (in == nullptr || in->getFormat() == format::nc) { if (extInVal_ == nullptr || extInVal_->getFormat() == format::nc) {
in = MKLDNNMatrix::create(extPD, inMat); extInVal_ = MKLDNNMatrix::create(extPD, inMat);
}
extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
if (in->getFormat() == format::nc) {
CHECK(ih_ == 1 && iw_ == 1);
} }
in = extInVal_;
if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) { if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
return; return;
} }
// need to create a reorder // need to create a reorder
in = MKLDNNMatrix::create(*intPD); in = MKLDNNMatrix::create(*intPD);
extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in); cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
CHECK(cvtInVal_) << "should not be empty"; CHECK(cvtInVal_) << "should not be empty";
} }
...@@ -216,11 +214,12 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out, ...@@ -216,11 +214,12 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
} }
void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in, void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
memory::primitive_desc intPD) { memory::primitive_desc intPD,
size_t inputIdx) {
cvtInGrad_ = nullptr; cvtInGrad_ = nullptr;
extInGrad_ = nullptr; extInGrad_ = nullptr;
in = nullptr; in = nullptr;
LayerPtr& input = inputLayers_[0]; LayerPtr& input = inputLayers_[inputIdx];
if (input->getOutputGrad() == nullptr) { if (input->getOutputGrad() == nullptr) {
// no need input grad // no need input grad
return; return;
...@@ -245,7 +244,6 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in, ...@@ -245,7 +244,6 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
return; return;
} }
// need to create a reorder // need to create a reorder
// TODO(TJ): add macro definition to simplify it
CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat())) CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
<< "should have external input value and the format must be nchw(nc)"; << "should have external input value and the format must be nchw(nc)";
extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat); extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
......
...@@ -199,7 +199,8 @@ protected: ...@@ -199,7 +199,8 @@ protected:
*/ */
void resetInValue( void resetInValue(
MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& in,
const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr); const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
size_t inputIdx = 0);
/** /**
* reset output value from internal primitive desc. * reset output value from internal primitive desc.
...@@ -212,7 +213,9 @@ protected: ...@@ -212,7 +213,9 @@ protected:
* reset input grad from internal primitive desc. * reset input grad from internal primitive desc.
* reset both internal and external buffer and create reorder if necessary. * reset both internal and external buffer and create reorder if necessary.
*/ */
void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD); void resetInGrad(MKLDNNMatrixPtr& in,
mkldnn::memory::primitive_desc intPD,
size_t inputIdx = 0);
/** /**
* reset output grad from internal primitive desc. * reset output grad from internal primitive desc.
......
...@@ -98,8 +98,19 @@ void SubSequenceLayer::forward(PassType passType) { ...@@ -98,8 +98,19 @@ void SubSequenceLayer::forward(PassType passType) {
CHECK_EQ(numSequences2, numSequences3); CHECK_EQ(numSequences2, numSequences3);
MatrixPtr inputValue = input.value; MatrixPtr inputValue = input.value;
IVectorPtr offsetValue = offsetSeq.ids; IVectorPtr offsetValue;
IVectorPtr sizeValue = sizeSeq.ids; IVectorPtr sizeValue;
if (useGpu_) {
// copy to cpu
IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
offsetValue->copyFrom(*offsetSeq.ids);
sizeValue->copyFrom(*sizeSeq.ids);
} else {
offsetValue = offsetSeq.ids;
sizeValue = sizeSeq.ids;
}
CHECK_EQ(offsetValue->getSize(), numSequences1); CHECK_EQ(offsetValue->getSize(), numSequences1);
CHECK_EQ(sizeValue->getSize(), numSequences1); CHECK_EQ(sizeValue->getSize(), numSequences1);
...@@ -176,8 +187,21 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) { ...@@ -176,8 +187,21 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) {
size_t numSequences1 = startPositions1->getSize() - 1; size_t numSequences1 = startPositions1->getSize() - 1;
const int* starts1 = startPositions1->getData(); const int* starts1 = startPositions1->getData();
IVectorPtr offsetValue = getInput(1).ids; const Argument& offsetSeq = getInput(1);
IVectorPtr sizeValue = getInput(2).ids; const Argument& sizeSeq = getInput(2);
IVectorPtr offsetValue;
IVectorPtr sizeValue;
if (useGpu_) {
// copy to cpu
IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
offsetValue->copyFrom(*offsetSeq.ids);
sizeValue->copyFrom(*sizeSeq.ids);
} else {
offsetValue = offsetSeq.ids;
sizeValue = sizeSeq.ids;
}
int* offsets = offsetValue->getData(); int* offsets = offsetValue->getData();
int* sizes = sizeValue->getData(); int* sizes = sizeValue->getData();
......
...@@ -132,7 +132,7 @@ void MKLDNNTester::checkForward() { ...@@ -132,7 +132,7 @@ void MKLDNNTester::checkForward() {
VLOG(MKLDNN_TESTS) << "Check Forward"; VLOG(MKLDNN_TESTS) << "Check Forward";
printTopDatas(); printTopDatas();
double delta = double delta =
compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue()); compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue());
EXPECT_LE(fabs(delta), eps_); EXPECT_LE(fabs(delta), eps_);
} }
...@@ -147,7 +147,7 @@ void MKLDNNTester::checkBackwardData() { ...@@ -147,7 +147,7 @@ void MKLDNNTester::checkBackwardData() {
VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i; VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
printMatrix(refDiff); printMatrix(refDiff);
double delta = compareMatrix(dnnDiff, refDiff); double delta = compareMatrix(refDiff, dnnDiff);
EXPECT_LE(fabs(delta), eps_); EXPECT_LE(fabs(delta), eps_);
if (isBN) { if (isBN) {
// the other two inputs in batch norm are for moving mean and var // the other two inputs in batch norm are for moving mean and var
...@@ -177,7 +177,7 @@ void MKLDNNTester::checkBackwardWgts() { ...@@ -177,7 +177,7 @@ void MKLDNNTester::checkBackwardWgts() {
<< parameters_[REF][i]->getName(); << parameters_[REF][i]->getName();
printVector(ref); printVector(ref);
double delta = compareVector(dnn, ref); double delta = compareVector(ref, dnn);
EXPECT_LE(fabs(delta), eps_); EXPECT_LE(fabs(delta), eps_);
} }
......
...@@ -271,20 +271,53 @@ TEST(MKLDNNLayer, BatchNormLayer) { ...@@ -271,20 +271,53 @@ TEST(MKLDNNLayer, BatchNormLayer) {
testBatchNormLayer({16, 32, 16, 16}); testBatchNormLayer({16, 32, 16, 16});
} }
struct testActDesc { struct testImageDesc {
int bs, ic, ih, iw; int bs, ic, ih, iw;
}; };
static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) { static void getAddtoConfig(TestConfig& cfg,
const testImageDesc& pm,
const size_t nInputs = 1) {
cfg.biasSize = 0; cfg.biasSize = 0;
cfg.layerConfig.set_type("addto"); cfg.layerConfig.set_type("addto");
size_t layerSize = pm.ic * pm.ih * pm.iw; size_t layerSize = pm.ic * pm.ih * pm.iw;
cfg.layerConfig.set_size(layerSize); cfg.layerConfig.set_size(layerSize);
cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0}); cfg.layerConfig.set_active_type("relu");
cfg.layerConfig.add_inputs(); for (size_t i = 0; i < nInputs; ++i) {
std::stringstream ss;
ss << "layer_" << i;
cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0});
LayerInputConfig* input = cfg.layerConfig.add_inputs();
ImageConfig* img_conf = input->mutable_image_conf();
img_conf->set_channels(pm.ic);
img_conf->set_img_size_y(pm.ih);
img_conf->set_img_size(pm.iw);
}
}
void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
CHECK_GE(nInputs, 1);
TestConfig dnnConfig;
getAddtoConfig(dnnConfig, pm, nInputs);
dnnConfig.layerConfig.set_type("mkldnn_addto");
// TODO(TJ): test with bias
for (auto withBias : {false}) {
if (withBias) {
dnnConfig.biasSize = pm.ic * pm.ih * pm.iw;
} else {
dnnConfig.biasSize = 0;
}
RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
}
}
TEST(MKLDNNLayer, AddtoLayer) {
testAddtoLayer({16, 5, 14, 14}, 1);
testAddtoLayer({8, 10, 8, 8}, 2);
testAddtoLayer({4, 12, 1, 1}, 3);
} }
void testActivation(std::string actType, const testActDesc& pm) { void testActivation(std::string actType, const testImageDesc& pm) {
// TODO(TJ): remove me when paddle support elu activation // TODO(TJ): remove me when paddle support elu activation
if (actType == "mkldnn_elu") { if (actType == "mkldnn_elu") {
return; return;
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include "Matrix.h" #include "Matrix.h"
#include "hl_gpu.h" #include "hl_gpu.h"
#include "hl_matrix.h"
#include "hl_table_apply.h" #include "hl_table_apply.h"
#include "paddle/utils/Flags.h" #include "paddle/utils/Flags.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
...@@ -99,6 +100,19 @@ MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) { ...@@ -99,6 +100,19 @@ MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
return mat; return mat;
} }
template <>
std::shared_ptr<VectorT<int>> VectorT<real>::castToInt() {
std::shared_ptr<VectorT<int>> ret = IVector::create(this->getSize(), useGpu_);
if (useGpu_) {
hl_vector_cast2int(ret->getData(), this->getData(), this->getSize());
} else {
for (size_t i = 0; i < getSize(); ++i) {
ret->getData()[i] = int(this->getData()[i]);
}
}
return ret;
}
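A hypothetical CPU-side usage sketch of `castToInt`, assuming Paddle's usual `real`, `VectorPtr`, and `IVectorPtr` typedefs; sizes and values are ours:

```
// Build a real-valued vector {0, 1, 0, 1} and cast it element-wise to int.
VectorPtr labels = Vector::create(/*size=*/4, /*useGpu=*/false);
for (size_t i = 0; i < labels->getSize(); ++i) {
  labels->getData()[i] = static_cast<real>(i % 2);
}
IVectorPtr intLabels = labels->castToInt();
```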
template <class T> template <class T>
GpuVectorT<T>::GpuVectorT(size_t size) GpuVectorT<T>::GpuVectorT(size_t size)
: VectorT<T>(size, : VectorT<T>(size,
......
...@@ -162,6 +162,13 @@ public: ...@@ -162,6 +162,13 @@ public:
*/ */
std::shared_ptr<Matrix> toOneHotSparseMatrix(size_t idRange, bool useGpu); std::shared_ptr<Matrix> toOneHotSparseMatrix(size_t idRange, bool useGpu);
/**
* @brief cast vector of "real" elements to "int" elements.
*
* @note: float -> int must be casted, or you'll get wrong data.
*/
std::shared_ptr<VectorT<int>> castToInt();
/** /**
* This function will crash if the size of src and dest is different. * This function will crash if the size of src and dest is different.
*/ */
......
...@@ -62,6 +62,11 @@ function(op_library TARGET) ...@@ -62,6 +62,11 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(pool2d);\n") file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
endif() endif()
if ("${TARGET}" STREQUAL "compare_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
endif()
# pool_with_index_op contains several operators # pool_with_index_op contains several operators
if ("${TARGET}" STREQUAL "pool_with_index_op") if ("${TARGET}" STREQUAL "pool_with_index_op")
set(pybind_flag 1) set(pybind_flag 1)
...@@ -69,6 +74,20 @@ function(op_library TARGET) ...@@ -69,6 +74,20 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n") file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
endif() endif()
# conv_op contains several operators
if ("${TARGET}" STREQUAL "conv_op")
set(pybind_flag 1)
# It's enough to add just one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
endif()
# conv_transpose_op contains several operators
if ("${TARGET}" STREQUAL "conv_transpose_op")
set(pybind_flag 1)
# It's enough to add just one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
endif()
# pool_cudnn_op contains several operators # pool_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "pool_cudnn_op") if ("${TARGET}" STREQUAL "pool_cudnn_op")
set(pybind_flag 1) set(pybind_flag 1)
...@@ -96,7 +115,7 @@ function(op_library TARGET) ...@@ -96,7 +115,7 @@ function(op_library TARGET)
# It's enough to add just one operator to pybind # It's enough to add just one operator to pybind
file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n") file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
endif() endif()
# reduce_op contains several operators # reduce_op contains several operators
if ("${TARGET}" STREQUAL "reduce_op") if ("${TARGET}" STREQUAL "reduce_op")
set(pybind_flag 1) set(pybind_flag 1)
...@@ -104,6 +123,11 @@ function(op_library TARGET) ...@@ -104,6 +123,11 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n") file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
endif() endif()
if ("${TARGET}" STREQUAL "tensor_array_read_write_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n")
endif()
# pybind USE_NO_KERNEL_OP # pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel # HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel
file(READ ${TARGET}.cc TARGET_CONTENT) file(READ ${TARGET}.cc TARGET_CONTENT)
...@@ -139,24 +163,42 @@ set(DEPS_OPS ...@@ -139,24 +163,42 @@ set(DEPS_OPS
sum_op sum_op
pool_op pool_op
pool_with_index_op pool_with_index_op
conv_op
lstm_op
conv_transpose_op
nccl_op nccl_op
sequence_conv_op sequence_conv_op
lstm_op) sequence_pool_op
lod_rank_table_op
lod_tensor_to_array_op
array_to_lod_tensor_op
lstm_op
tensor_array_read_write_op
gru_op)
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(conv_op DEPS vol2col)
op_library(sum_op DEPS net_op selected_rows_functor) op_library(sum_op DEPS net_op selected_rows_functor)
op_library(pool_op DEPS pooling) op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling) op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
if(WITH_GPU)
  op_library(nccl_op DEPS nccl_common)
endif()
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
           DEPS net_op tensor_array)
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
  op_library(${src})
......
...@@ -33,7 +33,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
auto inference_dim = ctx->GetInputDim("Out");
auto label_dim = ctx->GetInputDim("Label");
// Assume indices has same shape as inference, because
// it's the output of topk.
PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2.");
...@@ -47,10 +47,11 @@ class AccuracyOp : public framework::OperatorWithKernel {
}

protected:
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
        ctx.device_context());
  }
};
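This hunk is one instance of a refactor that recurs through the rest of this patch (see also auc_op and batch_norm_op below): operators now describe their kernel with a full framework::OpKernelType, pairing the inferred data type with the current device context, instead of returning a bare framework::DataType from IndicateDataType. The post-refactor pattern, written out once as a sketch ("X" stands for whichever input the operator keys its kernel on):

    // Pattern used throughout this patch: pick the kernel from an input's
    // data type plus the device context the op is running on.
    framework::OpKernelType GetKernelType(
        const framework::ExecutionContext &ctx) const override {
      return framework::OpKernelType(
          framework::ToDataType(ctx.Input<Tensor>("X")->type()),
          ctx.device_context());
    }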
...@@ -60,20 +61,24 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
                framework::OpAttrChecker *op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  // TODO(typhoonzero): support both inference value and indices.
  AddInput("Out", "The network output of topk (inferences)");
  AddInput("Indices", "The network output of topk (indices)");
  AddInput("Label", "Label of the training data");
  // TODO(typhoonzero): AddInput("Weight", ...
  AddOutput("Accuracy", "The accuracy of current batch");
  AddComment(R"DOC(
Accuracy Operator.

It will print the accuracy rate for classification.
The accuracy is calculated as follows:

$$accuracy = \frac{NumOfCorrectPredicts}{NumOfAllSamples}$$

Both the input Out and Label can carry the LoD (Level of Details)
information, or not. But the output only shares the LoD information
with the input Out(Inference).

)DOC");
}
};
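As a worked example of the formula above, a framework-free sketch (hypothetical helper, not part of the patch) that scores top-k indices against labels:

    #include <cstdint>
    #include <vector>
    // Counts a sample as correct if its label appears among its top-k indices.
    float Accuracy(const std::vector<std::vector<int64_t>>& topk_indices,
                   const std::vector<int64_t>& labels) {
      if (labels.empty()) return 0.0f;
      int num_correct = 0;
      for (size_t i = 0; i < labels.size(); ++i) {
        for (int64_t idx : topk_indices[i]) {
          if (idx == labels[i]) { ++num_correct; break; }
        }
      }
      return static_cast<float>(num_correct) / labels.size();
    }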
......
...@@ -72,11 +72,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
}
AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
    1, PADDLE_CUDA_NUM_THREADS, 0, ctx.cuda_device_context().stream()>>>(
    num_samples, infer_width, indices_data, label_data, accuracy_data);
}
};
......
...@@ -43,7 +43,12 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Sigmoid operator");
  AddOutput("Y", "Output of Sigmoid operator");
  AddComment(R"DOC(
Sigmoid Activation Operator.

$y = 1 / (1 + e^{-x})$

)DOC");
}
};
...@@ -54,8 +59,12 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of LogSigmoid operator");
  AddOutput("Y", "Output of LogSigmoid operator");
  AddComment(R"DOC(
Logsigmoid Activation Operator.

$y = \log(1 / (1 + e^{-x}))$

)DOC");
}
};
...@@ -65,7 +74,12 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Exp operator");
  AddOutput("Y", "Output of Exp operator");
  AddComment(R"DOC(
Exp Activation Operator.

$y = e^x$

)DOC");
}
};
...@@ -75,7 +89,12 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Relu operator");
  AddOutput("Y", "Output of Relu operator");
  AddComment(R"DOC(
Relu Activation Operator.

$y = \max(x, 0)$

)DOC");
}
};
...@@ -87,11 +106,14 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of LeakyRelu operator");
  AddOutput("Y", "Output of LeakyRelu operator");
  AddAttr<AttrType>("alpha", "The small negative slope")
      .SetDefault(static_cast<AttrType>(0.02f));
  AddComment(R"DOC(
LeakyRelu Activation Operator.

$y = \max(x, \alpha * x)$

)DOC");
}
};
...@@ -103,12 +125,20 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Softshrink operator");
  AddOutput("Y", "Output of Softshrink operator");
  AddAttr<AttrType>("lambda", "non-negative offset")
      .SetDefault(static_cast<AttrType>(0.5f));
  AddComment(R"DOC(
Softshrink Activation Operator.

$$
y = \begin{cases}
    x - \lambda, \text{if } x > \lambda \\
    x + \lambda, \text{if } x < -\lambda \\
    0, \text{otherwise}
    \end{cases}
$$

)DOC");
}
};
...@@ -118,9 +148,12 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Tanh operator");
  AddOutput("Y", "Output of Tanh operator");
  AddComment(R"DOC(
Tanh Activation Operator.

$$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

)DOC");
}
};
...@@ -131,7 +164,12 @@ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of TanhShrink operator");
  AddOutput("Y", "Output of TanhShrink operator");
  AddComment(R"DOC(
TanhShrink Activation Operator.

$$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

)DOC");
}
};
...@@ -143,13 +181,20 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of HardShrink operator");
  AddOutput("Y", "Output of HardShrink operator");
  AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
      .SetDefault(static_cast<AttrType>(0.5));
  AddComment(R"DOC(
HardShrink Activation Operator.

$$
y = \begin{cases}
    x, \text{if } x > \lambda \\
    x, \text{if } x < -\lambda \\
    0, \text{otherwise}
    \end{cases}
$$

)DOC");
}
};
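Read as code, the piecewise definition above is just a pass-through outside the [-lambda, lambda] band. A framework-free sketch (threshold plays the role of lambda, matching the attr above):

    // hard_shrink: zero out values inside [-threshold, threshold].
    float HardShrink(float x, float threshold) {
      return (x > threshold || x < -threshold) ? x : 0.0f;
    }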
...@@ -159,7 +204,12 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Sqrt operator");
  AddOutput("Y", "Output of Sqrt operator");
  AddComment(R"DOC(
Sqrt Activation Operator.

$y = \sqrt{x}$

)DOC");
}
};
...@@ -169,7 +219,12 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Abs operator");
  AddOutput("Y", "Output of Abs operator");
  AddComment(R"DOC(
Abs Activation Operator.

$y = |x|$

)DOC");
}
};
...@@ -180,7 +235,12 @@ class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Reciprocal operator");
  AddOutput("Y", "Output of Reciprocal operator");
  AddComment(R"DOC(
Reciprocal Activation Operator.

$$y = \frac{1}{x}$$

)DOC");
}
};
...@@ -190,7 +250,14 @@ class LogOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Log operator");
  AddOutput("Y", "Output of Log operator");
  AddComment(R"DOC(
Log Activation Operator.

$y = \ln(x)$

Natural logarithm of x.

)DOC");
}
};
...@@ -200,7 +267,12 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Square operator");
  AddOutput("Y", "Output of Square operator");
  AddComment(R"DOC(
Square Activation Operator.

$y = x^2$

)DOC");
}
};
...@@ -211,7 +283,12 @@ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Softplus operator");
  AddOutput("Y", "Output of Softplus operator");
  AddComment(R"DOC(
Softplus Activation Operator.

$y = \ln(1 + e^{x})$

)DOC");
}
};
...@@ -222,7 +299,12 @@ class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Softsign operator");
  AddOutput("Y", "Output of Softsign operator");
  AddComment(R"DOC(
Softsign Activation Operator.

$$y = \frac{x}{1 + |x|}$$

)DOC");
}
};
...@@ -233,11 +315,16 @@ class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of BRelu operator");
  AddOutput("Y", "Output of BRelu operator");
  AddAttr<AttrType>("t_min", "The min marginal value of BRelu")
      .SetDefault(static_cast<AttrType>(0));
  AddAttr<AttrType>("t_max", "The max marginal value of BRelu")
      .SetDefault(static_cast<AttrType>(24));
  AddComment(R"DOC(
BRelu Activation Operator.

$y = \max(\min(x, t_{max}), t_{min})$

)DOC");
}
};
...@@ -249,11 +336,14 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of SoftRelu operator");
  AddOutput("Y", "Output of SoftRelu operator");
  AddAttr<AttrType>("threshold", "The threshold value of SoftRelu")
      .SetDefault(static_cast<AttrType>(40));
  AddComment(R"DOC(
SoftRelu Activation Operator.

$y = \ln(1 + e^{\max(\min(x, threshold), -threshold)})$

)DOC");
}
};
...@@ -262,19 +352,19 @@ class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
public:
  ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of ELU operator");
    AddOutput("Y", "Output of ELU operator");
    AddAttr<AttrType>("alpha", "The alpha value of ELU")
        .SetDefault(static_cast<AttrType>(1.0f));
    AddComment(R"DOC(
ELU Activation Operator.

Applies the following element-wise computation on the input according to
https://arxiv.org/abs/1511.07289.

$y = \max(0, x) + \min(0, \alpha * (e^x - 1))$

)DOC");
  }
};
...@@ -285,9 +375,14 @@ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Relu6 operator");
  AddOutput("Y", "Output of Relu6 operator");
  AddAttr<AttrType>("threshold", "The threshold value of Relu6")
      .SetDefault(static_cast<AttrType>(6));
  AddComment(R"DOC(
Relu6 Activation Operator.

$y = \min(\max(0, x), 6)$

)DOC");
}
};
...@@ -298,9 +393,14 @@ class PowOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Pow operator");
  AddOutput("Y", "Output of Pow operator");
  AddAttr<AttrType>("factor", "The exponential factor of Pow")
      .SetDefault(static_cast<AttrType>(1));
  AddComment(R"DOC(
Pow Activation Operator.

$y = x^{factor}$

)DOC");
}
};
...@@ -311,11 +411,16 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of STanh operator");
  AddOutput("Y", "Output of STanh operator");
  AddAttr<AttrType>("scale_a", "The scale parameter of a for the input")
      .SetDefault(static_cast<AttrType>(2.0f / 3.0f));
  AddAttr<AttrType>("scale_b", "The scale parameter of b for the input")
      .SetDefault(static_cast<AttrType>(1.7159));
  AddComment(R"DOC(
STanh Activation Operator.

$$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$

)DOC");
}
};
...@@ -327,12 +432,19 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of ThresholdedRelu operator");
  AddOutput("Y", "Output of ThresholdedRelu operator");
  AddAttr<AttrType>("threshold", "The threshold location of activation")
      .SetDefault(static_cast<AttrType>(1.0));
  AddComment(R"DOC(
ThresholdedRelu Activation Operator.

$$
y = \begin{cases}
    x, \text{if } x > threshold \\
    0, \text{otherwise}
    \end{cases}
$$

)DOC");
}
};
...@@ -344,27 +456,23 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of HardSigmoid operator");
  AddOutput("Y", "Output of HardSigmoid operator");
  AddAttr<AttrType>("slope", "Slope for linear approximation of sigmoid")
      .SetDefault(static_cast<AttrType>(0.2));
  AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
      .SetDefault(static_cast<AttrType>(0.5));
  AddComment(R"DOC(
HardSigmoid Activation Operator.

Segment-wise linear approximation of sigmoid (https://arxiv.org/abs/1603.00391),
which is much faster than sigmoid.

$y = \max(0, \min(1, slope * x + offset))$

The slope should be positive. The offset can be either positive or negative.
The default slope and offset are set according to the above reference.
It is recommended to use the defaults for this activation.

)DOC");
}
};
......
...@@ -232,7 +232,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
}
};

// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
// otherwise
template <typename T>
struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
......
...@@ -64,16 +64,15 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("Param", "(Tensor) Input parameter");
  AddInput("Grad", "(Tensor) Input gradient");
  AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
  AddInput("AvgSquaredUpdate",
           "(Tensor) Input average of squared parameter updates");
  AddOutput("ParamOut", "(Tensor) Output parameter");
  AddOutput("AvgSquaredGradOut",
            "(Tensor) Output average of squared gradient");
  AddOutput("AvgSquaredUpdateOut",
            "(Tensor) Output average of squared parameter updates");
  AddAttr<float>("rho",
                 "(float, default 0.95) Exponential decay rate "
...@@ -84,22 +83,21 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
                 "numerical stability")
      .SetDefault(1.0e-6f);
  AddComment(R"DOC(
Adadelta Optimizer.

Adadelta optimizer is implemented as explained in:
https://arxiv.org/abs/1212.5701
Adadelta is a per-dimension adaptive learning rate method used
for gradient descent.

Adadelta updates are as follows:

$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break
paramUpdate = -\sqrt{(avgSquaredUpdate + \epsilon) / (avgSquaredGradOut + \epsilon)} * grad \break
avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) * (paramUpdate)^2 \break
paramOut = param + paramUpdate$$

)DOC");
}
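As a plain-C++ reading of the update above (a minimal sketch, not the operator kernel; the names are the document's math symbols):

    #include <cmath>
    #include <vector>
    // One Adadelta step over flat arrays; rho and epsilon as in the attrs above.
    void AdadeltaStep(std::vector<float>& param, const std::vector<float>& grad,
                      std::vector<float>& avg_sq_grad,
                      std::vector<float>& avg_sq_update,
                      float rho = 0.95f, float epsilon = 1.0e-6f) {
      for (size_t i = 0; i < param.size(); ++i) {
        avg_sq_grad[i] = rho * avg_sq_grad[i] + (1 - rho) * grad[i] * grad[i];
        float update = -std::sqrt((avg_sq_update[i] + epsilon) /
                                  (avg_sq_grad[i] + epsilon)) * grad[i];
        avg_sq_update[i] = rho * avg_sq_update[i] + (1 - rho) * update * update;
        param[i] += update;
      }
    }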
......
...@@ -73,12 +73,16 @@ class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
Adaptive Gradient Algorithm (Adagrad).

The update is done as follows:

$$momentOut = moment + grad * grad \break
paramOut = param - learningRate * grad / (\sqrt{momentOut} + \epsilon)$$

The original paper (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
does not have the epsilon attribute. It is added here in our implementation
as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
for numerical stability to avoid the division by zero error.

)DOC");
}
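A corresponding sketch in the same style as the Adadelta snippet above (hypothetical helper, not the kernel):

    #include <cmath>
    #include <vector>
    // One Adagrad step over flat arrays.
    void AdagradStep(std::vector<float>& param, const std::vector<float>& grad,
                     std::vector<float>& moment, float learning_rate,
                     float epsilon = 1.0e-6f) {
      for (size_t i = 0; i < param.size(); ++i) {
        moment[i] += grad[i] * grad[i];
        param[i] -= learning_rate * grad[i] / (std::sqrt(moment[i]) + epsilon);
      }
    }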
......
...@@ -51,8 +51,8 @@ class AdamOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
                  "Beta1 power accumulator should have 1 dimension");
auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
                  "Beta2 power accumulator should have 1 dimension");
auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
...@@ -60,10 +60,10 @@ class AdamOp : public framework::OperatorWithKernel {
    "Param and Grad input of AdamOp should have same dimension");
PADDLE_ENFORCE_EQ(
    param_dims, ctx->GetInputDim("Moment1"),
    "Param and Moment1 input of AdamOp should have same dimension");
PADDLE_ENFORCE_EQ(
    param_dims, ctx->GetInputDim("Moment2"),
    "Param and Moment2 input of AdamOp should have same dimension");

ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("Moment1Out", param_dims);
...@@ -103,23 +103,20 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
    .SetDefault(1.0e-8f);
AddComment(R"DOC(
Adam Optimizer.

This implements the Adam optimizer from Section 2 of the Adam
paper: https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.

Adam updates:

$$moment_{1,out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break
moment_{2,out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break
learningRate = learningRate * \sqrt{1 - \beta_{2,pow}} / (1 - \beta_{1,pow}) \break
paramOut = param - learningRate * moment_1 / (\sqrt{moment_2} + \epsilon)$$

)DOC");
}
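A flat-array reading of the four update lines (a sketch, not the operator kernel; beta1_pow and beta2_pow are the running powers tracked by the op's accumulator inputs):

    #include <cmath>
    #include <vector>
    // One Adam step; lr_t folds in the bias correction, as in the doc above.
    void AdamStep(std::vector<float>& param, const std::vector<float>& grad,
                  std::vector<float>& moment1, std::vector<float>& moment2,
                  float lr, float beta1_pow, float beta2_pow,
                  float beta1 = 0.9f, float beta2 = 0.999f,
                  float epsilon = 1.0e-8f) {
      const float lr_t = lr * std::sqrt(1 - beta2_pow) / (1 - beta1_pow);
      for (size_t i = 0; i < param.size(); ++i) {
        moment1[i] = beta1 * moment1[i] + (1 - beta1) * grad[i];
        moment2[i] = beta2 * moment2[i] + (1 - beta2) * grad[i] * grad[i];
        param[i] -= lr_t * moment1[i] / (std::sqrt(moment2[i]) + epsilon);
      }
    }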
......
...@@ -99,26 +99,22 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
                 "Constant for numerical stability")
      .SetDefault(1.0e-8f);
  AddComment(R"DOC(
Adamax Optimizer.

We implement the Adamax optimizer from Section 7 of the Adam
paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
Adam algorithm based on the infinity norm.

Adamax updates:

$$momentOut = \beta_1 * moment + (1 - \beta_1) * grad \break
infNormOut = \max(\beta_2 * infNorm + \epsilon, |grad|) \break
learningRate = learningRate / (1 - \beta_{1,pow}) \break
paramOut = param - learningRate * momentOut / infNormOut$$

The original paper does not have an epsilon attribute.
However, it is added here for numerical stability to prevent the
division by 0 error.

)DOC");
}
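And the corresponding infinity-norm variant, with the same caveats as the sketches above:

    #include <algorithm>
    #include <cmath>
    #include <vector>
    // One Adamax step; inf_norm is the exponentially weighted infinity norm.
    void AdamaxStep(std::vector<float>& param, const std::vector<float>& grad,
                    std::vector<float>& moment, std::vector<float>& inf_norm,
                    float lr, float beta1_pow, float beta1 = 0.9f,
                    float beta2 = 0.999f, float epsilon = 1.0e-8f) {
      const float lr_t = lr / (1 - beta1_pow);
      for (size_t i = 0; i < param.size(); ++i) {
        moment[i] = beta1 * moment[i] + (1 - beta1) * grad[i];
        inf_norm[i] = std::max(beta2 * inf_norm[i] + epsilon, std::fabs(grad[i]));
        param[i] -= lr_t * moment[i] / inf_norm[i];
      }
    }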
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <numeric>
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/memory/memcpy.h"
namespace paddle {
namespace operators {
using LoD = framework::LoD;
class ArrayToLoDTensorOp : public framework::OperatorBase {
public:
ArrayToLoDTensorOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
auto &rank_table =
scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
auto *out =
scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
// Check dims, place and data type of input's elements and infer output's
// dim
PADDLE_ENFORCE(!x.empty(), "There's no element in the input array.");
int rank = x[0].dims().size();
platform::Place place = x[0].place();
std::type_index data_type = x[0].type();
framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank);
int64_t batch_size = x[0].dims()[0];
for (size_t i = 1; i < x.size(); ++i) {
PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims,
"The dimension of the %zu'th element in LoDTensorArray "
"differs from previous ones.",
i);
PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place),
"The place class of the %zu'th element in LoDTensorArray "
"differs from previous ones.",
i);
PADDLE_ENFORCE(x[i].type() == data_type,
"The date type of the %zu'th element in LoDTensorArray "
"differs from previous ones.",
i);
batch_size += x[i].dims()[0];
}
auto ins_dim_vec = framework::vectorize(ins_dims);
ins_dim_vec.insert(ins_dim_vec.begin(), batch_size);
framework::DDim out_dims = framework::make_ddim(ins_dim_vec);
out->Resize(out_dims);
out->mutable_data(place, data_type);
auto &table_items = rank_table.items();
std::vector<size_t> table_item_idx(table_items.size());
// table_item_idx = range(table_items.size())
std::iota(table_item_idx.begin(), table_item_idx.end(), 0);
std::sort(table_item_idx.begin(), table_item_idx.end(),
[&](size_t a, size_t b) {
return table_items[a].index < table_items[b].index;
});
// Build LoDTensor `out`
framework::LoD *out_lod = out->mutable_lod();
out_lod->clear();
size_t out_offset = 0;
auto prefix_lod = rank_table.coarse_lod();
prefix_lod.emplace_back();
auto &cur_level_lod = prefix_lod.back();
cur_level_lod.push_back(0);
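    // Walk the rank-table items in ascending original-sequence order; each
    // item owns `length` time steps, and for every step both the sub-LoD and
    // the data slice of that sequence are appended to the output tensor.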
for (size_t idx : table_item_idx) {
cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length);
for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) {
auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
x[x_idx].lod(), idx, idx + 1, 0);
auto &lod_length = lod_and_offset.first;
framework::AppendLoD(out_lod, lod_length);
size_t start_offset = lod_and_offset.second.first;
size_t end_offset = lod_and_offset.second.second;
VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " ["
<< ", " << end_offset << "]";
// Copy data
PADDLE_ENFORCE_GE(end_offset, start_offset);
size_t len = end_offset - start_offset;
if (len == 0) {
continue;
}
out->Slice(out_offset, out_offset + len)
.CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx);
out_offset += len;
}
}
out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end());
}
};
class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
ArrayToLoDTensorOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(std::vector<LodTensor>) A vector of tensors that is going to "
"be casted to a big LoDTensor.");
AddInput("RankTable",
"(LoDRankTable) RankTable provides the coarse lod infomation to "
"build the output LoDTensor. See "
"'paddle/framework/lod_rank_table.h' for more details.");
AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array.");
AddComment(
    R"DOC(This Op builds a big LoDTensor from a std::vector<LoDTensor>
and a LoDRankTable. It is supposed to be used in getting a dynamic RNN's
outputs back to a normal LoDTensor. The std::vector<LoDTensor>
would be the output of the RNN Op and the LoDRankTable would be built
with the RNN's input.)DOC");
}
};
class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("X"),
"ArrayToLoDTensorOp must has input X.");
PADDLE_ENFORCE(context->HasInput("RankTable"),
"ArrayToLoDTensorOp must has input RankTable.");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp,
ops::ArrayToLoDTensorOpProtoMaker,
ops::ArrayToLoDTensorInferShape);
...@@ -23,11 +23,11 @@ class AucOp : public framework::OperatorWithKernel {
protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Indices"),
                   "Input of Indices should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"),
                   "Input of Label should not be null.");
    auto inference_height = ctx->GetInputDim("Out")[0];
    auto label_height = ctx->GetInputDim("Label")[0];
...@@ -39,10 +39,11 @@ class AucOp : public framework::OperatorWithKernel {
  }

protected:
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
        ctx.device_context());
  }
};
...@@ -52,20 +53,20 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("Out",
           "A floating point 2D tensor, values are in the range [0, 1]."
           "Each row is sorted in descending order. This input should be the"
           "output of topk."
           "Typically, this tensor indicates the probability of each label");
  AddInput("Indices",
           "An int 2D tensor, indicating the indices of original"
           "tensor before sorting. Typically, this tensor indicates which "
           "label the probability stands for.");
  AddInput("Label",
           "A 2D int tensor indicating the label of the training data."
           "The height is batch size and width is always 1.");
  // TODO(typhoonzero): support weight input
  AddOutput("AUC",
            "A scalar representing the "
            "current area-under-the-curve.");

  AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
      .SetDefault("ROC");
...@@ -74,19 +75,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
                " roc curve.")
      .SetDefault(200);

  AddComment(R"DOC(
Area Under The Curve (AUC) Operator.

This implementation computes the AUC according to forward output and label.
It is used very widely in binary classification evaluation. As a note:
If input label contains values other than 0 and 1, it will be cast
to bool. You can find the relevant definitions here:
https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve

There are two types of possible curves:
1. ROC: Receiver operating characteristic
2. PR: Precision Recall
)DOC");
}
};
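For intuition, a minimal threshold-sweep ROC-AUC sketch over binary labels (framework-free, hypothetical helper; the operator's num_thresholds attr plays the same role here):

    #include <vector>
    // Approximates ROC AUC by sweeping num_thresholds cutoffs over scores in
    // [0, 1] and integrating TPR over FPR with the trapezoid rule.
    float RocAuc(const std::vector<float>& scores,
                 const std::vector<bool>& labels, int num_thresholds = 200) {
      std::vector<float> tpr(num_thresholds), fpr(num_thresholds);
      for (int t = 0; t < num_thresholds; ++t) {
        float cut = static_cast<float>(t) / (num_thresholds - 1);
        int tp = 0, fp = 0, pos = 0, neg = 0;
        for (size_t i = 0; i < scores.size(); ++i) {
          if (labels[i]) { ++pos; if (scores[i] >= cut) ++tp; }
          else           { ++neg; if (scores[i] >= cut) ++fp; }
        }
        tpr[t] = pos ? static_cast<float>(tp) / pos : 0.f;
        fpr[t] = neg ? static_cast<float>(fp) / neg : 0.f;
      }
      float auc = 0.f;  // thresholds ascend, so (fpr, tpr) walks toward (0, 0)
      for (int t = 1; t < num_thresholds; ++t)
        auc += (fpr[t - 1] - fpr[t]) * (tpr[t - 1] + tpr[t]) / 2;
      return auc;
    }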
......
...@@ -51,6 +51,10 @@ class BatchNormOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), "");
PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), "");

const float epsilon = ctx->Attrs().Get<float>("epsilon");
PADDLE_ENFORCE_GE(epsilon, 0.0, "epsilon should be non-negative");
PADDLE_ENFORCE_LE(epsilon, 0.001, "epsilon should not be too large");

// make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
                  "Mean and MeanOut should share the same memory");
...@@ -66,7 +70,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
                   : x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
               "Input X must have 3 to 5 dimensions.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
...@@ -93,16 +97,16 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "The input tensor");
AddInput("Scale",
         "Scale is a 1-dimensional tensor of size C "
         "that is applied to the output");
AddInput("Bias",
         "Bias is a 1-dimensional tensor of size C "
         "that is applied to the output");
AddInput("Mean",
         "The global mean (for training) or "
         "estimated mean (for testing)");
AddInput("Variance",
         "The global variance (for training) "
         "or estimated Variance (for testing)");
AddOutput("Y", "result after normalization");
AddOutput("MeanOut",
          "Share memory with Mean. "
...@@ -119,10 +123,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
          "will apply to output when training")
    .AsIntermediate();
AddComment(R"DOC(
Batch Normalization.

Batch Norm has been implemented as discussed in the paper:
https://arxiv.org/pdf/1502.03167.pdf
Can be used as a normalizer function for conv2d and fully_connected operations.
The required data format for this layer is one of the following:
1. NHWC `[batch, in_height, in_width, in_channels]`
2. NCHW `[batch, in_channels, in_height, in_width]`

)DOC");
}
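The layout choice only moves where the channel count C lives, as the `: x_dims[x_dims.size() - 1]` expression in the hunk above suggests. A hypothetical helper (not in this patch) that spells out the convention:

    #include <string>
    #include <vector>
    // Pick the channel count C according to the documented data format.
    int64_t ChannelCount(const std::vector<int64_t>& x_dims,
                         const std::string& layout) {
      // NCHW stores channels at index 1; NHWC stores channels last.
      return layout == "NCHW" ? x_dims[1] : x_dims.back();
    }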
...@@ -295,9 +303,9 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
  ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
}

protected:
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext &ctx) const override {
    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
    if (var == nullptr) {
      PADDLE_THROW("can't find Y@GRAD");
...@@ -311,7 +319,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
    if (t == nullptr) {
      PADDLE_THROW("can't find Y@GRAD");
    }
    return framework::OpKernelType(framework::ToDataType(t->type()),
                                   ctx.device_context());
  }
};
......
...@@ -23,13 +23,17 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
CastOpProtoMaker(framework::OpProto *proto,
                 framework::OpAttrChecker *op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "The input tensor of cast op");
  AddOutput("Out", "The output tensor of cast op");
  AddAttr<int>("out_data_type", "output data type");
  AddAttr<int>("in_data_type", "input data type");
  AddComment(R"DOC(
Cast Operator.

This Operator casts the input tensor to another data type and
returns the Output Tensor.

)DOC");
}
};
......
...@@ -49,8 +49,11 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>(
    "max", "(float)Maximum value, above which element is replaced by max");
AddComment(R"DOC(
Clip Operator.

The clip operator limits the value of given input within an interval. The interval is
specified with arguments 'min' and 'max'.

)DOC");
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/compare_op.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename OpComment>
class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
CompareOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
OpComment comment;
AddInput("X",
string::Sprintf("(LoDTensor) the left hand operand of %s operator",
comment.type));
AddInput("Y", string::Sprintf(
"(LoDTensor) the right hand operand of %s operator",
comment.type));
AddOutput("Out", string::Sprintf(
"(LoDTensor) n-dim bool tensor. Each element is %s",
comment.equation));
AddComment(string::Sprintf(R"DOC(%s Operator
It operates element-wise on X and Y, and returns Out. Each of them is an
N-dim tensor. X and Y could be any type. Each element of the Out tensor is
calculated by %s
)DOC",
comment.type, comment.equation));
}
};
template <typename OpComment>
class CompareOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
OpComment comment;
PADDLE_ENFORCE(context->HasInput("X"), "%s operator must has input X",
comment.type);
PADDLE_ENFORCE(context->HasInput("Y"), "%s operator must has input Y",
comment.type);
auto dim_x = context->GetInputDim("X");
auto dim_y = context->GetInputDim("Y");
PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y),
"The number of elements in X and Y should be same");
context->SetOutputDim("Out", context->GetInputDim("X"));
context->ShareLoD("X", "Out");
}
};
} // namespace operators
} // namespace paddle
#define REGISTER_LOGICAL_OP(op_type, _equation) \
struct _##op_type##Comment { \
static char type[]; \
static char equation[]; \
}; \
char _##op_type##Comment::type[]{#op_type}; \
char _##op_type##Comment::equation[]{_equation}; \
REGISTER_OP_WITH_KERNEL( \
op_type, ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \
::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
::paddle::framework::EmptyGradOpMaker);
REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
REGISTER_LOGICAL_OP(equal, "Out = X == Y");
REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/compare_op.h"
REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor);
REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <type_traits>
#include "paddle/framework/op_registry.h"
#include "paddle/platform/transform.h"
namespace paddle {
namespace operators {
template <typename T>
struct LessThanFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; }
};
template <typename T>
struct EqualFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T& a, const T& b) const {
if (std::is_floating_point<T>::value) {
// This branch is optimized away at compile time when T is an integer
// type. It is safe to cast a and b to double.
return fabs(static_cast<double>(a - b)) < 1e-8;
} else {
return (a == b);
}
}
};
template <typename Place, typename Functor>
class CompareOpKernel
: public framework::OpKernel<typename Functor::ELEM_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
using T = typename Functor::ELEM_TYPE;
auto* x = context.Input<framework::Tensor>("X");
auto* y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
Functor binary_func;
platform::Transform<Place> trans;
trans(context.device_context(), x->data<T>(), x->data<T>() + x->numel(),
y->data<T>(), out->mutable_data<bool>(context.GetPlace()),
binary_func);
}
};
} // namespace operators
} // namespace paddle
#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \
REGISTER_OP_##dev##_KERNEL( \
op_type, \
::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
functor<int>>, \
::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
functor<int64_t>>, \
::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
functor<float>>, \
::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
functor<double>>);
...@@ -56,20 +56,24 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
public:
  ConcatOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input tensors of concat operator.").AsDuplicable();
    AddOutput("Out", "Output tensor of concat operator.");
    AddAttr<int>("axis",
                 "The axis along which the input tensors will be concatenated.")
        .SetDefault(0);
    AddComment(R"DOC(
Concat Operator.

Concatenate the input tensors along dimension axis.
Examples:
  Input[0] = [[1,2],[3,4]]
  Input[1] = [[5,6]]
  axis = 0
  Output = [[1,2],
            [3,4],
            [5,6]]

)DOC");
  }
};
......
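
The axis attribute above implies a simple shape rule: every input must match on
all dimensions except axis, and the output extent along axis is the sum of the
input extents. A standalone sketch (illustrative only; ConcatShape is a
hypothetical helper, not part of the operator):

#include <cassert>
#include <cstdio>
#include <vector>

std::vector<int> ConcatShape(const std::vector<std::vector<int>>& ins,
                             size_t axis) {
  std::vector<int> out = ins[0];
  for (size_t i = 1; i < ins.size(); ++i) {
    assert(ins[i].size() == out.size());  // same rank required
    for (size_t d = 0; d < out.size(); ++d) {
      if (d == axis) {
        out[d] += ins[i][d];  // extents add up along the concat axis
      } else {
        assert(ins[i][d] == out[d]);  // all other dims must agree
      }
    }
  }
  return out;
}

int main() {
  // Matches the DOC example: a 2x2 and a 1x2 input, axis = 0 -> 3x2 output.
  std::vector<int> out = ConcatShape({{2, 2}, {1, 2}}, 0);
  std::printf("%d x %d\n", out[0], out[1]);  // prints "3 x 2"
  return 0;
}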
@@ -216,11 +216,12 @@ class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("IndexTensors", "Index Tensors contains indices for true/false");
 
     AddComment(R"DOC(
-Sample dependent Cond Operator:
-Given Cond[i] as a 1/0 vector to indicate true/false
-The equation is:
-Out[i] = subnet_t[i], if Cond[i] == true
-Out[i] = subnet_t[i], if Cond[i] == false
+Sample Dependent Conditional Operator.
+
+Given Cond[i] as a 1/0 vector to indicate true/false:
+Out[i] = subnet_true[i], if Cond[i] == true
+Out[i] = subnet_false[i], if Cond[i] == false
 )DOC");
   }
 };
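
The DOC equation selects, per index, between the outputs of the two branches.
A minimal sketch of just that selection (illustrative; the real CondOp gathers
the true and false rows into sub-scopes via the index tensors, runs each
subnet only on its subset, and scatters the results back):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> cond = {1, 0, 1};
  std::vector<float> subnet_true = {10.f, 20.f, 30.f};
  std::vector<float> subnet_false = {-1.f, -2.f, -3.f};
  std::vector<float> out(cond.size());
  for (size_t i = 0; i < cond.size(); ++i) {
    // Out[i] = subnet_true[i] if Cond[i] == true, else subnet_false[i].
    out[i] = cond[i] ? subnet_true[i] : subnet_false[i];
  }
  for (float v : out) std::printf("%g ", v);  // prints "10 -2 30"
  std::printf("\n");
  return 0;
}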
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv_transpose_op.h"
namespace paddle {
namespace operators {
class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
public:
CudnnConv2DTransposeOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: Conv2DTransposeOpMaker(proto, op_checker) {
AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
.SetDefault(std::vector<int>{1, 1});
AddAttr<int>("workspace_size_MB",
"workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be "
"allocated/freed each time the operator runs, larger "
"workspace size can increase performance but also requires "
"better hardward. This size should be carefully setted.")
.SetDefault(4096);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp,
ops::CudnnConv2DTransposeOpMaker, conv2d_transpose_cudnn_grad,
ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
conv2d_transpose_cudnn,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv2d_transpose_cudnn_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/memory/memory.h"
#include "paddle/operators/conv_transpose_op.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using DataLayout = platform::DataLayout;
static constexpr size_t kConvCudnnWorkspaceLimitBytes = 1024 * 1024 * 1024;
template <typename T>
class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
// cudnn v5 does not support dilations
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedFilterDescriptor filter_desc;
ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW;
// N, M, H, W
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
// N, C, O_h, O_w
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output->dims()));
// M, C, K_h, K_w
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()));
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);
// ------------------- cudnn conv workspace ---------------------
void* cudnn_workspace = nullptr;
size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kConvCudnnWorkspaceLimitBytes;
if (user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024;
}
// ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionBwdDataAlgo_t algo;
auto handle = ctx.cuda_device_context().cudnn_handle();
// Get the algorithm
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
// dxDesc: Handle to the previously initialized output tensor
// descriptor.
cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
// Get the amount of workspace the chosen algorithm needs.
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes));
// Allocate on GPU memory
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv transpose forward ---------------------
T alpha = 1.0f, beta = 0.0f;
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc,
input_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
template <typename T>
class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto input = ctx.Input<Tensor>("Input");
auto filter = ctx.Input<Tensor>("Filter");
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
const T* input_data = input->data<T>();
const T* output_grad_data = output_grad->data<T>();
const T* filter_data = filter->data<T>();
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
// cudnn v5 does not support dilations
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedFilterDescriptor filter_desc;
ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW;
// Input: (N, M, H, W)
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
// Output: (N, C, O_H, O_W)
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output_grad->dims()));
// Filter (M, C, K_H, K_W)
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()));
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);
// ------------------- cudnn backward algorithm ---------------------
cudnnConvolutionFwdAlgo_t data_algo;
cudnnConvolutionBwdFilterAlgo_t filter_algo;
size_t bwd_filter_ws_size, fwd_ws_size;
size_t workspace_size_in_bytes = 0;
size_t workspace_size_limit = kConvCudnnWorkspaceLimitBytes;
if (user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024;
}
auto handle = ctx.cuda_device_context().cudnn_handle();
if (input_grad) {
// choose backward algorithm for data
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &data_algo));
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_input_desc, data_algo, &fwd_ws_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
}
if (filter_grad) {
// choose backward algorithm for filter
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
cudnn_filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &filter_algo));
// get workspace for backwards filter algorithm
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
workspace_size_in_bytes =
std::max(workspace_size_in_bytes, bwd_filter_ws_size);
}
// ------------------- cudnn conv workspace ---------------------
// Already on GPU
void* cudnn_workspace = nullptr;
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as cudnn call.
T alpha = 1.0f, beta = 0.0f;
if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*input_grad);
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_output_desc, output_grad_data,
cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
input_grad_data));
}
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*filter_grad);
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
// Gradient with respect to the filter
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
input_data, cudnn_conv_desc, filter_algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data));
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn,
ops::CudnnConvTransposeOpKernel<float>);
REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad,
ops::CudnnConvTransposeGradOpKernel<float>);
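
The forward kernel above hands the transposed convolution to
cudnnConvolutionBackwardData because the two computations coincide: a
transposed convolution scatters every input element through the filter, which
is exactly how data gradients are back-propagated through a strided
convolution. A naive 1-D, single-channel, zero-padding CPU sketch
(illustrative only; ConvTranspose1D is a hypothetical helper):

#include <cstdio>
#include <vector>

std::vector<float> ConvTranspose1D(const std::vector<float>& in,
                                   const std::vector<float>& filter,
                                   int stride) {
  // O = (I - 1) * stride + K, the same rule ConvTransposeOp::InferShape uses.
  int out_size = (static_cast<int>(in.size()) - 1) * stride +
                 static_cast<int>(filter.size());
  std::vector<float> out(out_size, 0.f);
  for (size_t i = 0; i < in.size(); ++i) {
    for (size_t k = 0; k < filter.size(); ++k) {
      out[i * stride + k] += in[i] * filter[k];  // scatter-add through filter
    }
  }
  return out;
}

int main() {
  std::vector<float> out = ConvTranspose1D({1.f, 2.f}, {1.f, 1.f, 1.f}, 2);
  for (float v : out) std::printf("%g ", v);  // prints "1 1 3 2 2"
  std::printf("\n");
  return 0;
}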
@@ -12,7 +12,7 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"
 
 namespace paddle {
 namespace operators {
@@ -29,7 +29,7 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
                  "workspace is a section of GPU memory which will be "
                  "allocated/freed each time the operator runs, larger "
                  "workspace size can increase performance but also requires "
-                 "better hardward. This size should be carefully setted.")
+                 "better hardware. This size should be chosen carefully.")
         .SetDefault(4096);
   }
 };
@@ -38,10 +38,11 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv_cudnn, ops::Conv2DOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
-            ops::Conv2DOpGrad);
+REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
+            ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(conv_cudnn,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv_cudnn, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
     conv_cudnn_grad,
-    ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
@@ -15,7 +15,7 @@
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memory.h"
-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"
 #include "paddle/platform/assert.h"
 #include "paddle/platform/cudnn_helper.h"
 
@@ -27,7 +27,6 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
-using CUDADeviceContext = platform::CUDADeviceContext;
 
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024;
@@ -12,18 +12,18 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"
 
 namespace paddle {
 namespace operators {
 
-void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
+void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of Conv2DOp should not be null.");
+                 "Input(Input) of ConvOp should not be null.");
   PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                 "Input(Filter) of Conv2DOp should not be null.");
+                 "Input(Filter) of ConvOp should not be null.");
   PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                 "Output(Output) of Conv2DOp should not be null.");
+                 "Output(Output) of ConvOp should not be null.");
 
   auto in_dims = ctx->GetInputDim("Input");
   auto filter_dims = ctx->GetInputDim("Filter");
@@ -33,8 +33,17 @@ void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
   int input_channels = in_dims[1];
   int output_channels = filter_dims[0];
 
-  PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D.");
-  PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D.");
+  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+                 "Conv input should be 4-D or 5-D tensor.");
+  PADDLE_ENFORCE_EQ(
+      in_dims.size(), filter_dims.size(),
+      "Conv input dimension and filter dimension should be the same.");
+  PADDLE_ENFORCE(
+      in_dims.size() - strides.size() == 2U,
+      "Conv input dimension and strides dimension should be consistent.");
+  PADDLE_ENFORCE_EQ(
+      paddings.size(), strides.size(),
+      "Conv paddings dimension and Conv strides dimension should be the same.");
   PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
                     "The number of input channels should be equal to filter "
                     "channels * groups.");
@@ -42,12 +51,12 @@ void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
       output_channels % groups, 0,
       "The number of output channels should be divided by groups.");
 
-  auto output_height =
-      OutputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
-  auto output_width =
-      OutputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
-  ctx->SetOutputDim("Output",
-                    {in_dims[0], filter_dims[0], output_height, output_width});
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                      paddings[i], strides[i]));
+  }
+  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }
 
 Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
@@ -55,18 +64,19 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
     : OpProtoAndCheckerMaker(proto, op_checker) {
   AddInput(
       "Input",
-      "The input tensor of convolution operator. "
-      "The format of input tensor is NCHW. Where N is batch size, C is the "
-      "number of channels, H and W is the height and width of image.");
+      "(Tensor) The input tensor of convolution operator. "
+      "The format of input tensor is NCHW, where N is batch size, C is the "
+      "number of channels, H is the height of the feature, "
+      "and W is the width of the feature.");
   AddInput("Filter",
-           "The filter tensor of convolution operator."
+           "(Tensor) The filter tensor of convolution operator. "
            "The format of the filter tensor is MCHW, where M is the number of "
            "output image channels, C is the number of input image channels, "
-           "H and W is height and width of filter. "
-           "If the groups attribute is greater than 1, C equal the number of "
+           "H is the height of the filter, and W is the width of the filter. "
+           "If the groups attribute is greater than 1, C equals the number of "
            "input image channels divided by the groups.");
   AddOutput("Output",
-            "The output tensor of convolution operator."
+            "(Tensor) The output tensor of convolution operator. "
             "The format of output tensor is also NCHW.");
   AddAttr<std::vector<int>>("strides", "strides of convolution operator.")
       .SetDefault({1, 1});
@@ -74,20 +84,100 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
       .SetDefault({0, 0});
   AddAttr<int>(
       "groups",
-      "group size of convolution operator. "
-      "Refer to grouped convolution in Alex Krizhevsky's paper: "
-      "when group=2, the first half of the filters are only connected to the "
-      "first half of the input channels, and the second half only connected "
-      "to the second half.")
+      "(int default:1), the group size of convolution operator. "
+      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
+      "when group=2, the first half of the filters is only connected to the "
+      "first half of the input channels, while the second half of the filters "
+      "is only connected to the second half of the input channels.")
+      .SetDefault(1);
+  AddComment(R"DOC(
+Convolution Operator.
+
+The convolution operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input, Filter) and output(Output) are in NCHW format, where N is batch
+size, C is the number of channels, H is the height of the feature, and W is
+the width of the feature. Parameters (ksize, strides, paddings) consist of
+two elements. These two elements represent height and width, respectively.
+The input(X) size and output(Out) size may be different.
+Example:
+  Input:
+       Input shape: (N, C_in, H_in, W_in)
+       Filter shape: (C_out, C_in, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, H_out, W_out)
+  where
+       H_out = (H_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
+       W_out = (W_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
+)DOC");
+}
+
+Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
+                             framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution operator. "
+      "The format of input tensor is NCDHW, where N is batch size, C is the "
+      "number of channels, D is the depth of the feature, H is the height of "
+      "the feature, and W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution operator. "
+           "The format of the filter tensor is MCDHW, where M is the number of "
+           "output image channels, C is the number of input image channels, "
+           "D is the depth of the filter, H is the height of the filter, and W "
+           "is the width of the filter. "
+           "If the groups attribute is greater than 1, C equals the number of "
+           "input image channels divided by the groups.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution operator. "
+            "The format of output tensor is also NCDHW.");
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector, default:{1, 1, 1}), the strides of convolution operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector, default:{0, 0, 0}), the paddings of convolution operator.")
+      .SetDefault({0, 0, 0});
+  AddAttr<int>(
+      "groups",
+      "(int default:1), the group size of convolution operator. "
+      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
+      "when group=2, the first half of the filters is only connected to the "
+      "first half of the input channels, while the second half of the filters "
+      "is only connected to the second half of the input channels.")
       .SetDefault(1);
   AddComment(R"DOC(
+Convolution3D Operator.
+
 The convolution operation calculates the output based on the input, filter
 and strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
+Input(Input, Filter) and output(Output) are in NCDHW format, where N is batch
+size, C is the number of channels, D is the depth of the feature, H is the
+height of the feature, and W is the width of the feature.
+Parameters (ksize, strides, paddings) consist of three elements.
+These three elements represent depth, height and width, respectively.
+The input(X) size and output(Out) size may be different.
+Example:
+  Input:
+       Input shape: (N, C_in, D_in, H_in, W_in)
+       Filter shape: (C_out, C_in, D_f, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, D_out, H_out, W_out)
+  where
+       D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
+       H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
+       W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1;
 )DOC");
 }
 
-void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
+void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
   auto in_dims = ctx->GetInputDim("Input");
   auto filter_dims = ctx->GetInputDim("Filter");
   if (ctx->HasOutput(framework::GradVarName("Input"))) {
@@ -102,10 +192,18 @@ void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv2d, ops::Conv2DOp, ops::Conv2DOpMaker, conv2d_grad,
-            ops::Conv2DOpGrad);
+REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
+            ops::ConvOpGrad);
+
+namespace ops = paddle::operators;
+REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
+            ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(conv2d,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(conv3d,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
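
A standalone sketch of the OutputSize rule that ConvOp::InferShape applies per
spatial dimension, with one worked example (assuming the usual definition that
matches the DOC formula; illustrative only):

#include <cstdio>

int OutputSize(int input_size, int filter_size, int padding, int stride) {
  // H_out = (H_in - filter_size + 2 * padding) / stride + 1
  return (input_size - filter_size + 2 * padding) / stride + 1;
}

int main() {
  // A 224x224 feature map with a 7x7 filter, padding 3, stride 2 -> 112x112.
  std::printf("%d\n", OutputSize(224, 7, 3, 2));  // prints 112
  return 0;
}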
@@ -12,11 +12,16 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"
 
 namespace ops = paddle::operators;
 
+REGISTER_OP_GPU_KERNEL(conv2d,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    conv2d, ops::GemmConv2DKernel<paddle::platform::GPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(conv3d,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::GPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/im2col.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/vol2col.h"
 
 namespace paddle {
 namespace operators {
@@ -40,14 +41,20 @@ class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
               framework::OpAttrChecker* op_checker);
 };
 
-class Conv2DOp : public framework::OperatorWithKernel {
+class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv3DOpMaker(framework::OpProto* proto,
+                framework::OpAttrChecker* op_checker);
+};
+
+class ConvOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override;
 };
 
-class Conv2DOpGrad : public framework::OperatorWithKernel {
+class ConvOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -55,7 +62,7 @@ class Conv2DOpGrad : public framework::OperatorWithKernel {
 };
 
 template <typename Place, typename T>
-class GemmConv2DKernel : public framework::OpKernel<T> {
+class GemmConvKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* input = context.Input<Tensor>("Input");
@@ -70,51 +77,78 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     int groups = context.Attr<int>("groups");
 
-    int batch_size = input->dims()[0];
-    int input_channels = input->dims()[1];
-    int filter_height = filter.dims()[filter.dims().size() - 2];
-    int filter_width = filter.dims()[filter.dims().size() - 1];
-    int output_channels = output->dims()[1];
-    int output_height = output->dims()[2];
-    int output_width = output->dims()[3];
+    const int batch_size = static_cast<int>(input->dims()[0]);
 
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    filter_shape_vec.erase(filter_shape_vec.begin(),
+                           filter_shape_vec.begin() + 2);
+
+    // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+    output_shape_vec.erase(output_shape_vec.begin(),
+                           output_shape_vec.begin() + 2);
+
     // use col_shape in the im2col calculation
-    framework::DDim col_shape = {input_channels / groups, filter_height,
-                                 filter_width, output_height, output_width};
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+    // o_h, o_w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(input->dims()[1] / groups);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
+                         output_shape_vec.end());
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
     // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
-        input_channels / groups * filter_height * filter_width,
-        output_height * output_width};
+    // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d *
+    // o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
+
     Tensor col;
     col.mutable_data<T>(col_shape, context.GetPlace());
     // col_matrix shares the same piece of data with col,
     // but will be reshaped into a two-dimensional matrix shape
     // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
+    Tensor col_matrix;
+    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
 
-    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
-                                   input->dims()[3]};
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));
+
     framework::DDim filter_matrix_shape = {filter.dims()[0],
                                            filter.numel() / filter.dims()[0]};
     filter.Resize(filter_matrix_shape);
 
-    framework::DDim output_matrix_shape = {output_channels,
-                                           output_height * output_width};
-    // convolution operator: im2col + gemm
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
+    framework::DDim output_matrix_shape = {
+        output->dims()[1],
+        output->numel() / (output->dims()[0] * output->dims()[1])};
+
+    // convolution operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output->dims()[1]) / groups;
+
     for (int i = 0; i < batch_size; i++) {
       Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
       Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+
       for (int g = 0; g < groups; g++) {
-        // im2col
         Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-        im2col(context.device_context(), in_slice, col, strides[0], strides[1],
-               paddings[0], paddings[0], paddings[1], paddings[1]);
+
+        if (filter_shape_vec.size() == 2) {
+          // im2col
+          math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+          im2col(context.device_context(), in_slice, col, strides[0],
+                 strides[1], paddings[0], paddings[0], paddings[1],
+                 paddings[1]);
+        } else if (filter_shape_vec.size() == 3) {
+          // vol2col
+          math::Vol2ColFunctor<Place, T> vol2col;
+          vol2col(context.device_context(), in_slice, col, strides[0],
+                  strides[1], strides[2], paddings[0], paddings[1],
+                  paddings[2]);
+        }
 
         // gemm
         Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
@@ -127,7 +161,7 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
 };
 
 template <typename Place, typename T>
-class GemmConvGrad2DKernel : public framework::OpKernel<T> {
+class GemmConvGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* input = context.Input<Tensor>("Input");
@@ -137,64 +171,79 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
         context.Output<Tensor>(framework::GradVarName("Input"));
     Tensor* filter_grad =
         context.Output<Tensor>(framework::GradVarName("Filter"));
     // The filter and filter_grad will be reshaped in the calculations,
     // so here use an assignment operation,
     // that avoids modifying the variable in the Scope.
     Tensor filter = *context.Input<Tensor>("Filter");
 
+    if (!input_grad && !filter_grad) return;
+
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     int groups = context.Attr<int>("groups");
 
-    int batch_size = input->dims()[0];
-    int input_channels = input->dims()[1];
-    int filter_height = filter.dims()[filter.dims().size() - 2];
-    int filter_width = filter.dims()[filter.dims().size() - 1];
-    int output_channels = output_grad->dims()[1];
-    int output_height = output_grad->dims()[2];
-    int output_width = output_grad->dims()[3];
+    const int batch_size = static_cast<int>(input->dims()[0]);
 
-    paddle::operators::math::Col2ImFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        col2im;
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
-    // use col_shape in the im2col and col2im calculation
-    framework::DDim col_shape = {input_channels / groups, filter_height,
-                                 filter_width, output_height, output_width};
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    filter_shape_vec.erase(filter_shape_vec.begin(),
+                           filter_shape_vec.begin() + 2);
+
+    // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(
+        framework::vectorize(output_grad->dims()));
+    output_shape_vec.erase(output_shape_vec.begin(),
+                           output_shape_vec.begin() + 2);
+
+    // use col_shape in the im2col calculation
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+    // o_h, o_w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(input->dims()[1] / groups);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
+                         output_shape_vec.end());
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
     // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
-        input_channels / groups * filter_height * filter_width,
-        output_height * output_width};
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
-    col_matrix.Resize(col_matrix_shape);
+    // size: (i_c/g * k_h * k_w, o_h * o_w)
+    // or
+    // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
 
-    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
-                                   input->dims()[3]};
-    framework::DDim output_matrix_shape = {
-        output_grad->dims()[1],
-        output_grad->dims()[2] * output_grad->dims()[3]};
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));
 
     framework::DDim filter_matrix_shape = {filter.dims()[0],
                                            filter.numel() / filter.dims()[0]};
     filter.Resize(filter_matrix_shape);
 
-    // convolution backward input operator:  gemm + col2im
-    // convolution backward weight operator: im2col + gemm
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
+    framework::DDim output_matrix_shape = {
+        output_grad->dims()[1],
+        output_grad->numel() /
+            (output_grad->dims()[0] * output_grad->dims()[1])};
+
+    // convolution backward input operator:  gemm + col2im(or col2vol)
+    // convolution backward weight operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output_grad->dims()[1]) / groups;
+
+    Tensor col;
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    col.mutable_data<T>(col_shape, context.GetPlace());
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
 
+    math::SetConstant<Place, T> set_zero;
+
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      set_zero(context.device_context(), input_grad, static_cast<T>(0));
 
       for (int i = 0; i < batch_size; i++) {
         Tensor out_grad_batch =
@@ -208,13 +257,22 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
           math::matmul<Place, T>(context.device_context(), filter_slice, true,
                                  out_grad_slice, false, T(1.0), &col_matrix,
                                  T(0.0));
+
           // col2im
           Tensor in_grad_slice =
               in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
-          col2im(context.device_context(), in_grad_slice, col, strides[0],
-                 strides[1], paddings[0], paddings[0], paddings[1],
-                 paddings[1]);
+
+          if (filter_shape_vec.size() == 2) {
+            math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
+            col2im(context.device_context(), in_grad_slice, col, strides[0],
+                   strides[1], paddings[0], paddings[0], paddings[1],
+                   paddings[1]);
+          } else if (filter_shape_vec.size() == 3) {
+            math::Col2VolFunctor<Place, T> col2vol;
+            col2vol(context.device_context(), in_grad_slice, col, strides[0],
+                    strides[1], strides[2], paddings[0], paddings[1],
+                    paddings[2]);
+          }
         }
       }
     }
@@ -223,8 +281,7 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
       filter_grad->mutable_data<T>(context.GetPlace());
       Tensor filter_grad_ = *filter_grad;
       filter_grad_.Resize(filter_matrix_shape);
-      auto t = framework::EigenVector<T>::Flatten(filter_grad_);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      set_zero(context.device_context(), filter_grad, static_cast<T>(0));
 
       for (int i = 0; i < batch_size; i++) {
         Tensor out_grad_batch =
@@ -235,9 +292,18 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
           Tensor out_grad_slice =
              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
           Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-          im2col(context.device_context(), in_slice, col, strides[0],
-                 strides[1], paddings[0], paddings[0], paddings[1],
-                 paddings[1]);
+
+          if (filter_shape_vec.size() == 2) {
+            math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+            im2col(context.device_context(), in_slice, col, strides[0],
+                   strides[1], paddings[0], paddings[0], paddings[1],
+                   paddings[1]);
+          } else if (filter_shape_vec.size() == 3) {
+            math::Vol2ColFunctor<Place, T> vol2col;
+            vol2col(context.device_context(), in_slice, col, strides[0],
+                    strides[1], strides[2], paddings[0], paddings[1],
+                    paddings[2]);
+          }
 
           // gemm
           Tensor filter_grad_slice =
@@ -250,6 +316,5 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
       }
     }
   }
 };
-
 }  // namespace operators
 }  // namespace paddle
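
Both kernels above stand on the same trick: im2col (vol2col in the 3-D case)
unrolls each filter-sized window of the input into one column, so the whole
convolution collapses into a single GEMM between the flattened filter and the
col matrix of shape (i_c/g * k_h * k_w, o_h * o_w). A toy single-channel,
stride-1, no-padding sketch (illustrative only; the layout loosely follows the
kCFO format named above):

#include <cstdio>
#include <vector>

std::vector<float> Im2Col(const std::vector<float>& im, int h, int w, int k_h,
                          int k_w) {
  int o_h = h - k_h + 1, o_w = w - k_w + 1;
  std::vector<float> col(k_h * k_w * o_h * o_w);
  for (int kh = 0; kh < k_h; ++kh)
    for (int kw = 0; kw < k_w; ++kw)
      for (int oh = 0; oh < o_h; ++oh)
        for (int ow = 0; ow < o_w; ++ow)
          // Row (kh, kw) holds the (kh, kw)-th tap of every output window.
          col[((kh * k_w + kw) * o_h + oh) * o_w + ow] =
              im[(oh + kh) * w + (ow + kw)];
  return col;
}

int main() {
  // 3x3 image, 2x2 window -> col matrix is (2*2) x (2*2).
  std::vector<float> im = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  std::vector<float> col = Im2Col(im, 3, 3, 2, 2);
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) std::printf("%g ", col[r * 4 + c]);
    std::printf("\n");  // rows: 1 2 4 5 / 2 3 5 6 / 4 5 7 8 / 5 6 8 9
  }
  return 0;
}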
@@ -96,14 +96,13 @@ as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401
 
 The equation is:
 
-\f[
-    Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}
-\f]
+$$Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}$$
 
-where X's index is computed modulo M, and b's index is computed modulo N.
+where X's index is computed modulo M, and Y's index is computed modulo N.
 
-Both of the input `X` and `Y` can carry LoD (Level of Details) information.
-However, the output only shares the LoD information with input `X`.
+Both inputs X and Y can carry LoD (Level of Details) information.
+However, the output only shares the LoD information with input X.
 
 )DOC");
   }
 };
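
A reference loop for the circular correlation in the DOC formula (illustrative
only; this sketch puts Y's center tap at y[(n - 1) / 2], one common reading of
the modulo-N indexing, and ConvShift is a hypothetical helper):

#include <cstdio>
#include <vector>

std::vector<float> ConvShift(const std::vector<float>& x,
                             const std::vector<float>& y) {
  int m = static_cast<int>(x.size());
  int n = static_cast<int>(y.size());  // n is expected to be odd
  std::vector<float> out(m, 0.f);
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      // X's index wraps modulo M, as the DOC states.
      int xi = ((i + j - (n - 1) / 2) % m + m) % m;
      out[i] += x[xi] * y[j];
    }
  }
  return out;
}

int main() {
  // The identity kernel {0, 1, 0} returns x unchanged: "1 2 3 4".
  std::vector<float> out = ConvShift({1.f, 2.f, 3.f, 4.f}, {0.f, 1.f, 0.f});
  for (float v : out) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}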
@@ -130,9 +130,7 @@ class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
     dim3 grid_dim(num_x_blocks, batch_size);
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext &>(
-                      context.device_context())
-                      .stream();
+    auto stream = context.cuda_device_context().stream();
 
     conv_shift_forward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>(
         x_data, y_data, out_data, x_width, y_width, y_half_width, batch_size);
@@ -159,9 +157,7 @@ class ConvShiftGradKernel<platform::GPUPlace, T>
     int y_width = Y->dims()[1];
     int y_half_width = (y_width - 1) / 2;
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext &>(
-                      context.device_context())
-                      .stream();
+    auto stream = context.cuda_device_context().stream();
 
     const int x_per_block = 256;
     int num_x_blocks = div_up(x_width, x_per_block);
@@ -12,18 +12,18 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv2dtranspose_op.h"
+#include "paddle/operators/conv_transpose_op.h"
 
 namespace paddle {
 namespace operators {
 
-void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
+void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of Conv2DTransposeOp should not be null.");
+                 "Input(Input) of ConvTransposeOp should not be null.");
   PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                 "Input(Filter) of Conv2DTransposeOp should not be null.");
+                 "Input(Filter) of ConvTransposeOp should not be null.");
   PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                 "Output(Output) of Conv2DTransposeOp should not be null.");
+                 "Output(Output) of ConvTransposeOp should not be null.");
 
   auto in_dims = ctx->GetInputDim("Input");
   auto filter_dims = ctx->GetInputDim("Filter");
@@ -35,17 +35,27 @@ void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
                       "No Padding allowed in conv transpose op.");
   }
 
-  PADDLE_ENFORCE_EQ(in_dims.size(), 4,
-                    "Conv2DTransposeOp input should be 4-D tensor.");
-  PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
-                    "Conv2DTransposeOp filter should be 4-D tensor.");
+  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+                 "ConvTransposeOp input should be 4-D or 5-D tensor.");
+  PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
+                    "ConvTransposeOp input dimension and filter dimension "
+                    "should be the same.");
+  PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U,
+                 "ConvTransposeOp input dimension and strides dimension should "
+                 "be consistent.");
+  PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
+                    "ConvTransposeOp paddings dimension and Conv strides "
+                    "dimension should be the same.");
   PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
-                    "input and kernel input dimension should be equal.");
+                    "In ConvTransposeOp, the input channel should be the same "
+                    "as the number of filters.");
 
-  auto output_height = (in_dims[2] - 1) * strides[0] + filter_dims[2];
-  auto output_width = (in_dims[3] - 1) * strides[1] + filter_dims[3];
-  ctx->SetOutputDim("Output",
-                    {in_dims[0], filter_dims[1], output_height, output_width});
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] +
+                           filter_dims[i + 2]);
+  }
+  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }
 
 Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
@@ -55,32 +65,108 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
       "Input",
       "(Tensor) The input tensor of convolution transpose operator. "
       "The format of input tensor is NCHW, where N is batch size, C is the "
-      "number of input channels, H and W is the height and width of image.");
+      "number of input channels, H is the height of the feature, and "
+      "W is the width of the feature.");
   AddInput("Filter",
-           "(Tensor) The filter tensor of convolution transpose operator."
+           "(Tensor) The filter tensor of convolution transpose operator. "
            "The format of the filter tensor is CMHW, where C is the number of "
            "output image channels, M is the number of input image channels, "
-           "H and W is height and width of filter. "
+           "H is the height of the filter, and W is the width of the filter. "
           "We enforce groups number == 1 and padding == 0 in "
-           "convolution transpose Scenario.");
+           "the convolution transpose scenario.");
   AddOutput("Output",
-            "(Tensor) The output tensor of convolution transpose operator."
+            "(Tensor) The output tensor of convolution transpose operator. "
            "The format of output tensor is also NCHW.");
-  AddAttr<std::vector<int>>("strides",
-                            "strides of convolution transpose operator.")
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector default:{1, 1}), strides of convolution transpose operator.")
       .SetDefault({1, 1});
-  AddAttr<std::vector<int>>("paddings",
-                            "paddings of convolution transpose operator.")
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector default:{0, 0}), paddings of convolution transpose operator.")
      .SetDefault({0, 0});
   AddComment(R"DOC(
+Convolution2D Transpose Operator.
+
 The convolution transpose operation calculates the output based on the input, filter
 and strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
+Input(Input, Filter) and output(Output) are in NCHW format, where N is batch
+size, C is the number of channels, H is the height of the feature, and
+W is the width of the feature. Parameters (ksize, strides, paddings) consist of
+two elements. These two elements represent height and width, respectively.
+The input(X) size and output(Out) size may be different.
+Example:
+  Input:
+       Input shape: (N, C_in, H_in, W_in)
+       Filter shape: (C_in, C_out, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, H_out, W_out)
+  where
+       H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
+       W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
 )DOC");
 }
 
-void Conv2DTransposeOpGrad::InferShape(
-    framework::InferShapeContext* ctx) const {
+Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
+    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput("Input",
+           "(Tensor) The input tensor of convolution transpose operator. "
+           "The format of input tensor is NCDHW, where N is batch size, C is "
+           "the number of channels, D is the depth of the feature, H is the "
+           "height of the feature, and W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution transpose operator. "
+           "The format of the filter tensor is CMDHW, where C is the number of "
+           "output image channels, M is the number of input image channels, D "
+           "is the depth of the filter, H is the height of the filter, and "
+           "W is the width of the filter. "
+           "We enforce groups number == 1 and padding == 0 in "
+           "the convolution3d transpose scenario.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution transpose operator. "
+            "The format of output tensor is also NCDHW, where N is batch size, "
+            "C is the number of channels, D is the depth of the feature, H is "
+            "the height of the feature, and W is the width of the feature.");
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector default:{1, 1, 1}), strides of convolution transpose operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector default:{0, 0, 0}), paddings of convolution transpose operator.")
+      .SetDefault({0, 0, 0});
+  AddComment(R"DOC(
+Convolution3D Transpose Operator.
+
+The convolution transpose operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input, Filter) and output(Output) are in NCDHW format, where N is batch
+size, C is the number of channels, D is the depth of the feature,
+H is the height of the feature, and W is the width of the feature.
+Parameters (ksize, strides, paddings) consist of three elements.
+These three elements represent depth, height and width, respectively.
+The input(X) size and output(Out) size may be different.
+Example:
+  Input:
+       Input shape: (N, C_in, D_in, H_in, W_in)
+       Filter shape: (C_in, C_out, D_f, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, D_out, H_out, W_out)
+  where
+       D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
+       H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
+       W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2];
+)DOC");
+}
+
+void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const {
   auto in_dims = ctx->GetInputDim("Input");
   auto filter_dims = ctx->GetInputDim("Filter");
   if (ctx->HasOutput(framework::GradVarName("Input"))) {
@@ -95,13 +181,23 @@ void Conv2DTransposeOpGrad::InferShape(
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv2dtranspose, ops::Conv2DTransposeOp,
-            ops::Conv2DTransposeOpMaker, conv2dtranspose_grad,
-            ops::Conv2DTransposeOpGrad);
+
+REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
+            conv2d_transpose_grad, ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv2d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+
+REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
+            conv3d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
-    conv2dtranspose,
-    ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
+    conv3d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv2dtranspose_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
+    conv3d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
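
The output-size rule ConvTransposeOp::InferShape applies per spatial dimension
(padding is enforced to be zero above, so the -2 * paddings term in the DOC
formulas drops out). A standalone sketch with a worked example (illustrative
only; TransposeOutputSize is a hypothetical helper):

#include <cstdio>

int TransposeOutputSize(int input_size, int filter_size, int stride) {
  // H_out = (H_in - 1) * stride + filter_size, with padding == 0.
  return (input_size - 1) * stride + filter_size;
}

int main() {
  // A 2-element input, stride 2, filter width 3 -> 5 outputs, matching the
  // 1-D scatter sketch shown earlier for the cudnn transpose kernel.
  std::printf("%d\n", TransposeOutputSize(2, 3, 2));  // prints 5
  return 0;
}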
@@ -12,13 +12,20 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv2dtranspose_op.h"
+#include "paddle/operators/conv_transpose_op.h"
 
 namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
-    conv2dtranspose,
-    ops::GemmConv2DTransposeKernel<paddle::platform::GPUPlace, float>);
+    conv2d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    conv2dtranspose_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::GPUPlace, float>);
+    conv2d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
+
+REGISTER_OP_GPU_KERNEL(
+    conv3d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    conv3d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);