diff --git a/CMakeLists.txt b/CMakeLists.txt
index 264420ad830ed39b38f1918951d8d66c84fd5ee9..fd3582a1bca199d62d19550ffdd1efe9db520fa7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,7 +126,7 @@ include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
-include(external/pybind11)    # download pybind11
+include(external/pybind11)  # download pybind11
 include(external/nccl)
 
 include(cudnn)              # set cudnn libraries, must before configure
diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake
index 0b38943952f7fb9052368fe95eb31dd7592d8a47..310450f7d009dc0cdae9c0079a96445af8ec8f95 100644
--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
@@ -79,9 +79,8 @@ if(NOT DEFINED IOS_ARCH)
     # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
     set(IOS_ARCH "arm64")
   elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
-    set(IOS_ARCH "i386;x86_64")
-  elseif(IOS_PLATFORM STREQUAL "WATCHOS")
-    set(IOS_ARCH armv7k)
+    # FIXME(liuyiqun): support "i386;x86_64" future
+    set(IOS_ARCH "x86_64")
   endif()
 endif()
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string  "Build architecture for iOS")
diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake
index 57d2c0a352507afd01d1cbf2c7b23c00ff7ad81b..fc43766efafc3d3e16f2906ce7f9a3d692c8e4ff 100644
--- a/cmake/external/nccl.cmake
+++ b/cmake/external/nccl.cmake
@@ -1,3 +1,21 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT WITH_GPU)
+  return()
+endif()
+
 include(ExternalProject)
 
 set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 143b57a954e4e6b2bf273535ebdf0fa8e3dab768..3f86e456cfbe55fe47e5b18e755e34829ebe9930 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
index 9391c285c7544669a5b1a078b7473d7a656c1bb4..4e87dc49d8956d1fa6dec777efc5a63c6b0f79a5 100644
--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
@@ -1,8 +1,26 @@
-INCLUDE(ExternalProject)
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
-SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+if(NOT WITH_PYTHON)
+    return()
+endif()
+
+include(ExternalProject)
 
-INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
+set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+
+include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
 
 ExternalProject_Add(
         extern_pybind
@@ -17,14 +35,12 @@ ExternalProject_Add(
         TEST_COMMAND      ""
 )
 
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
     set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+    file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
     add_library(pybind STATIC ${dummyfile})
 else()
     add_library(pybind INTERFACE)
 endif()
 
 add_dependencies(pybind extern_pybind)
-
-LIST(APPEND external_project_dependencies pybind)
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
index ce088ae7eaa3355f2f9761e8c421da0d7ef89fa7..9db457c7b2d61228e5d5af6827c4cda11a20a463 100644
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index e2c9fe56f335ae5b627b4d8d4bb17e4a2a466677..a98e069b7cd1654ddd5868560d0905eab6d9c692 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 46035a908b588861607a25d3a21cf34b7b6fd4b8..53c2de332ea74b06d1bd6e5bb119cad6af27ed01 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -1,27 +1,28 @@
 # This file is use to check all support level of AVX on your machine
 # so that PaddlePaddle can unleash the vectorization power of muticore.
 
-INCLUDE(CheckCXXSourceRuns)
-INCLUDE(CheckCXXSourceCompiles)
+include(CheckCXXSourceRuns)
+include(CheckCXXSourceCompiles)
 
-IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     set(MMX_FLAG "-mmmx")
     set(SSE2_FLAG "-msse2")
     set(SSE3_FLAG "-msse3")
-    SET(AVX_FLAG "-mavx")
-    SET(AVX2_FLAG "-mavx2")
-ELSEIF(MSVC)
+    set(AVX_FLAG "-mavx")
+    set(AVX2_FLAG "-mavx2")
+elseif(MSVC)
     set(MMX_FLAG "/arch:MMX")
     set(SSE2_FLAG "/arch:SSE2")
     set(SSE3_FLAG "/arch:SSE3")
     SET(AVX_FLAG "/arch:AVX")
     SET(AVX2_FLAG "/arch:AVX2")
-ENDIF()
+endif()
 
 set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
 
 # Check  MMX
 set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
+set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <mmintrin.h>
 int main()
@@ -32,6 +33,7 @@ int main()
 
 # Check SSE2
 set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
+set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <emmintrin.h>
 int main()
@@ -42,6 +44,7 @@ int main()
 
 # Check SSE3
 set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
+set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <pmmintrin.h>
 int main()
@@ -55,6 +58,7 @@ int main()
 
 # Check AVX
 set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
@@ -67,6 +71,7 @@ int main()
 
 # Check AVX 2
 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 30b144d849bec367cd0197b6082889e011193a9a..0d34dec8e908c5e61001500725187a2233797f46 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -145,7 +145,7 @@ PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以
 
 Jupyter Notebook是一个开源的web程序，大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
 
-PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Nodebook。
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
 如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
 
 我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md
index f7aa525054468670f59309ddf9206af55bb77869..2dea231ca5487978d59a4d0a570431722ed6b3bf 100644
--- a/doc/howto/usage/cmd_parameter/arguments_cn.md
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
@@ -63,7 +63,7 @@
 </tr>
 
 <tr>
-<td class="left" rowspan="15">训练</td><td class="left">dot_period</td>
+<td class="left" rowspan="14">训练</td><td class="left">dot_period</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
index 9279bac7f4b2898c18979630a8d6dfcb2dba70e0..ada51c2d73263898b2c748437f8eb0f30b537073 100644
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -8,3 +8,4 @@ PaddlePaddle 文档
   howto/index_cn.rst
   api/index_cn.rst
   faq/index_cn.rst
+  mobile/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 64684b8b9b27e245c6b32ea28809d3bbce22fab9..23b64b6cadf776d44c4d0aa5a550ffe24be13b18 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -7,3 +7,4 @@ PaddlePaddle Documentation
   getstarted/index_en.rst
   howto/index_en.rst
   api/index_en.rst
+  mobile/index_en.rst
diff --git a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
similarity index 95%
rename from doc/howto/cross_compiling/cross_compiling_for_android_cn.md
rename to doc/mobile/cross_compiling_for_android_cn.md
index 58e4dd9c3fe43f963d00152aa4f456fadbb12bf3..882066f23714f7ab3bba9199b5fa5ff2325ce849 100644
--- a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -20,10 +20,32 @@ $ docker build -t username/paddle-android:dev . -f Dockerfile.android
 构建好开发镜像后，即可使用开发镜像来编译Android版PaddlePaddle C-API库。
 Android的Docker开发镜像向用户提供两个可配置的参数：
 
-| Argument        | Optional Values         | Default |
-|-----------------|-------------------------|---------|
-|`ANDROID_ABI`    |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
-|`ANDROID_API`    |`>= 21` | `21` |
+<table class="docutils">
+<colgroup>
+  <col width="25%" />
+  <col width="50%" />
+  <col width="25%" />
+</colgroup>
+<thead valign="bottom">
+  <tr class="row-odd">
+  <th class="head">Argument</th>
+  <th class="head">Optional Values</th>
+  <th class="head">Default</th>
+</tr>
+</thead>
+<tbody valign="top">
+  <tr class="row-even">
+  <td>ANDROID_ABI</td>
+  <td>armeabi-v7a, arm64-v8a</td>
+  <td>armeabi-v7a</td>
+</tr>
+<tr class="row-odd">
+  <td>ANDROID_API</td>
+  <td>>= 21</td>
+  <td>21</td>
+</tr>
+</tbody>
+</table>
 
 - 编译`armeabi-v7a`，`Android API 21`的PaddlePaddle库
   ```bash
diff --git a/doc/howto/cross_compiling/cross_compiling_for_android.md b/doc/mobile/cross_compiling_for_android_en.md
similarity index 93%
rename from doc/howto/cross_compiling/cross_compiling_for_android.md
rename to doc/mobile/cross_compiling_for_android_en.md
index 161863e5c0a2c002af7d7611dad53c2c19148722..26858581fc1d77a9391520ac0dfd80fbd98f508c 100644
--- a/doc/howto/cross_compiling/cross_compiling_for_android.md
+++ b/doc/mobile/cross_compiling_for_android_en.md
@@ -26,10 +26,32 @@ $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_A
 
 The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
 
-| Argument        | Optional Values         | Default |
-|-----------------|-------------------------|---------|
-|`ANDROID_ABI`    |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
-|`ANDROID_API`    |`>= 21` | `21` |
+<table class="docutils">
+<colgroup>
+  <col width="25%" />
+  <col width="50%" />
+  <col width="25%" />
+</colgroup>
+<thead valign="bottom">
+  <tr class="row-odd">
+  <th class="head">Argument</th>
+  <th class="head">Optional Values</th>
+  <th class="head">Default</th>
+</tr>
+</thead>
+<tbody valign="top">
+  <tr class="row-even">
+  <td>ANDROID_ABI</td>
+  <td>armeabi-v7a, arm64-v8a</td>
+  <td>armeabi-v7a</td>
+</tr>
+<tr class="row-odd">
+  <td>ANDROID_API</td>
+  <td>>= 21</td>
+  <td>21</td>
+</tr>
+</tbody>
+</table>
 
 The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
 
diff --git a/doc/howto/cross_compiling/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md
similarity index 91%
rename from doc/howto/cross_compiling/cross_compiling_for_ios_cn.md
rename to doc/mobile/cross_compiling_for_ios_cn.md
index 32c490d9aa4202e17aa1784a45a317c5307b98ea..cda636a67de712e072f4cc7ad859dda75211eaa8 100644
--- a/doc/howto/cross_compiling/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -27,10 +27,28 @@ iOS平台可选配置参数：
   - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
 - `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示：
 
-   | IOS_PLATFORM | IOS_ARCH             |
-   |--------------|----------------------|
-   |   OS         | armv7, armv7s, arm64 (默认) |
-   | SIMULATOR    | i386, x86_64 (默认)         |   
+    <table class="docutils">
+    <colgroup>
+      <col width="35%" />
+      <col width="65%" />
+    </colgroup>
+    <thead valign="bottom">
+      <tr class="row-odd">
+      <th class="head">IOS_PLATFORM</th>
+      <th class="head">IOS_ARCH</th>
+    </tr>
+    </thead>
+    <tbody valign="top">
+      <tr class="row-even">
+      <td>OS</td>
+      <td>armv7, armv7s, arm64 (默认)</td>
+    </tr>
+    <tr class="row-odd">
+      <td>SIMULATOR</td>
+      <td>i386, x86_64 (默认)</td>
+    </tr>
+    </tbody>
+    </table>
 
 - `IOS_DEPLOYMENT_TARGET`，最小的iOS部署版本，默认值为`7.0`。
 - `IOS_ENABLE_BITCODE`，是否使能[Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3)，可设置`ON/OFF`，默认值为`ON`。
diff --git a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md
similarity index 100%
rename from doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md
rename to doc/mobile/cross_compiling_for_raspberry_cn.md
diff --git a/doc/howto/cross_compiling/cross_compiling_for_raspberry_en.md b/doc/mobile/cross_compiling_for_raspberry_en.md
similarity index 100%
rename from doc/howto/cross_compiling/cross_compiling_for_raspberry_en.md
rename to doc/mobile/cross_compiling_for_raspberry_en.md
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1d99666e58b7043b85b0203ee0dfcd1957710161
--- /dev/null
+++ b/doc/mobile/index_cn.rst
@@ -0,0 +1,9 @@
+MOBILE
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_cn.md
+  cross_compiling_for_ios_cn.md
+  cross_compiling_for_raspberry_cn.md
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3c08d736717cfe8d5fdf449dc58015086befbe60
--- /dev/null
+++ b/doc/mobile/index_en.rst
@@ -0,0 +1,8 @@
+MOBILE
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_en.md
+  cross_compiling_for_raspberry_en.md
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index e767856d5012fd205f6b57f9721d0cbca8dc46ed..d267b14657be2a773d1dacfd9ac3767cddc47415 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -29,32 +29,32 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
 add_dependencies(paddle_capi paddle_proto)
 
 # TODO: paddle_capi_whole will be removed.
+set(PADDLE_CAPI_LAYERS_LIBS
+    paddle_function
+    paddle_gserver)
 if(MOBILE_INFERENCE)
-    set(PADDLE_CAPI_INFER_LIBS
-        paddle_utils
-        paddle_parameter
-        paddle_math
-        paddle_cuda
-        paddle_function
-        paddle_gserver
-        paddle_proto)
+  set(PADDLE_CAPI_ENGINE_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_proto)
 else()
-    set(PADDLE_CAPI_INFER_LIBS
-        paddle_utils
-        paddle_parameter
-        paddle_math
-        paddle_cuda
-        paddle_function
-        paddle_gserver
-        paddle_proto
-        paddle_pserver
-        paddle_network)
+  set(PADDLE_CAPI_ENGINE_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_proto
+      paddle_pserver
+      paddle_network)
 endif()
+set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
 cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
 
 # Link the static library for inference
-cc_library(paddle_capi_engine DEPS paddle_capi paddle_utils paddle_parameter paddle_math paddle_cuda paddle_proto)
-cc_library(paddle_capi_layers DEPS paddle_function paddle_gserver)
+cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
+cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})
 
 # Link the shared library for inference
 if(NOT IOS)
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
index 19b7ad1869af98e6313fe85a40203fd1e84f31d6..00d8ce017aa0121217688a1afc1fe31b4c3619ec 100644
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
@@ -98,8 +98,19 @@ void SubSequenceLayer::forward(PassType passType) {
   CHECK_EQ(numSequences2, numSequences3);
 
   MatrixPtr inputValue = input.value;
-  IVectorPtr offsetValue = offsetSeq.ids;
-  IVectorPtr sizeValue = sizeSeq.ids;
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }
 
   CHECK_EQ(offsetValue->getSize(), numSequences1);
   CHECK_EQ(sizeValue->getSize(), numSequences1);
@@ -176,8 +187,21 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) {
   size_t numSequences1 = startPositions1->getSize() - 1;
   const int* starts1 = startPositions1->getData();
 
-  IVectorPtr offsetValue = getInput(1).ids;
-  IVectorPtr sizeValue = getInput(2).ids;
+  const Argument& offsetSeq = getInput(1);
+  const Argument& sizeSeq = getInput(2);
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }
 
   int* offsets = offsetValue->getData();
   int* sizes = sizeValue->getData();
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 7e5869cf2ca30caefa9cf29358771ef7132b5ba4..384f004e0ec8c8af23eb40611409d3aafef05912 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -75,6 +75,13 @@ function(op_library TARGET)
         # It's enough to just adding one operator to pybind
         file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
     endif()
+
+    # conv_transpose_op contains several operators
+    if ("${TARGET}" STREQUAL "conv_transpose_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
+    endif()
     
     # pool_cudnn_op contains several operators
     if ("${TARGET}" STREQUAL "pool_cudnn_op")
@@ -147,10 +154,14 @@ set(DEPS_OPS
     pool_op
     pool_with_index_op
     conv_op
+    lstm_op
+    conv_transpose_op
     nccl_op
     sequence_conv_op
+    sequence_pool_op
     lod_rank_table_op
-    lstm_op)
+    lstm_op
+    gru_op)
 
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
@@ -164,10 +175,14 @@ if(WITH_GPU)
 op_library(nccl_op DEPS nccl_common)
 endif()
 op_library(sequence_conv_op DEPS context_project)
+op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(conv_transpose_op DEPS vol2col)
+op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
         DEPS net_op tensor_array)
 op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
+
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
     op_library(${src})
diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cc b/paddle/operators/conv2d_transpose_cudnn_op.cc
index 8ce94e0f04f14e1eae7e7d01280601cc72dea8c4..fce1357ce5af5f11ccc5941690431393301e6725 100644
--- a/paddle/operators/conv2d_transpose_cudnn_op.cc
+++ b/paddle/operators/conv2d_transpose_cudnn_op.cc
@@ -12,7 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/conv2d_transpose_op.h"
+#include "paddle/operators/conv_transpose_op.h"
 
 namespace paddle {
 namespace operators {
@@ -38,13 +38,13 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv2d_transpose_cudnn, ops::Conv2DTransposeOp,
+REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp,
             ops::CudnnConv2DTransposeOpMaker, conv2d_transpose_cudnn_grad,
-            ops::Conv2DTransposeOpGrad);
+            ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose_cudnn,
-    ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose_cudnn_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cu b/paddle/operators/conv2d_transpose_cudnn_op.cu
index 61fcfb3bd8fa57f2c45fbf3a980dbe41041cff18..1aa8d110759a7d99c26cf7baaf6d4ce4b92975b9 100644
--- a/paddle/operators/conv2d_transpose_cudnn_op.cu
+++ b/paddle/operators/conv2d_transpose_cudnn_op.cu
@@ -15,7 +15,7 @@
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memory.h"
-#include "paddle/operators/conv2d_transpose_op.h"
+#include "paddle/operators/conv_transpose_op.h"
 #include "paddle/platform/assert.h"
 #include "paddle/platform/cudnn_helper.h"
 
diff --git a/paddle/operators/conv2d_transpose_op.cc b/paddle/operators/conv2d_transpose_op.cc
deleted file mode 100644
index 8f5d18cddf45d1129040454adbc95a511ccf0583..0000000000000000000000000000000000000000
--- a/paddle/operators/conv2d_transpose_op.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/conv2d_transpose_op.h"
-
-namespace paddle {
-namespace operators {
-
-void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of Conv2DTransposeOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                 "Input(Filter) of Conv2DTransposeOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                 "Output(Output) of Conv2DTransposeOp should not be null.");
-
-  auto in_dims = ctx->GetInputDim("Input");
-  auto filter_dims = ctx->GetInputDim("Filter");
-  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-
-  for (size_t i = 0; i < paddings.size(); ++i) {
-    PADDLE_ENFORCE_EQ(paddings[i], 0,
-                      "No Padding allowed in conv transpose op.");
-  }
-
-  PADDLE_ENFORCE_EQ(in_dims.size(), 4,
-                    "Conv2DTransposeOp input should be 4-D tensor.");
-  PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
-                    "Conv2DTransposeOp filter should be 4-D tensor.");
-  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
-                    "input and kernel input dimension should be equal.");
-
-  auto output_height = (in_dims[2] - 1) * strides[0] + filter_dims[2];
-  auto output_width = (in_dims[3] - 1) * strides[1] + filter_dims[3];
-  ctx->SetOutputDim("Output",
-                    {in_dims[0], filter_dims[1], output_height, output_width});
-}
-
-Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
-    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
-  AddInput(
-      "Input",
-      "(Tensor) The input tensor of convolution transpose operator. "
-      "The format of input tensor is NCHW, where N is batch size, C is the "
-      "number of input channels, H is the height of the image, and "
-      "W is the width of the image.");
-  AddInput("Filter",
-           "(Tensor) The filter tensor of convolution transpose operator."
-           "The format of the filter tensor is CMHW, where C is the number of "
-           "output image channels, M is the number of input image channels, "
-           "H is the height of the filter, and W is the width of the filter. "
-           "We enforce groups number == 1 and padding == 0 in "
-           "the convolution transpose scenario.");
-  AddOutput("Output",
-            "(Tensor) The output tensor of convolution transpose operator."
-            "The format of output tensor is also NCHW.");
-  AddAttr<std::vector<int>>("strides",
-                            "strides of convolution transpose operator.")
-      .SetDefault({1, 1});
-  AddAttr<std::vector<int>>("paddings",
-                            "paddings of convolution transpose operator.")
-      .SetDefault({0, 0});
-  AddComment(R"DOC(
-Convolution Transpose Operator.
-
-The convolution transpose operation calculates the output based on the input, 
-filter, strides, paddings, and groups parameters. The size of each dimension 
-of the parameters is checked in the infer-shape method.
-
-)DOC");
-}
-
-void Conv2DTransposeOpGrad::InferShape(
-    framework::InferShapeContext* ctx) const {
-  auto in_dims = ctx->GetInputDim("Input");
-  auto filter_dims = ctx->GetInputDim("Filter");
-  if (ctx->HasOutput(framework::GradVarName("Input"))) {
-    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
-  }
-  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
-    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP(conv2d_transpose, ops::Conv2DTransposeOp,
-            ops::Conv2DTransposeOpMaker, conv2d_transpose_grad,
-            ops::Conv2DTransposeOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    conv2d_transpose,
-    ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    conv2d_transpose_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/conv2d_transpose_op.h b/paddle/operators/conv2d_transpose_op.h
deleted file mode 100644
index cab7788227690621a0e5b744197b86c515bbef72..0000000000000000000000000000000000000000
--- a/paddle/operators/conv2d_transpose_op.h
+++ /dev/null
@@ -1,254 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/im2col.h"
-#include "paddle/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-
-// Define Op classes in .h file so that other conv transpose
-// operator implementations can reuse the code.
-class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  Conv2DTransposeOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker);
-};
-
-class Conv2DTransposeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-class Conv2DTransposeOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-template <typename Place, typename T>
-class GemmConv2DTransposeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    // The filter will be reshaped, so it should not be constant pointer
-    Tensor filter = *context.Input<Tensor>("Filter");
-
-    Tensor* output = context.Output<Tensor>("Output");
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-
-    // TODO(Zhuoyuan): Paddings can be added in future.
-    // groups will alway be disabled in conv2d_transpose.
-
-    const int batch_size = input->dims()[0];
-    const int m = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-
-    const int k_h = filter.dims()[2];
-    const int k_w = filter.dims()[3];
-
-    const int c = output->dims()[1];  // output channels
-    const int o_h = output->dims()[2];
-    const int o_w = output->dims()[3];
-
-    paddle::operators::math::Col2ImFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        col2im;
-
-    // use col_shape in the im2col and col2im calculation
-    DDim col_shape = {c, k_h, k_w, h, w};
-
-    // use col_matrix_shape in the gemm calculation
-    DDim col_matrix_shape = {c * k_h * k_w, h * w};
-
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix;
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-
-    DDim output_shape = {c, o_h, o_w};
-    DDim input_matrix_shape = {m, h * w};
-
-    DDim filter_matrix_shape = {m, c * k_h * k_w};
-    filter.Resize(filter_matrix_shape);
-
-    // convolution transpose: gemm + col2im (similar to conv-backward on input)
-
-    output->mutable_data<T>(context.GetPlace());
-    auto t = framework::EigenVector<T>::Flatten(*output);
-    t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
-
-    for (int i = 0; i < batch_size; i++) {
-      // batch with size (M, h * w)
-      Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
-      // filter size: (M, c * k_h * k_w)
-
-      // output size: (c, o_h, o_w)
-      Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
-
-      // col_matrix = filter * input_batch
-      // of shape (c * k_h * k_w, h * w)
-      math::matmul<Place, T>(context.device_context(), filter, true,
-                             input_batch, false, T(1.0), &col_matrix, T(0.0));
-      col2im(context.device_context(), output_batch, col, strides[0],
-             strides[1], 0, 0, 0, 0);
-    }
-  }
-};
-
-template <typename Place, typename T>
-class GemmConv2DTransposeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-
-    // For filter, we do not use const pointer b/c we will do reshape,
-    // but we should avoid modifying its value.
-    Tensor filter = *context.Input<Tensor>("Filter");
-
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    // Actually, no paddings and groups allowed in conv transpose.
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-
-    const int batch_size = input->dims()[0];
-    const int m = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-
-    const int k_h = filter.dims()[2];
-    const int k_w = filter.dims()[3];
-
-    const int c = output_grad->dims()[1];  // output channels
-    const int o_h = output_grad->dims()[2];
-    const int o_w = output_grad->dims()[3];
-
-    // Only im2col functor required for bp to get to the right shape
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
-
-    // use col_shape in the im2col and col2im calculation
-    DDim col_shape = {c, k_h, k_w, h, w};
-
-    // use col_matrix_shape in the gemm calculation
-    DDim col_matrix_shape_f = {c * h * w, k_h * k_w};
-
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-
-    DDim output_shape = {c, o_h, o_w};
-    DDim input_matrix_shape = {m, h * w};
-
-    DDim filter_matrix_shape = {m, c * k_h * k_w};
-    filter.Resize(filter_matrix_shape);
-
-    // convolution transpose grad on input:
-    // im2col + gemm (similar to conv-forward)
-    // input need to compute gradient
-    if (input_grad) {
-      Tensor col_matrix;
-      col_matrix.ShareDataWith(col);
-      DDim col_matrix_shape = {c * k_h * k_w, h * w};
-      col_matrix.Resize(col_matrix_shape);
-
-      input_grad->mutable_data<T>(context.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
-
-      for (int i = 0; i < batch_size; i++) {
-        // batch with size (c, o_h * o_w)
-        Tensor output_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_shape);
-        // filter of size (m, c * k_h * k_w)
-
-        // batch with size (m, h, w)
-        Tensor input_grad_batch =
-            input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
-
-        // im2col: dy from (c, o_h, o_w) -> (c * k_h * k_w, h * w)
-        im2col(context.device_context(), output_grad_batch, col, strides[0],
-               strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
-
-        // gemm: dx = filter * dy
-        // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, c, h)
-        math::matmul<Place, T>(context.device_context(), filter, false,
-                               col_matrix, false, T(1.0), &input_grad_batch,
-                               T(0.0));
-      }
-    }
-
-    // filter gradient required
-    if (filter_grad) {
-      Tensor col_matrix_f;
-      col_matrix_f.ShareDataWith(col);
-      DDim col_matrix_shape_f = {c * h * w, k_h * k_w};
-      col_matrix_f.Resize(col_matrix_shape_f);
-
-      filter_grad->mutable_data<T>(context.GetPlace());
-      Tensor filter_grad_ = *filter_grad;
-      filter_grad_.Resize(filter_matrix_shape);
-      auto t = framework::EigenVector<T>::Flatten(filter_grad_);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
-
-      for (int i = 0; i < batch_size; ++i) {
-        // batch with size (c, o_h, o_w)
-        Tensor output_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_shape);
-        // input batch
-        Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
-
-        // im2col: (c * h * w, k_h * k_w)
-        im2col(context.device_context(), output_grad_batch, col, strides[0],
-               strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
-
-        // gemm: d_filter = x * y_grad^T
-        // (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, c, h)
-        math::matmul<Place, T>(context.device_context(), in_batch, false,
-                               col_matrix_f, true, T(1.0), &filter_grad_,
-                               T(1.0));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..50081779a5ea3c81884007d4e4b7832dc4ea2bdd
--- /dev/null
+++ b/paddle/operators/conv_transpose_op.cc
@@ -0,0 +1,203 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv_transpose_op.h"
+
+namespace paddle {
+namespace operators {
+
+void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("Input"),
+                 "Input(Input) of ConvTransposeOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                 "Input(Filter) of ConvTransposeOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                 "Output(Output) of ConvTransposeOp should not be null.");
+
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    PADDLE_ENFORCE_EQ(paddings[i], 0,
+                      "No Padding allowed in conv transpose op.");
+  }
+
+  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+                 "ConvTransposeOp intput should be 4-D or 5-D tensor.");
+  PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
+                    "ConvTransposeOp input dimension and filter dimension "
+                    "should be the same.");
+  PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U,
+                 "ConvTransposeOp input dimension and strides dimension should "
+                 "be consistent.");
+  PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
+                    "ConvTransposeOp paddings dimension and Conv strides "
+                    "dimension should be the same.");
+  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
+                    "In ConvTransposeOp, The input channel should be the same "
+                    "as the number of filters.");
+
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] +
+                           filter_dims[i + 2]);
+  }
+  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
+}
+
+Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
+    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution transpose operator. "
+      "The format of input tensor is NCHW. Where N is batch size, C is the "
+      "number of input channels, H is the height of the feature, and "
+      "W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution transpose operator. "
+           "The format of the filter tensor is CMHW, where C is the number of "
+           "output image channels, M is the number of input image channels, "
+           "H is the height of the filter, and W is the width of the filter. "
+           "We enforce groups number == 1 and padding == 0 in "
+           "the convolution transpose scenario.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution transpose operator. "
+            "The format of output tensor is also NCHW.");
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector defalut:{1, 1}), strides of convolution transpose operator.")
+      .SetDefault({1, 1});
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector defalut:{0, 0}), paddings of convolution transpose operator.")
+      .SetDefault({0, 0});
+  AddComment(R"DOC(
+Convolution2D Transpose Operator.
+
+The convolution transpose operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+
+Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
+size, C is the number of channels, H is the height of the feature, and 
+W is the width of the feature. Parameters(ksize, strides, paddings) are two elements.
+These two elements represent height and width, respectively.
+The input(X) size and output(Out) size may be different.
+Example:
+  Input:
+       Input shape: (N, C_in, H_in, W_in)
+       Filter shape: (C_in, C_out, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, H_out, W_out)
+  where
+       H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
+       W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
+)DOC");
+}
+
+Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
+    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput("Input",
+           "(Tensor) The input tensor of convolution transpose operator."
+           "The format of input tensor is NCDHW. Where N is batch size, C is "
+           "the number of channels, D is the depth of the feature, H is the "
+           "height of the feature, and "
+           "W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution transpose operator."
+           "The format of the filter tensor is CMDHW, where C is the number of "
+           "output image channels, M is the number of input image channels, D "
+           "is the depth of the filter, H is the height of the filter, and "
+           "W is the width of the filter."
+           "We enforce groups number == 1 and padding == 0 in "
+           "the convolution3d transpose scenario.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution transpose operator."
+            "The format of output tensor is also NCDHW."
+            "Where N is batch size, C is "
+            "the number of channels, D is the depth of the feature, H is the "
+            "height of the feature, and W is the width of the feature.");
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector defalut:{1, 1, 1}), strides of convolution transpose operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector defalut:{0, 0, 0}), paddings of convolution transpose operator.")
+      .SetDefault({0, 0, 0});
+  AddComment(R"DOC(
+Convolution3D Transpose Operator.
+
+The convolution transpose operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+
+Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
+size, C is the number of channels, D is the depth of the feature, 
+H is the height of the feature, and W is the width of the feature. 
+Parameters(ksize, strides, paddings) are three elements.
+These three elements represent depth, height and width, respectively.
+The input(X) size and output(Out) size may be different.
+Example:
+  Input:
+       Input shape: (N, C_in, D_in, H_in, W_in)
+       Filter shape: (C_in, C_out, D_f, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, D_out, H_out, W_out)
+  where
+       D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
+       H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
+       W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2];
+)DOC");
+}
+
+void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const {
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  if (ctx->HasOutput(framework::GradVarName("Input"))) {
+    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
+  }
+  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
+    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
+            conv2d_transpose_grad, ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv2d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+
+REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
+            conv3d_transpose_grad, ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/conv2d_transpose_op.cu b/paddle/operators/conv_transpose_op.cu
similarity index 63%
rename from paddle/operators/conv2d_transpose_op.cu
rename to paddle/operators/conv_transpose_op.cu
index 931ac9eed294c4fe7c726d8cc2c4d9a39ec12828..401cddb379ced134b800d2a078fe130a2850fbb2 100644
--- a/paddle/operators/conv2d_transpose_op.cu
+++ b/paddle/operators/conv_transpose_op.cu
@@ -12,13 +12,20 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/conv2d_transpose_op.h"
+#include "paddle/operators/conv_transpose_op.h"
 
 namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
     conv2d_transpose,
-    ops::GemmConv2DTransposeKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
     conv2d_transpose_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
+
+REGISTER_OP_GPU_KERNEL(
+    conv3d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    conv3d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c1a6220d784abf89ec789f94d9cff9e5414db04
--- /dev/null
+++ b/paddle/operators/conv_transpose_op.h
@@ -0,0 +1,293 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/im2col.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/vol2col.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+// Define Op classes in .h file so that other conv transpose
+// operator implementations can reuse the code.
+class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv2DTransposeOpMaker(framework::OpProto* proto,
+                         framework::OpAttrChecker* op_checker);
+};
+
+class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv3DTransposeOpMaker(framework::OpProto* proto,
+                         framework::OpAttrChecker* op_checker);
+};
+
+class ConvTransposeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+class ConvTransposeOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+template <typename Place, typename T>
+class GemmConvTransposeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    // The filter will be reshaped, so it should not be constant pointer
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* output = context.Output<Tensor>("Output");
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    // TODO(Zhuoyuan): Paddings can be added in future.
+    // groups will alway be disabled in conv2dtranspose.
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // input_shape_vec: {h, w} or {d, h, w}
+    std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
+    input_shape_vec.erase(input_shape_vec.begin(), input_shape_vec.begin() + 2);
+
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
+    filter_shape_vec.erase(filter_shape_vec.begin(),
+                           filter_shape_vec.begin() + 2);
+
+    // use col_shape in the im2col and col2im (or vol2col and col2vol)
+    // calculation
+    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(output->dims()[1]);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), input_shape_vec.begin(),
+                         input_shape_vec.end());
+    DDim col_shape(framework::make_ddim(col_shape_vec));
+
+    // use col_matrix_shape in the gemm calculation
+    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+    DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
+
+    Tensor col;
+    col.mutable_data<T>(col_shape, context.GetPlace());
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+
+    // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
+    DDim output_shape =
+        framework::slice_ddim(output->dims(), 1, output->dims().size());
+
+    // input matrix size: (m, h * w) or (m, d * h * w)
+    DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
+
+    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
+    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
+    filter.Resize(filter_matrix_shape);
+
+    output->mutable_data<T>(context.GetPlace());
+    math::SetConstant<Place, T> set_zero;
+    set_zero(context.device_context(), output, static_cast<T>(0));
+
+    // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
+    // on input)
+    for (int i = 0; i < batch_size; i++) {
+      // batch with size (m, h * w) or (m, d * h * w)
+      Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
+
+      // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
+      Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
+
+      // col_matrix = filter * input_batch
+      // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+      math::matmul<Place, T>(context.device_context(), filter, true,
+                             input_batch, false, static_cast<T>(1.0),
+                             &col_matrix, static_cast<T>(0.0));
+
+      if (filter_shape_vec.size() == 2) {
+        // col2im: col_matrix -> dy
+        // from (c * k_h * k_w, h * w) to (c, o_h, o_w)
+        math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
+
+        col2im(context.device_context(), output_batch, col, strides[0],
+               strides[1], 0, 0, 0, 0);
+      } else if (filter_shape_vec.size() == 3) {
+        // col2vol: col_matrix -> dy
+        // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
+        math::Col2VolFunctor<Place, T> col2vol;
+        col2vol(context.device_context(), output_batch, col, strides[0],
+                strides[1], strides[2], 0, 0, 0);
+      }
+    }
+  }
+};
+
+template <typename Place, typename T>
+class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    const Tensor* output_grad =
+        context.Input<Tensor>(framework::GradVarName("Output"));
+    // For filter, we do not use const pointer b/c we will do reshape,
+    // but we should avoid modifying its value.
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* input_grad =
+        context.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad =
+        context.Output<Tensor>(framework::GradVarName("Filter"));
+
+    if ((!input_grad) && (!filter_grad)) return;
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    // Actually, no paddings and groups allowed in conv transpose.
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // input_shape_vec: {h, w} or {d, h, w}
+    std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
+    input_shape_vec.erase(input_shape_vec.begin(), input_shape_vec.begin() + 2);
+
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
+    filter_shape_vec.erase(filter_shape_vec.begin(),
+                           filter_shape_vec.begin() + 2);
+
+    // use col_shape in the im2col and col2im (or vol2col and col2vol)
+    // calculation
+    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(output_grad->dims()[1]);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), input_shape_vec.begin(),
+                         input_shape_vec.end());
+    DDim col_shape(framework::make_ddim(col_shape_vec));
+
+    // use col_matrix_shape in the gemm calculation
+    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+    DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
+
+    // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
+    DDim output_shape = framework::slice_ddim(output_grad->dims(), 1,
+                                              output_grad->dims().size());
+
+    // input matrix size: (m, h * w) or (m, d * h * w)
+    DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
+
+    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
+    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
+    filter.Resize(filter_matrix_shape);
+
+    // convolution transpose grad on input:
+    // im2col + gemm (similar to conv-forward)
+    // input need to compute gradient
+    if (input_grad || filter_grad) {
+      Tensor col;
+      col.mutable_data<T>(col_shape, context.GetPlace());
+      // col_matrix shares the same piece of data with col,
+      // but will be reshaped into a two-dimensional matrix shape
+      // to call the matrix multiplication interface.
+      Tensor col_matrix;
+      col_matrix.ShareDataWith(col);
+      col_matrix.Resize(col_matrix_shape);
+
+      Tensor filter_grad_;
+      math::SetConstant<Place, T> set_zero;
+
+      if (input_grad) {
+        input_grad->mutable_data<T>(context.GetPlace());
+        set_zero(context.device_context(), input_grad, static_cast<T>(0));
+      }
+      if (filter_grad) {  // filter size (m, c, k_h, k_w)
+        filter_grad->mutable_data<T>(context.GetPlace());
+        set_zero(context.device_context(), filter_grad, static_cast<T>(0));
+        filter_grad_ = *filter_grad;
+        filter_grad_.Resize(filter_matrix_shape);
+      }
+
+      for (int i = 0; i < batch_size; i++) {
+        // batch with size (c, o_h * o_w)
+        Tensor output_grad_batch =
+            output_grad->Slice(i, i + 1).Resize(output_shape);
+
+        if (filter_shape_vec.size() == 2) {
+          // im2col: dy -> col matrix
+          // from (c, o_h, o_w) to (c * k_h * k_w, h * w)
+          math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+          im2col(context.device_context(), output_grad_batch, col, strides[0],
+                 strides[1], paddings[0], paddings[0], paddings[1],
+                 paddings[1]);
+        } else if (filter_shape_vec.size() == 3) {
+          // vol2col: dy -> col_matrix
+          // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w)
+          math::Vol2ColFunctor<Place, T> vol2col;
+          vol2col(context.device_context(), output_grad_batch, col, strides[0],
+                  strides[1], strides[2], paddings[0], paddings[1],
+                  paddings[2]);
+        }
+
+        if (input_grad) {
+          // batch with size (m, h, w)
+          Tensor input_grad_batch =
+              input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
+          // gemm: dx = filter * dy
+          // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, h * w)
+          // or
+          // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
+          // d, h, w)
+          math::matmul<Place, T>(context.device_context(), filter, false,
+                                 col_matrix, false, static_cast<T>(1.0),
+                                 &input_grad_batch, static_cast<T>(0.0));
+        }
+        if (filter_grad) {
+          // input batch
+          Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
+          // gemm: d_filter = x * dy^T
+          // (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, k_h * k_w)
+          // or
+          // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
+          // k_h * k_w)
+          math::matmul<Place, T>(context.device_context(), in_batch, false,
+                                 col_matrix, true, static_cast<T>(1.0),
+                                 &filter_grad_, static_cast<T>(1.0));
+        }
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 24df1fcadac75315890635f4d3aaa7146c1cc27b..9d41879b27a24f83090f5abf1325eca5f9488d00 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -114,21 +114,17 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
              "where N is the batch size and D is the number of classes. "
              "This input is a probability computed by the previous operator, "
              "which is almost always the result of a softmax operator.");
-    AddInput(
-        "Label",
-        "(Tensor, default Tensor<int>), the ground truth which is "
-        "a 2-D tensor. "
-        "When soft_label is set to false, Label is a Tensor<int> with shape "
-        "[N x 1]. "
-        "When soft_label is set to true, Label is a Tensor<float/double> "
-        "with shape [N x K].");
+    AddInput("Label",
+             "(Tensor), the ground truth which is a 2-D tensor. When "
+             "soft_label is set to false, Label is a Tensor<int64> with shape "
+             "[N x 1]. When soft_label is set to true, Label is a "
+             "Tensor<float/double> with shape [N x K].");
     AddOutput("Y",
-              "(Tensor, default Tensor<float>), a 2-D tensor "
-              "with shape [N x 1]. The cross entropy loss.");
-    AddAttr<bool>(
-        "soft_label",
-        "(bool, default false), a flag to indicate whether to interpretate "
-        "the given labels as soft labels.")
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
+              "[N x 1]. The cross entropy loss.");
+    AddAttr<bool>("soft_label",
+                  "(bool, default false), a flag indicating whether to "
+                  "interpretate the given labels as soft labels.")
         .SetDefault(false);
     AddComment(R"DOC(
 CrossEntropy Operator.
diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5aa03f8916a67222fb0ca5781533766063e52683
--- /dev/null
+++ b/paddle/operators/gru_op.cc
@@ -0,0 +1,220 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/gru_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class GRUOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
+                   "Output(%s) of GRUOp should not be null.", "BatchGate");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
+                   "Output(%s) of GRUOp should not be null.",
+                   "BatchResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
+                   "Output(%s) of GRUOp should not be null.", "BatchHidden");
+    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
+                   "Output(%s) of GRUOp should not be null.", "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int input_size = input_dims[1];
+    int frame_size = weight_dims[0];
+    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
+                      "The input_size must be 3 times of frame_size in GRUOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_dims[1], frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("H0")) {
+      auto h0_dims = ctx->GetInputDim("H0");
+      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
+                        "The width of H0 must be equal to frame_size.");
+    }
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+    }
+    ctx->SetOutputDim("BatchGate", input_dims);
+    ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size});
+    ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size});
+    ctx->SetOutputDim("Hidden", {input_dims[0], frame_size});
+    ctx->ShareLoD("Input", "Hidden");
+  }
+};
+
+class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GRUOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LoDTensor) The first input is a LodTensor, which supports "
+             "variable-time length input sequence. The underlying tensor in "
+             "this LoDTenosr is a matrix with shape (T X 3D), where, T is the "
+             "total time steps in this mini-batch, D is the hidden size.");
+    AddInput("H0",
+             "(Tensor, optional) The initial hidden state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size, D is the hidden size.")
+        .AsDispensable();
+    AddInput(
+        "Weight",
+        "(Tensor) The learnable hidden-hidden weight matrix with shape "
+        "(D x 3D), where D is the hidden size. The elements continuous in "
+        "memory can be divided into two parts. The first part are weights of "
+        "the update gate and reset gate with shape (D x 2D), and the second "
+        "part are weights of output candidate with shape (D x D).");
+    AddInput("Bias",
+             "(Tensor, optional) Bias vector with shape (1 x 3D) concating "
+             "bias of the update gate, reset gate and output candidate.")
+        .AsDispensable();
+    AddOutput("BatchGate",
+              "(LoDTensor) To compute with batches, sequence data will be "
+              "reorganized into several successive batches each containing "
+              "data from the same time step. The LoDTensor BatchGate contains "
+              "the update gate, reset gate and output candidate values "
+              "organized in batches. The LoD size is 2. The first LoD contains "
+              "the batch offsets and the second LoD contains the indexes in "
+              "the raw sequence data.")
+        .AsIntermediate();
+    AddOutput(
+        "BatchResetHiddenPrev",
+        "(LoDTensor) The reseted hidden state LoDTensor organized in batches. "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.")
+        .AsIntermediate();
+    AddOutput(
+        "BatchHidden",
+        "(LoDTensor) The hidden state LoDTensor organized in batches.  "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.")
+        .AsIntermediate();
+    AddOutput(
+        "Hidden",
+        "(LoDTensor) the hidden state LoDTensor organized in sequences. "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.");
+    AddAttr<std::string>("activation",
+                         "(string, default tanh) "
+                         "The activation type used for output candidate {h}_t.")
+        .SetDefault("tanh");
+    AddAttr<std::string>(
+        "gate_activation",
+        "(string, default sigmoid) "
+        "The activation type used in update gate and reset gate.")
+        .SetDefault("sigmoid");
+    AddAttr<bool>("is_reverse",
+                  "(bool, defalut: False) "
+                  "whether to compute reversed GRU.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+GRU Operator implements part calculations of the complete GRU as following:
+
+\f[
+update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
+\f]
+
+@note To implement the complete GRU, fully-connected operator must be used  
+before to feed xu, xr and xc as the Input of GRU operator.
+)DOC");
+  }
+};
+
+class GRUGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUGradOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUGradOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
+                   "Input(%s) of GRUGradOp should not be null.", "BatchGate");
+    PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"),
+                   "Input(%s) of GRUGradOp should not be null.",
+                   "BatchResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasInput("BatchHidden"),
+                   "Input(%s) of GRUOp should not be null.", "BatchHidden");
+    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
+                   "Input(%s) of GRUGradOp should not be null.", "Hidden");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
+                   "Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int input_size = input_dims[1];
+    int frame_size = weight_dims[0];
+    int weight_height = weight_dims[0];
+    int weight_width = weight_dims[1];
+    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
+                      "The input_size must be 3 times of frame_size in GRUOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_height, frame_size,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    PADDLE_ENFORCE_EQ(
+        weight_width, frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("H0")) {
+      auto h0_dims = ctx->GetInputDim("H0");
+      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
+                        "The width of H0 must be equal to frame_size.");
+      auto h0_grad_name = framework::GradVarName("H0");
+      if (ctx->HasOutput(h0_grad_name))
+        ctx->SetOutputDim(h0_grad_name, h0_dims);
+    }
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name))
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+    auto input_grad_name = framework::GradVarName("Input");
+    if (ctx->HasOutput(input_grad_name))
+      ctx->SetOutputDim(input_grad_name, input_dims);
+    auto weight_grad_name = framework::GradVarName("Weight");
+    if (ctx->HasOutput(weight_grad_name))
+      ctx->SetOutputDim(weight_grad_name, weight_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
+REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel<paddle::platform::CPUPlace, float>,
+                       ops::GRUKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(gru_grad,
+                       ops::GRUGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::GRUGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/gru_op.cu b/paddle/operators/gru_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..35538c74b4bf678f8068999bfadb2589a1671be0
--- /dev/null
+++ b/paddle/operators/gru_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/gru_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel<paddle::platform::GPUPlace, float>,
+                       ops::GRUKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(gru_grad,
+                       ops::GRUGradKernel<paddle::platform::GPUPlace, float>,
+                       ops::GRUGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba90ec9816c40a6a49065ac6efcee6b93dffce90
--- /dev/null
+++ b/paddle/operators/gru_op.h
@@ -0,0 +1,231 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence2batch.h"
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class GRUKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    auto* input = context.Input<LoDTensor>("Input");
+    auto* h0 = context.Input<Tensor>("H0");
+    const T* h0_data = h0 ? h0->data<T>() : nullptr;
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* bias = context.Input<Tensor>("Bias");
+    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(context.GetPlace());
+    auto* batch_reset_hidden_prev =
+        context.Output<LoDTensor>("BatchResetHiddenPrev");
+    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
+    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
+    batch_hidden->mutable_data<T>(context.GetPlace());
+    auto* hidden = context.Output<LoDTensor>("Hidden");
+    hidden->mutable_data<T>(context.GetPlace());
+
+    context.ShareLoD("Input", "Hidden");
+
+    auto hidden_dims = hidden->dims();
+
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    math::LoDTensor2BatchFunctor<Place, T> to_batch;
+    to_batch(context.device_context(), *input, *batch_gate, true, is_reverse);
+
+    int frame_size = hidden_dims[1];
+    int batch_size = hidden_dims[0];
+    auto g = EigenMatrix<T>::From(*batch_gate);
+    auto place = context.GetEigenDevice<Place>();
+    if (bias) {
+      auto b = EigenMatrix<T>::From(*bias);
+      g.device(place) = g +
+                        b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
+                            .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
+    }
+
+    math::hl_gru_value<T> gru_value;
+    gru_value.gateWeight = const_cast<T*>(weight_data);
+    gru_value.stateWeight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+    gru_value.prevOutValue = const_cast<T*>(h0_data);
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    for (size_t n = 0; n < num_batch; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+      int cur_batch_size = bend - bstart;
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
+      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+      gru_value.outputValue = hidden_t.data<T>();
+      gru_value.gateValue = gate_t.data<T>();
+      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
+      math::GRUUnitFunctor<Place, T>::compute(
+          context.device_context(), gru_value, frame_size, cur_batch_size,
+          math::ActiveType(context.Attr<std::string>("activation")),
+          math::ActiveType(context.Attr<std::string>("gate_activation")));
+      gru_value.prevOutValue = gru_value.outputValue;
+    }
+
+    math::Batch2LoDTensorFunctor<Place, T> to_seq;
+    batch_hidden->set_lod(batch_gate->lod());
+    to_seq(context.device_context(), *batch_hidden, *hidden);
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
+
+template <typename Place, typename T>
+class GRUGradKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    auto* h0 = context.Input<Tensor>("H0");
+    const T* h0_data = h0 ? h0->data<T>() : nullptr;
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* batch_gate = context.Input<LoDTensor>("BatchGate");
+    auto* batch_reset_hidden_prev =
+        context.Input<LoDTensor>("BatchResetHiddenPrev");
+    auto* batch_hidden = context.Input<LoDTensor>("BatchHidden");
+    auto* hidden = context.Input<LoDTensor>("Hidden");
+    auto* hidden_grad =
+        context.Input<LoDTensor>(framework::GradVarName("Hidden"));
+    auto* input_grad =
+        context.Output<LoDTensor>(framework::GradVarName("Input"));
+    auto* h0_grad = context.Output<Tensor>(framework::GradVarName("H0"));
+    auto* weight_grad =
+        context.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto gate_dims = batch_gate->dims();
+    auto hidden_dims = hidden->dims();
+    int frame_size = hidden_dims[1];
+
+    math::LoDTensor2BatchFunctor<Place, T> to_batch;
+    LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad;
+    batch_hidden_grad.mutable_data<T>(hidden_dims, context.GetPlace());
+    batch_gate_grad.mutable_data<T>(gate_dims, context.GetPlace());
+    batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims,
+                                                 context.GetPlace());
+    math::SetConstant<Place, T> zero;
+    zero(context.device_context(), &batch_hidden_grad, static_cast<T>(0.0));
+    zero(context.device_context(), &batch_gate_grad, static_cast<T>(0.0));
+    zero(context.device_context(), &batch_reset_hidden_prev_grad,
+         static_cast<T>(0.0));
+
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    batch_hidden_grad.set_lod(batch_hidden->lod());
+    to_batch(context.device_context(), *hidden_grad, batch_hidden_grad, false,
+             is_reverse);
+
+    math::hl_gru_value<T> gru_value;
+    gru_value.gateWeight = const_cast<T*>(weight_data);
+    gru_value.stateWeight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+
+    math::hl_gru_grad<T> gru_grad;
+    if (weight_grad) {
+      gru_grad.gateWeightGrad =
+          weight_grad->mutable_data<T>(context.GetPlace());
+      zero(context.device_context(), weight_grad, static_cast<T>(0.0));
+      gru_grad.stateWeightGrad =
+          weight_grad->data<T>() + 2 * frame_size * frame_size;
+    } else {
+      gru_grad.gateWeightGrad = nullptr;
+      gru_grad.stateWeightGrad = nullptr;
+    }
+
+    auto batch_starts = batch_hidden_grad.lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+      int cur_batch_size = bend - bstart;
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      gru_value.gateValue = gate_t.data<T>();
+      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
+      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
+
+      Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
+      gru_grad.outputGrad = hidden_grad_t.data<T>();
+      Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
+      gru_grad.gateGrad = gate_grad_t.data<T>();
+      Tensor reset_hidden_prev_grad_t =
+          batch_reset_hidden_prev_grad.Slice(bstart, bend);
+      gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
+      if (n == 0) {
+        gru_value.prevOutValue = const_cast<T*>(h0_data);
+        if (h0_grad) {
+          T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace());
+          zero(context.device_context(), h0_grad, static_cast<T>(0.0));
+          gru_grad.prevOutGrad = h0_grad_data;
+        } else {
+          gru_grad.prevOutGrad = nullptr;
+        }
+      } else {
+        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
+        Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
+        gru_value.prevOutValue = hidden_prev_t.data<T>();
+        Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
+        gru_grad.prevOutGrad = hidden_prev_grad_t.data<T>();
+      }
+
+      math::GRUUnitGradFunctor<Place, T>::compute(
+          context.device_context(), gru_value, gru_grad, frame_size,
+          cur_batch_size,
+          math::ActiveType(context.Attr<std::string>("activation")),
+          math::ActiveType(context.Attr<std::string>("gate_activation")));
+    }
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      math::Batch2LoDTensorFunctor<Place, T> to_seq;
+      batch_gate_grad.set_lod(batch_gate->lod());
+      to_seq(context.device_context(), batch_gate_grad, *input_grad);
+    }
+    if (bias_grad) {
+      bias_grad->mutable_data<T>(context.GetPlace());
+      auto d_b = EigenMatrix<T>::From(*bias_grad);
+      auto d_g = EigenMatrix<T>::From(batch_gate_grad);
+      auto place = context.GetEigenDevice<Place>();
+      d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index 40cc177d0f19c2359626ef972e787a0b1c5580f8..90bc9f4f922e7aa09523bad8ffb3ef477dd89857 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -8,20 +8,24 @@ if(WITH_GPU)
     nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
     nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
     nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
+    nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
     nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
     nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context)
     nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
+    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
     cc_library(softmax SRCS softmax.cc DEPS operator)
     cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
     cc_library(pooling SRCS pooling.cc DEPS device_context)
+    cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
     cc_library(vol2col SRCS vol2col.cc DEPS device_context)
     cc_library(context_project SRCS context_project.cc DEPS device_context)
     cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
     cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
+    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
 endif()
 
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/operators/math/detail/gru_cpu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..51af140cf4d5e6581765bea00033fa53d383230d
--- /dev/null
+++ b/paddle/operators/math/detail/gru_cpu_kernel.h
@@ -0,0 +1,424 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/gru_compute.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+#ifndef __NVCC__
+
+template <class OpResetOutput, typename T>
+void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
+                                       T *gateValue, T *resetOutputValue,
+                                       T *prevOutputValue, int frameSize,
+                                       activation_mode_t active_gate) {
+  T rValueUpdateGate;
+  T rValueResetGate;
+  T rValueResetOutput;
+  T rPrevOut = 0;
+  T *updateGate = gateValue;
+  T *resetGate = gateValue + frameSize;
+
+  for (int i = 0; i < frameSize; i++) {
+    rValueUpdateGate = updateGate[i];
+    rValueResetGate = resetGate[i];
+    if (prevOutputValue) {
+      rPrevOut = prevOutputValue[i];
+    }
+
+    opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
+                  rValueResetOutput, active_gate);
+
+    updateGate[i] = rValueUpdateGate;
+    resetGate[i] = rValueResetGate;
+    resetOutputValue[i] = rValueResetOutput;
+  }
+}
+
+template <class OpFinalOutput, typename T>
+void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput,
+                                       T *gateValue, T *prevOutputValue,
+                                       T *outputValue, int frameSize,
+                                       activation_mode_t active_node) {
+  T rValueUpdateGate;
+  T rValueFrameState;
+  T rPrevOut = 0;
+  T rOutput;
+  T *updateGate = gateValue;
+  T *frameState = gateValue + frameSize * 2;
+
+  for (int i = 0; i < frameSize; i++) {
+    rValueUpdateGate = updateGate[i];
+    rValueFrameState = frameState[i];
+    if (prevOutputValue) {
+      rPrevOut = prevOutputValue[i];
+    }
+
+    opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
+                  active_node);
+
+    frameState[i] = rValueFrameState;
+    outputValue[i] = rOutput;
+  }
+}
+
+template <class OpResetOutput, typename T>
+void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue,
+                                     T *resetOutputValue, T *prevOutputValue,
+                                     int frameSize,
+                                     activation_mode_t active_gate) {
+#ifdef __AVX__
+  __m256 rValueUpdateGate;
+  __m256 rValueResetGate;
+  __m256 rValueResetOutput;
+  __m256 rPrevOut = _mm256_set1_ps(0.0f);
+  __m256 *updateGate = (__m256 *)gateValue;
+  __m256 *resetGate = (__m256 *)(gateValue + frameSize);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rValueUpdateGate = updateGate[i];
+    rValueResetGate = resetGate[i];
+    if (prevOutputValue) {
+      rPrevOut = ((__m256 *)prevOutputValue)[i];
+    }
+
+    opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
+                  rValueResetOutput, active_gate);
+
+    updateGate[i] = rValueUpdateGate;
+    resetGate[i] = rValueResetGate;
+    ((__m256 *)resetOutputValue)[i] = rValueResetOutput;
+  }
+#endif
+}
+
+template <class OpFinalOutput, typename T>
+void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue,
+                                     T *prevOutputValue, T *outputValue,
+                                     int frameSize,
+                                     activation_mode_t active_node) {
+#ifdef __AVX__
+  __m256 rValueUpdateGate;
+  __m256 rValueFrameState;
+  __m256 rPrevOut = _mm256_set1_ps(0.0f);
+  __m256 rOutput;
+  __m256 *updateGate = (__m256 *)gateValue;
+  __m256 *frameState = (__m256 *)(gateValue + frameSize * 2);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rValueUpdateGate = updateGate[i];
+    rValueFrameState = frameState[i];
+    if (prevOutputValue) {
+      rPrevOut = ((__m256 *)prevOutputValue)[i];
+    }
+
+    opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
+                  active_node);
+
+    frameState[i] = rValueFrameState;
+    ((__m256 *)outputValue)[i] = rOutput;
+  }
+#endif
+}
+
+template <class OpResetOutput, typename T>
+inline void forward_reset_output(OpResetOutput opResetOutput,
+                                 hl_gru_value<T> value, int frameSize,
+                                 int batchSize, activation_mode_t active_gate) {
+  for (int b = 0; b < batchSize; b++) {
+    if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_forward_reset_output(
+          opResetOutput, value.gateValue, value.resetOutputValue,
+          value.prevOutValue, frameSize, active_gate);
+    } else {
+      hl_naive_gru_forward_reset_output(
+          opResetOutput, value.gateValue, value.resetOutputValue,
+          value.prevOutValue, frameSize, active_gate);
+    }
+
+    value.gateValue += frameSize * 3;
+    value.resetOutputValue += frameSize;
+    if (value.prevOutValue) {
+      value.prevOutValue += frameSize;
+    }
+  }
+}
+
+template <class OpFinalOutput, typename T>
+inline void forward_final_output(OpFinalOutput opFinalOutput,
+                                 hl_gru_value<T> value, int frameSize,
+                                 int batchSize, activation_mode_t active_node) {
+  for (int b = 0; b < batchSize; b++) {
+    if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue,
+                                      value.prevOutValue, value.outputValue,
+                                      frameSize, active_node);
+    } else {
+      hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue,
+                                        value.prevOutValue, value.outputValue,
+                                        frameSize, active_node);
+    }
+
+    value.gateValue += frameSize * 3;
+    value.outputValue += frameSize;
+    if (value.prevOutValue) {
+      value.prevOutValue += frameSize;
+    }
+  }
+}
+
+template <class OpStateGrad, typename T>
+void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
+                                      T *gateGrad, T *prevOutValue,
+                                      T *prevOutGrad, T *outputGrad,
+                                      int frameSize,
+                                      activation_mode_t active_node) {
+  T rUpdateGateValue;
+  T rUpdateGateGrad;
+  T rFrameStateValue;
+  T rFrameStateGrad;
+  T rOutGrad;
+  T rPrevOutValue = 0;
+  T rPrevOutGrad = 0;
+  T *updateGateValue = gateValue;
+  T *updateGateGrad = gateGrad;
+  T *frameStateValue = gateValue + frameSize * 2;
+  T *frameStateGrad = gateGrad + frameSize * 2;
+
+  for (int i = 0; i < frameSize; i++) {
+    rUpdateGateValue = updateGateValue[i];
+    rFrameStateValue = frameStateValue[i];
+    rOutGrad = outputGrad[i];
+    if (prevOutValue) {
+      rPrevOutValue = prevOutValue[i];
+    }
+    if (prevOutGrad) {
+      rPrevOutGrad = prevOutGrad[i];
+    }
+
+    opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
+                rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
+                active_node);
+
+    updateGateGrad[i] = rUpdateGateGrad;
+    frameStateGrad[i] = rFrameStateGrad;
+    if (prevOutGrad) {
+      prevOutGrad[i] = rPrevOutGrad;
+    }
+  }
+}
+
+template <class OpResetGrad, typename T>
+void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
+                                      T *gateGrad, T *prevOutValue,
+                                      T *prevOutGrad, T *resetOutputGrad,
+                                      int frameSize,
+                                      activation_mode_t active_gate) {
+  T rUpdateGateValue;
+  T rUpdateGateGrad;
+  T rResetGateValue;
+  T rResetGateGrad;
+  T rResetOutputGrad = 0;
+  T rPrevOutValue = 0;
+  T rPrevOutGrad = 0;
+  T *updateGateValue = gateValue;
+  T *updateGateGrad = gateGrad;
+  T *resetGateValue = gateValue + frameSize;
+  T *resetGateGrad = gateGrad + frameSize;
+
+  for (int i = 0; i < frameSize; i++) {
+    rUpdateGateValue = updateGateValue[i];
+    rUpdateGateGrad = updateGateGrad[i];
+    rResetGateValue = resetGateValue[i];
+
+    if (prevOutValue && prevOutGrad) {
+      rResetOutputGrad = resetOutputGrad[i];
+    }
+    if (prevOutValue) {
+      rPrevOutValue = prevOutValue[i];
+    }
+    if (prevOutGrad) {
+      rPrevOutGrad = prevOutGrad[i];
+    }
+
+    opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
+                rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
+                active_gate);
+
+    updateGateGrad[i] = rUpdateGateGrad;
+    resetGateGrad[i] = rResetGateGrad;
+    if (prevOutGrad) {
+      prevOutGrad[i] = rPrevOutGrad;
+    }
+  }
+}
+
+template <class OpStateGrad, typename T>
+void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
+                                    T *gateGrad, T *prevOutValue,
+                                    T *prevOutGrad, T *outputGrad,
+                                    int frameSize,
+                                    activation_mode_t active_node) {
+#ifdef __AVX__
+  __m256 rUpdateGateValue;
+  __m256 rUpdateGateGrad;
+  __m256 rFrameStateValue;
+  __m256 rFrameStateGrad;
+  __m256 rOutGrad;
+  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
+  __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
+  __m256 *updateGateValue = (__m256 *)gateValue;
+  __m256 *updateGateGrad = (__m256 *)gateGrad;
+  __m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2);
+  __m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rUpdateGateValue = updateGateValue[i];
+    rFrameStateValue = frameStateValue[i];
+    rOutGrad = ((__m256 *)outputGrad)[i];
+    if (prevOutValue) {
+      rPrevOutValue = ((__m256 *)prevOutValue)[i];
+    }
+    if (prevOutGrad) {
+      rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+    }
+
+    opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
+                rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
+                active_node);
+
+    updateGateGrad[i] = rUpdateGateGrad;
+    frameStateGrad[i] = rFrameStateGrad;
+    if (prevOutGrad) {
+      ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+    }
+  }
+#endif
+}
+
+template <class OpResetGrad, typename T>
+void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
+                                    T *gateGrad, T *prevOutValue,
+                                    T *prevOutGrad, T *resetOutputGrad,
+                                    int frameSize,
+                                    activation_mode_t active_gate) {
+#ifdef __AVX__
+  __m256 rUpdateGateValue;
+  __m256 rUpdateGateGrad;
+  __m256 rResetGateValue;
+  __m256 rResetGateGrad;
+  __m256 rResetOutputGrad = _mm256_set1_ps(0.0f);
+  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
+  __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
+  __m256 *updateGateValue = (__m256 *)gateValue;
+  __m256 *updateGateGrad = (__m256 *)gateGrad;
+  __m256 *resetGateValue = (__m256 *)(gateValue + frameSize);
+  __m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rUpdateGateValue = updateGateValue[i];
+    rUpdateGateGrad = updateGateGrad[i];
+    rResetGateValue = resetGateValue[i];
+
+    if (prevOutValue && prevOutGrad) {
+      rResetOutputGrad = ((__m256 *)resetOutputGrad)[i];
+    }
+    if (prevOutValue) {
+      rPrevOutValue = ((__m256 *)prevOutValue)[i];
+    }
+    if (prevOutGrad) {
+      rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+    }
+
+    opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
+                rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
+                active_gate);
+
+    updateGateGrad[i] = rUpdateGateGrad;
+    resetGateGrad[i] = rResetGateGrad;
+    if (prevOutGrad) {
+      ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+    }
+  }
+#endif
+}
+
+template <class OpStateGrad, typename T>
+inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value<T> value,
+                                hl_gru_grad<T> grad, int frameSize,
+                                int batchSize, activation_mode_t active_node) {
+  for (int b = 0; b < batchSize; b++) {
+    if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_backward_state_grad(
+          opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+    } else {
+      hl_naive_gru_backward_state_grad(
+          opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+    }
+
+    value.gateValue += frameSize * 3;
+    if (value.prevOutValue) {
+      value.prevOutValue += frameSize;
+    }
+
+    grad.gateGrad += frameSize * 3;
+    grad.outputGrad += frameSize;
+    if (grad.prevOutGrad) {
+      grad.prevOutGrad += frameSize;
+    }
+  }
+}
+
+template <class OpResetGrad, typename T>
+inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value<T> value,
+                                hl_gru_grad<T> grad, int frameSize,
+                                int batchSize, activation_mode_t active_gate) {
+  for (int b = 0; b < batchSize; b++) {
+    if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_backward_reset_grad(
+          opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+    } else {
+      hl_naive_gru_backward_reset_grad(
+          opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+    }
+
+    value.gateValue += frameSize * 3;
+    if (value.prevOutValue) {
+      value.prevOutValue += frameSize;
+    }
+
+    grad.gateGrad += frameSize * 3;
+    grad.resetOutputGrad += frameSize;
+    if (grad.prevOutGrad) {
+      grad.prevOutGrad += frameSize;
+    }
+  }
+}
+
+#endif
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..6441c648b048422c110872a85aa8cb719f11a8d7
--- /dev/null
+++ b/paddle/operators/math/detail/gru_gpu_kernel.h
@@ -0,0 +1,203 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/platform/cuda_helper.h"
+#include "paddle/platform/device_context.h"
+
+#include <glog/logging.h>
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template <class OpResetOutput, bool isBatch, typename T>
+__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
+                                        T *gateValue, T *resetOutputValue,
+                                        T *prevOutputValue, int frameSize,
+                                        int batchSize,
+                                        activation_mode_t active_gate) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    resetOutputValue += batchIdx * frameSize;
+  }
+
+  T rPrevOut = 0;
+  T rValueResetOutput;
+  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
+  T rValueResetGate = gateValue[frameIdx + frameSize * 1];
+
+  if (prevOutputValue) {
+    if (isBatch) prevOutputValue += batchIdx * frameSize;
+    rPrevOut = prevOutputValue[frameIdx];
+  }
+
+  opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput,
+                active_gate);
+
+  gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
+  gateValue[frameIdx + frameSize * 1] = rValueResetGate;
+  resetOutputValue[frameIdx] = rValueResetOutput;
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template <class OpFinalOutput, bool isBatch, typename T>
+__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
+                                        T *gateValue, T *prevOutputValue,
+                                        T *outputValue, int frameSize,
+                                        int batchSize,
+                                        activation_mode_t active_node) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    outputValue += batchIdx * frameSize;
+  }
+
+  T rOutput;
+  T rPrevOut = 0;
+  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
+  T rValueFrameState = gateValue[frameIdx + frameSize * 2];
+
+  if (prevOutputValue) {
+    if (isBatch) prevOutputValue += batchIdx * frameSize;
+    rPrevOut = prevOutputValue[frameIdx];
+  }
+
+  opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
+                active_node);
+
+  gateValue[frameIdx + frameSize * 2] = rValueFrameState;
+  outputValue[frameIdx] = rOutput;
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template <class OpStateGrad, bool isBatch, typename T>
+__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue,
+                                       T *gateGrad, T *prevOutValue,
+                                       T *prevOutGrad, T *outputGrad,
+                                       int frameSize, int batchSize,
+                                       activation_mode_t active_node) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    gateGrad += batchIdx * 3 * frameSize;
+    outputGrad += batchIdx * frameSize;
+  }
+
+  T rUpdateGateGrad;
+  T rFrameStateGrad;
+  T rPrevOutValue = 0;
+  T rPrevOutGrad = 0;
+  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
+  T rFrameStateValue = gateValue[frameIdx + frameSize * 2];
+  T rOutGrad = outputGrad[frameIdx];
+
+  if (prevOutValue && prevOutGrad) {
+    if (isBatch) prevOutValue += batchIdx * frameSize;
+    rPrevOutValue = prevOutValue[frameIdx];
+
+    if (isBatch) prevOutGrad += batchIdx * frameSize;
+    rPrevOutGrad = prevOutGrad[frameIdx];
+  }
+
+  opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
+              rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
+              active_node);
+
+  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
+  gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
+  if (prevOutGrad) {
+    prevOutGrad[frameIdx] = rPrevOutGrad;
+  }
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template <class OpResetGrad, bool isBatch, typename T>
+__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue,
+                                       T *gateGrad, T *prevOutValue,
+                                       T *prevOutGrad, T *resetOutputGrad,
+                                       int frameSize, int batchSize,
+                                       activation_mode_t active_gate) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    gateGrad += batchIdx * 3 * frameSize;
+    resetOutputGrad += batchIdx * frameSize;
+  }
+
+  T rResetGateGrad;
+  T rPrevOutValue = 0;
+  T rPrevOutGrad = 0;
+  T rResetOutputGrad = 0;
+  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
+  T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0];
+  T rResetGateValue = gateValue[frameIdx + frameSize * 1];
+
+  if (prevOutValue && prevOutGrad) {
+    if (isBatch) prevOutValue += batchIdx * frameSize;
+    if (isBatch) prevOutGrad += batchIdx * frameSize;
+    rPrevOutValue = prevOutValue[frameIdx];
+    rPrevOutGrad = prevOutGrad[frameIdx];
+    rResetOutputGrad = resetOutputGrad[frameIdx];
+  }
+
+  opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
+              rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
+              active_gate);
+
+  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
+  gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
+  if (prevOutGrad) {
+    prevOutGrad[frameIdx] = rPrevOutGrad;
+  }
+}
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/operators/math/detail/gru_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a681d8d8bced72e1296f863489f6ccbc7913167
--- /dev/null
+++ b/paddle/operators/math/detail/gru_kernel.h
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/platform/hostdevice.h"
+
+#include <type_traits>
+
+// TODO(guosheng): refine code style in gru_kernel
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+namespace forward {
+
+template <typename T>
+class gru_resetOutput {
+ public:
+  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut,
+                             T &valueResetOutput, activation_mode_t actGate) {
+    valueUpdateGate = activation(valueUpdateGate, actGate);
+    valueResetGate = activation(valueResetGate, actGate);
+    valueResetOutput = prevOut * valueResetGate;
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate,
+                             __m256 &prevOut, __m256 &valueResetOutput,
+                             activation_mode_t actGate) {
+    valueUpdateGate = activation(valueUpdateGate, actGate);
+    valueResetGate = activation(valueResetGate, actGate);
+    valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate);
+  }
+#endif
+#endif
+};
+
+template <typename T>
+class gru_finalOutput {
+ public:
+  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut,
+                             T &valueOutput, activation_mode_t actInput) {
+    valueFrameState = activation(valueFrameState, actInput);
+    valueOutput = prevOut - (valueUpdateGate * prevOut) +
+                  (valueUpdateGate * valueFrameState);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState,
+                             __m256 &prevOut, __m256 &valueOutput,
+                             activation_mode_t actInput) {
+    valueFrameState = activation(valueFrameState, actInput);
+    valueOutput = _mm256_add_ps(
+        _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)),
+        _mm256_mul_ps(valueUpdateGate, valueFrameState));
+  }
+#endif
+#endif
+};
+}  // namespace forward
+
+namespace backward {
+
+template <typename T>
+class gru_stateGrad {
+ public:
+  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
+                             T &valueFrameState, T &gradFrameState,
+                             T &valuePrevOut, T &gradPrevOut, T &gradOutput,
+                             activation_mode_t actInput) {
+    gradUpdateGate = (gradOutput * valueFrameState);
+    gradUpdateGate -= (gradOutput * valuePrevOut);
+    gradPrevOut -= (gradOutput * valueUpdateGate);
+    gradPrevOut += gradOutput;
+    gradFrameState =
+        activation(gradOutput * valueUpdateGate, valueFrameState, actInput);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
+                             __m256 &valueFrameState, __m256 &gradFrameState,
+                             __m256 &valuePrevOut, __m256 &gradPrevOut,
+                             __m256 &gradOutput, activation_mode_t actInput) {
+    gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState);
+    gradUpdateGate =
+        _mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut));
+    gradPrevOut = _mm256_add_ps(
+        _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)),
+        gradOutput);
+    gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate),
+                                valueFrameState, actInput);
+  }
+#endif
+#endif
+};
+
+template <typename T>
+class gru_resetGrad {
+ public:
+  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
+                             T &valueResetGate, T &gradResetGate,
+                             T &valuePrevOut, T &gradPrevOut,
+                             T &gradResetOutput, activation_mode_t actGate) {
+    gradResetGate = (gradResetOutput * valuePrevOut);
+    gradPrevOut += (gradResetOutput * valueResetGate);
+    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
+    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
+                             __m256 &valueResetGate, __m256 &gradResetGate,
+                             __m256 &valuePrevOut, __m256 &gradPrevOut,
+                             __m256 &gradResetOutput,
+                             activation_mode_t actGate) {
+    gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut);
+    gradPrevOut = _mm256_add_ps(gradPrevOut,
+                                _mm256_mul_ps(gradResetOutput, valueResetGate));
+    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
+    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+  }
+#endif
+#endif
+};
+
+}  // namespace backward
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..125af449d3f700e24be5e4b7615c3b0e03fd4e5b
--- /dev/null
+++ b/paddle/operators/math/gru_compute.cc
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/detail/gru_cpu_kernel.h"
+#include "paddle/operators/math/detail/gru_kernel.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct GRUUnitFunctor<platform::CPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+#ifndef __NVCC__
+    if (value.prevOutValue) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, false, batchSize, frameSize * 2, frameSize, 1,
+          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
+          value.gateValue, frameSize * 3);
+    }
+
+    detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
+                                 frameSize, batchSize, active_gate);
+
+    if (value.prevOutValue) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, false, batchSize, frameSize, frameSize, 1,
+          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
+          value.gateValue + frameSize * 2, frameSize * 3);
+    }
+
+    detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
+                                 frameSize, batchSize, active_node);
+#endif
+  }
+};
+
+template <typename T>
+struct GRUUnitGradFunctor<platform::CPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
+                      int batchSize, activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+#ifndef __NVCC__
+    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
+                                grad, frameSize, batchSize, active_node);
+
+    if (value.prevOutValue && grad.prevOutGrad) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, true, batchSize, frameSize, frameSize, 1,
+          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
+          frameSize, 0, grad.resetOutputGrad, frameSize);
+
+      if (grad.stateWeightGrad) {
+        math::gemm<platform::CPUPlace, T>(
+            context, true, false, frameSize, frameSize, batchSize, 1,
+            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
+            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
+      }
+    }
+
+    detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
+                                grad, frameSize, batchSize, active_gate);
+
+    if (grad.prevOutGrad && value.prevOutValue) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, true, batchSize, frameSize, frameSize * 2, 1,
+          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
+          grad.prevOutGrad, frameSize);
+
+      if (grad.gateWeightGrad) {
+        math::gemm<platform::CPUPlace, T>(
+            context, true, false, frameSize, frameSize * 2, batchSize, 1,
+            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
+            grad.gateWeightGrad, frameSize * 2);
+      }
+    }
+#endif
+  }
+};
+
+template struct GRUUnitFunctor<platform::CPUPlace, float>;
+template struct GRUUnitFunctor<platform::CPUPlace, double>;
+template struct GRUUnitGradFunctor<platform::CPUPlace, float>;
+template struct GRUUnitGradFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7b9e54ac029f6aa00553338435684097d6d02b25
--- /dev/null
+++ b/paddle/operators/math/gru_compute.cu
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/gru_gpu_kernel.h"
+#include "paddle/operators/math/detail/gru_kernel.h"
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct GRUUnitFunctor<platform::GPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
+    dim3 threads;
+    dim3 grid;
+    if (batchSize == 1) {
+      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+      int frameBlocks = (frameSize + 1024 - 1) / 1024;
+      threads = dim3(framePerBlock, 1);
+      grid = dim3(frameBlocks, 1);
+    } else {
+      threads = dim3(32, 32);
+      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+    }
+
+    if (value.prevOutValue) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, false, batchSize, frameSize * 2, frameSize, 1,
+          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
+          value.gateValue, frameSize * 3);
+    }
+
+    if (batchSize == 1) {
+      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
+                                      /* isBatch= */ false,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_resetOutput<T>(), value.gateValue,
+          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
+          active_gate);
+    } else {
+      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
+                                      /* isBatch= */ true,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_resetOutput<T>(), value.gateValue,
+          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
+          active_gate);
+    }
+
+    if (value.prevOutValue) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, false, batchSize, frameSize, frameSize, 1,
+          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
+          value.gateValue + frameSize * 2, frameSize * 3);
+    }
+
+    if (batchSize == 1) {
+      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
+                                      /* isBatch= */ false,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_finalOutput<T>(), value.gateValue,
+          value.prevOutValue, value.outputValue, frameSize, batchSize,
+          active_node);
+    } else {
+      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
+                                      /* isBatch= */ true,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_finalOutput<T>(), value.gateValue,
+          value.prevOutValue, value.outputValue, frameSize, batchSize,
+          active_node);
+    }
+  }
+};
+
+template <typename T>
+struct GRUUnitGradFunctor<platform::GPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
+                      int batchSize, activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
+    dim3 threads;
+    dim3 grid;
+    if (batchSize == 1) {
+      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+      int frameBlocks = (frameSize + 1024 - 1) / 1024;
+      threads = dim3(framePerBlock, 1);
+      grid = dim3(frameBlocks, 1);
+    } else {
+      threads = dim3(32, 32);
+      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+    }
+
+    if (batchSize == 1) {
+      detail::KeGruBackwardStateGrad<
+          detail::backward::gru_stateGrad<T>,
+          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
+          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
+          batchSize, active_node);
+    } else {
+      detail::KeGruBackwardStateGrad<
+          detail::backward::gru_stateGrad<T>,
+          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
+          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
+          batchSize, active_node);
+    }
+
+    if (value.prevOutValue && grad.prevOutGrad) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, true, batchSize, frameSize, frameSize, 1,
+          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
+          frameSize, 0, grad.resetOutputGrad, frameSize);
+
+      if (grad.stateWeightGrad) {
+        math::gemm<platform::GPUPlace, T>(
+            context, true, false, frameSize, frameSize, batchSize, 1,
+            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
+            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
+      }
+    }
+
+    if (batchSize == 1) {
+      detail::KeGruBackwardResetGrad<
+          detail::backward::gru_resetGrad<T>,
+          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
+          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
+          batchSize, active_gate);
+    } else {
+      detail::KeGruBackwardResetGrad<
+          detail::backward::gru_resetGrad<T>,
+          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
+          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
+          batchSize, active_gate);
+    }
+
+    if (grad.prevOutGrad && value.prevOutValue) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, true, batchSize, frameSize, frameSize * 2, 1,
+          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
+          grad.prevOutGrad, frameSize);
+
+      if (grad.gateWeightGrad) {
+        math::gemm<platform::GPUPlace, T>(
+            context, true, false, frameSize, frameSize * 2, batchSize, 1,
+            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
+            grad.gateWeightGrad, frameSize * 2);
+      }
+    }
+  }
+};
+
+template struct GRUUnitFunctor<platform::GPUPlace, float>;
+template struct GRUUnitFunctor<platform::GPUPlace, double>;
+template struct GRUUnitGradFunctor<platform::GPUPlace, float>;
+template struct GRUUnitGradFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..1475fb38104f353857dfd968e46af98a6d52c52a
--- /dev/null
+++ b/paddle/operators/math/gru_compute.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// TODO(guosheng): refine code style in gru_compute
+template <typename T>
+struct hl_gru_value {
+  T *gateWeight;
+  T *stateWeight;
+  T *gateValue;
+  T *resetOutputValue;
+  T *outputValue;
+  T *prevOutValue;
+};
+
+template <typename T>
+struct hl_gru_grad {
+  T *gateWeightGrad;
+  T *stateWeightGrad;
+  T *gateGrad;
+  T *resetOutputGrad;
+  T *outputGrad;
+  T *prevOutGrad;
+};
+
+template <typename Place, typename T>
+struct GRUUnitFunctor {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate);
+};
+
+template <typename Place, typename T>
+struct GRUUnitGradFunctor {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
+                      int batchSize, activation_mode_t active_node,
+                      activation_mode_t active_gate);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_pooling.cc b/paddle/operators/math/sequence_pooling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5913c99fdb01100d0de44ab317124550fa626528
--- /dev/null
+++ b/paddle/operators/math/sequence_pooling.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence_pooling.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+class MaxSeqPoolFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    auto idx_dims = index->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), 1);
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
+
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+    int* max_index = index->data<int>();
+
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      for (int64_t k = 0; k < dim; ++k) {
+        out_data[i * dim + k] = in_data[starts[i] * dim + k];
+        max_index[i * dim + k] = starts[i];
+      }
+      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
+        for (int64_t k = 0; k < dim; ++k) {
+          if (in_data[j * dim + k] > out_data[i * dim + k]) {
+            out_data[i * dim + k] = in_data[j * dim + k];
+            max_index[i * dim + k] = j;
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class MaxSeqPoolGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad) {
+    auto og_dims = out_grad.dims();
+    auto ig_dims = in_grad->dims();
+    auto idx_dims = index.dims();
+    PADDLE_ENFORCE_GT(og_dims.size(), 1);
+    PADDLE_ENFORCE_GT(ig_dims.size(), 1);
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, og_dims);
+
+    const T* og_data = out_grad.data<T>();
+    const int* max_index = index.data<int>();
+    T* ig_data = in_grad->data<T>();
+
+    SetConstant<platform::CPUPlace, T> set_zero;
+    set_zero(context, in_grad, static_cast<T>(0.0));
+    int64_t num_seq = og_dims[0];
+    int64_t dim = out_grad.numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      for (int64_t j = 0; j < dim; ++j) {
+        int step_id = max_index[i * dim + j];
+        ig_data[step_id * dim + j] = og_data[i * dim + j];
+      }
+    }
+  }
+};
+
+template class MaxSeqPoolFunctor<platform::CPUPlace, float>;
+template class MaxSeqPoolFunctor<platform::CPUPlace, double>;
+template class MaxSeqPoolGradFunctor<platform::CPUPlace, float>;
+template class MaxSeqPoolGradFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5ed951402fecba66a8960f4d024bf3785dac51c7
--- /dev/null
+++ b/paddle/operators/math/sequence_pooling.cu
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence_pooling.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#define FLT_MAX __FLT_MAX__
+
+template <typename T>
+__global__ void KeMaxSequencePool(const T* input, const size_t* starts,
+                                  T* output, int* index, int64_t num_seq,
+                                  int64_t dim) {
+  int dim_idx = threadIdx.x;
+  int seq_id = blockIdx.x;
+  if (seq_id >= num_seq) return;
+  size_t start = starts[seq_id];
+  size_t end = starts[seq_id + 1];
+
+  for (int64_t i = dim_idx; i < dim; i += blockDim.x) {
+    T max_val = static_cast<T>(-FLT_MAX);
+    int max_id = -1;
+    for (size_t step_id = start; step_id < end; step_id++) {
+      if (max_val < input[step_id * dim + i]) {
+        max_val = input[step_id * dim + i];
+        max_id = step_id;
+      }
+    }
+    output[seq_id * dim + i] = max_val;
+    index[seq_id * dim + i] = max_id;
+  }
+}
+
+template <typename T>
+class MaxSeqPoolFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    auto idx_dims = index->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), static_cast<int64_t>(1));
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
+
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+    int* max_index = index->data<int>();
+
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+
+    dim3 threads(256, 1);
+    dim3 grid(num_seq, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
+        in_data, starts.data(), out_data, max_index, num_seq, dim);
+  }
+};
+
+template <typename T>
+__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index,
+                                      T* in_grad, int64_t num_seq,
+                                      int64_t dim) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int col_idx = idx % dim;
+  if (idx < num_seq * dim) {
+    int step_id = max_index[idx];
+    in_grad[step_id * dim + col_idx] = out_grad[idx];
+  }
+}
+
+template <typename T>
+class MaxSeqPoolGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad) {
+    auto og_dims = out_grad.dims();
+    auto idx_dims = index.dims();
+    auto ig_dims = in_grad->dims();
+    PADDLE_ENFORCE_GT(og_dims.size(), static_cast<int64_t>(1));
+    PADDLE_ENFORCE_GT(ig_dims.size(), static_cast<int64_t>(1));
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, og_dims);
+
+    const T* og_data = out_grad.data<T>();
+    const int* max_index = index.data<int>();
+    T* ig_data = in_grad->data<T>();
+
+    SetConstant<platform::GPUPlace, T> set_zero;
+    set_zero(context, in_grad, static_cast<T>(0.0));
+    int64_t num_seq = og_dims[0];
+    int64_t dim = out_grad.numel() / num_seq;
+
+    unsigned int blocks = (num_seq * dim + 128 - 1) / 128;
+    dim3 threads(128, 1);
+    dim3 grid(blocks, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    KeMaxSequencePoolGrad<T><<<grid, threads, 0, stream>>>(
+        og_data, max_index, ig_data, num_seq, dim);
+  }
+};
+
+template class MaxSeqPoolFunctor<platform::GPUPlace, float>;
+template class MaxSeqPoolFunctor<platform::GPUPlace, double>;
+template class MaxSeqPoolGradFunctor<platform::GPUPlace, float>;
+template class MaxSeqPoolGradFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_pooling.h b/paddle/operators/math/sequence_pooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..35dfe26de1a87a064410401244914d4e2a94176e
--- /dev/null
+++ b/paddle/operators/math/sequence_pooling.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#define FLT_MAX __FLT_MAX__
+
+template <typename Place, typename T>
+class MaxSeqPoolFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index);
+};
+
+template <typename Place, class T>
+class MaxSeqPoolGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc
index 63050a4ec2f16800ae6f8ee077fd0c1daa8eff87..710f280017fa5e188b187a3e91b27e2bedc65d10 100644
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -27,6 +27,11 @@ class SequencePoolOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SequencePoolOp should not be null.");
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    if (ctx->Attrs().Get<std::string>("pooltype") == "MAX") {
+      PADDLE_ENFORCE(ctx->HasOutput("MaxIndex"),
+                     "Output(MaxIndex) of SequencePoolOp should not be null.");
+      ctx->SetOutputDim("MaxIndex", ctx->GetInputDim("X"));
+    }
   }
 };
 
@@ -35,10 +40,14 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
   SequencePoolOpMaker(framework::OpProto* proto,
                       framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(LoDTensor), the variable-length input of SequencePoolOp");
+    AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
     AddOutput("Out",
-              "(Tensor), output of SequencePoolOp, which does not contain LoD "
+              "(Tensor) The output of SequencePoolOp does not contain LoD "
               "infomation.");
+    AddOutput("MaxIndex",
+              "(Tensor<int>) This tensor is used for the sequence max-pooling "
+              "to record the max indexes.")
+        .AsIntermediate();
     AddAttr<std::string>(
         "pooltype",
         "(int, default AVERAGE) the pooling pooltype of SequencePoolOp.")
@@ -96,6 +105,12 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index e0e0493fe0ef7e1963ce5c2e3f37c164a605809b..2b8a25c2414c20efaffedfc8603697b3a104634f 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence_pooling.h"
 
 namespace paddle {
 namespace operators {
@@ -34,7 +35,7 @@ class SequencePoolKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
+    auto* out = context.Output<Tensor>("Out");
     std::string pooltype = context.Attr<std::string>("pooltype");
 
     auto dims = in->dims();
@@ -53,6 +54,16 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     auto lod_level_0 = lod[0];
 
     out->mutable_data<T>(context.GetPlace());
+
+    if (pooltype == "MAX") {
+      math::MaxSeqPoolFunctor<Place, T> max_pool;
+      auto* index = context.Output<Tensor>("MaxIndex");
+      index->Resize({dims});
+      index->mutable_data<int>(context.GetPlace());
+      max_pool(context.device_context(), *in, out, index);
+      return;
+    }
+
     auto place = context.GetEigenDevice<Place>();
     for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
       Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
@@ -69,8 +80,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
       } else if (pooltype == "SQRT") {
         out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                               std::sqrt(static_cast<T>(h));
-      } else if (pooltype == "MAX") {
-        out_e.device(place) = in_e.maximum(Eigen::array<int, 1>({{0}}));
       } else if (pooltype == "LAST") {
         out_e.device(place) = in_e.chip(h - 1, 0);
       } else if (pooltype == "FIRST") {
@@ -87,8 +96,8 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
+    auto* out_g = context.Input<Tensor>(framework::GradVarName("Out"));
     auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
     std::string pooltype = context.Attr<std::string>("pooltype");
 
     auto dims = in->dims();
@@ -96,6 +105,14 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
     int64_t w = in->numel() / dims[0];
 
     in_g->mutable_data<T>(context.GetPlace());
+
+    if (pooltype == "MAX") {
+      math::MaxSeqPoolGradFunctor<Place, T> max_pool_grad;
+      auto* index = context.Input<Tensor>("MaxIndex");
+      max_pool_grad(context.device_context(), *out_g, *index, in_g);
+      return;
+    }
+
     if (pooltype == "LAST" || pooltype == "FIRST") {
       // set X@Grad be zero at first when pooltype is LAST/FIRST
       math::SetConstant<Place, T> functor;
@@ -118,20 +135,6 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
       } else if (pooltype == "SQRT") {
         in_g_e.device(place) =
             (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
-      } else if (pooltype == "MAX") {
-        auto in_t =
-            in->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
-        Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
-            in_t_map(in_t.data<T>(), h, w);
-        int row_id;
-        Eigen::array<int, 2> extents{{1, 1}};
-        for (int col_id = 0; col_id < w; col_id++) {
-          in_t_map.col(col_id).maxCoeff(&row_id);
-          Eigen::array<int, 2> in_offsets{{row_id, col_id}};
-          Eigen::array<int, 2> out_offsets{{0, col_id}};
-          in_g_e.slice(in_offsets, extents).device(place) =
-              out_g_e.slice(out_offsets, extents);
-        }
       } else if (pooltype == "LAST") {
         in_g_e.chip(h - 1, 0).device(place) = out_g_e;
       } else if (pooltype == "FIRST") {
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc
index a006e0a595057d019b443d22cea4bdf171a6ee0b..c6b94f5cc947ccb86315fd9058b8c57d1a996927 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -4,13 +4,13 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/softmax_with_cross_entropy_op.h"
 #include <paddle/function/TensorType.h>
@@ -30,12 +30,10 @@ class SoftmaxWithCrossEntropyOpMaker
              "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
              "and K is the class number.");
     AddInput("Label",
-             "(Tensor, default: Tensor<int>), The ground truth which is a 2-D "
-             "tensor. "
-             "If softLabel is set to false, Label is a Tensor<int> with shape "
-             "[N x 1]."
-             "If softLabel is set to true, Label is a Tensor<float/double> "
-             "with shape [N x K].");
+             "(Tensor) The ground truth which is a 2-D tensor. If soft_label "
+             "is set to false, Label is a Tensor<int64> with shape [N x 1]. If "
+             "soft_label is set to true, Label is a Tensor<float/double> with "
+             "shape [N x K].");
     AddOutput(
         "Softmax",
         "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N x K]. "
@@ -62,7 +60,7 @@ Because this operator performs a softmax on logits internally, it expects
 unscaled logits. This operator should not be used with the output of
 softmax operator since that would produce incorrect results.
 
-When the attribute softLabel is set false, this operators expects mutually
+When the attribute soft_label is set false, this operators expects mutually
 exclusive hard labels, each sample in a batch is in exactly one class with a
 probability of 1.0. Each sample in the batch will have a single label.
 
@@ -198,6 +196,8 @@ REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
 REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
                   ops::SoftmaxWithCrossEntropyOpGrad);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
-                       ops::SoftmaxWithCrossEntropyKernel<float>);
+                       ops::SoftmaxWithCrossEntropyKernel<float>,
+                       ops::SoftmaxWithCrossEntropyKernel<double>);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad,
-                       ops::SoftmaxWithCrossEntropyGradKernel<float>);
+                       ops::SoftmaxWithCrossEntropyGradKernel<float>,
+                       ops::SoftmaxWithCrossEntropyGradKernel<double>);
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu
index 7602918bb39312db3c4d1a4064801712ef94ec72..b1faddac3fd21aaf817caf9d3e57e664f4e0e2d5 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
@@ -4,13 +4,13 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 
@@ -24,7 +24,7 @@ using Tensor = framework::Tensor;
 namespace {
 template <typename T>
 __global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad,
-                                 const int* labels, const int batch_size,
+                                 const int64_t* labels, const int batch_size,
                                  const int class_num) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int sample_idx = tid / class_num;
@@ -50,7 +50,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
   int ids = blockIdx.x * blockDim.x + threadIdx.x;
   if (ids < batch_size * class_num) {
     int row_ids = ids / class_num;
-    logit_grad[ids] = logit_grad[ids] * (loss_grad[row_ids] - labels[ids]);
+    logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]);
   }
 }
 }  // namespace
@@ -104,7 +104,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
                               .stream()>>>(logit_grad_data, loss_grad_data,
                                            label_data, batch_size, class_num);
     } else {
-      const int* label_data = labels->data<int>();
+      const int64_t* label_data = labels->data<int64_t>();
       CrossEntropyGrad<T><<<
           grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
                               context.device_context())
@@ -119,6 +119,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy,
-                       ops::SoftmaxWithCrossEntropyCUDAKernel<float>);
+                       ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
+                       ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
 REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad,
-                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>);
+                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
+                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h
index 7f3f9e23aa9455437cfa893363b3e59a0699dbea..c4ab3f74b4b07d13957d99e01aa4868fac719f61 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
@@ -4,13 +4,13 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
@@ -60,25 +60,25 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
     logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
 
     const int class_num = logit_grad->dims()[1];
+    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
+    auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
+
     if (context.Attr<bool>("soft_label")) {
-      auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
-      auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
       auto lbl_mat = EigenMatrix<T>::From(*labels);
-
       logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
-          logit_grad_mat *
-          (out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) -
-           lbl_mat);
+          out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) *
+          (logit_grad_mat - lbl_mat);
     } else {
+      logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
+          logit_grad_mat *
+          out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num));
+
       const int batch_size = logit_grad->dims()[0];
-      const int* label_data = labels->data<int>();
-      const T* out_grad_data = out_grad->data<T>();
+      const int64_t* label_data = labels->data<int64_t>();
       T* logit_grad_data = logit_grad->data<T>();
-
+      const T* out_grad_data = out_grad->data<T>();
       for (int i = 0; i < batch_size; ++i) {
-        int index = i * class_num + label_data[i];
-        logit_grad_data[index] =
-            out_grad_data[i] * (logit_grad_data[index] - 1.);
+        logit_grad_data[i * class_num + label_data[i]] -= out_grad_data[i];
       }
     }
   }
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 5bdf8c833522564e6b1027ca5dad8c0bb481cdc4..73da7dfa6f997be5d34bbe9fb99254aa6e6b28f2 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -2,171 +2,184 @@
 
 set -xe
 
-# Set BASE_IMAGE according to env variables
-if [[ ${WITH_GPU} == "ON" ]]; then
-  BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
-else
-  BASE_IMAGE="ubuntu:16.04"
-fi
-
-DOCKERFILE_GPU_ENV=""
-DOCKERFILE_CUDNN_DSO=""
-if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-    DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
-    DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
-fi
-
-mkdir -p /paddle/build
-cd /paddle/build
-
-# build script will not fail if *.deb does not exist
-rm *.deb 2>/dev/null || true
-# delete previous built whl packages
-rm -rf /paddle/paddle/dist 2>/dev/null || true
-
-cat <<EOF
-========================================
-Configuring cmake in /paddle/build ...
-      -DCMAKE_BUILD_TYPE=Release
-      -DWITH_DOC=OFF
-      -DWITH_GPU=${WITH_GPU:-OFF}
-      -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
-      -DWITH_MKLML=${WITH_MKLML:-ON}
-      -DWITH_AVX=${WITH_AVX:-OFF}
-      -DWITH_GOLANG=${WITH_GOLANG:-ON}
-      -DWITH_SWIG_PY=ON
-      -DWITH_C_API=${WITH_C_API:-OFF}
-      -DWITH_PYTHON=${WITH_PYTHON:-ON}
-      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-      -DCUDNN_ROOT=/usr/
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-      -DWITH_TESTING=${WITH_TESTING:-ON}
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-========================================
-EOF
 
-# Disable UNITTEST_USE_VIRTUALENV in docker because
-# docker environment is fully controlled by this script.
-# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
-cmake .. \
-      -DCMAKE_BUILD_TYPE=Release \
-      -DWITH_DOC=OFF \
-      -DWITH_GPU=${WITH_GPU:-OFF} \
-      -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
-      -DWITH_MKLML=${WITH_MKLML:-ON} \
-      -DWITH_AVX=${WITH_AVX:-OFF} \
-      -DWITH_GOLANG=${WITH_GOLANG:-ON} \
-      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-      -DWITH_C_API=${WITH_C_API:-OFF} \
-      -DWITH_PYTHON=${WITH_PYTHON:-ON} \
-      -DCUDNN_ROOT=/usr/ \
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-      -DWITH_TESTING=${WITH_TESTING:-ON} \
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-
-cat <<EOF
-============================================
-Building in /paddle/build ...
-============================================
-EOF
-make -j `nproc`
+function cmake_gen() {
+    # Set BASE_IMAGE according to env variables
+    if [[ ${WITH_GPU} == "ON" ]]; then
+    BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
+    else
+    BASE_IMAGE="ubuntu:16.04"
+    fi
+
+    DOCKERFILE_GPU_ENV=""
+    DOCKERFILE_CUDNN_DSO=""
+    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
+        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
+        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
+    fi
+
+    mkdir -p /paddle/build
+    cd /paddle/build
+
+    # build script will not fail if *.deb does not exist
+    rm *.deb 2>/dev/null || true
+    # delete previous built whl packages
+    rm -rf /paddle/paddle/dist 2>/dev/null || true
 
-if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
-cat <<EOF
-========================================
-Running unit tests ...
-========================================
+    cat <<EOF
+    ========================================
+    Configuring cmake in /paddle/build ...
+        -DCMAKE_BUILD_TYPE=Release
+        -DWITH_DOC=OFF
+        -DWITH_GPU=${WITH_GPU:-OFF}
+        -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
+        -DWITH_MKLML=${WITH_MKLML:-ON}
+        -DWITH_AVX=${WITH_AVX:-OFF}
+        -DWITH_GOLANG=${WITH_GOLANG:-ON}
+        -DWITH_SWIG_PY=ON
+        -DWITH_C_API=${WITH_C_API:-OFF}
+        -DWITH_PYTHON=${WITH_PYTHON:-ON}
+        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
+        -DCUDNN_ROOT=/usr/
+        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
+        -DWITH_TESTING=${WITH_TESTING:-ON}
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+    ========================================
 EOF
-    ctest --output-on-failure
-    # make install should also be test when unittest
-    make install -j `nproc`
-    pip install /usr/local/opt/paddle/share/wheels/*.whl
-    paddle version
-fi
-
 
-if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
+    # Disable UNITTEST_USE_VIRTUALENV in docker because
+    # docker environment is fully controlled by this script.
+    # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
+    cmake .. \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DWITH_DOC=OFF \
+        -DWITH_GPU=${WITH_GPU:-OFF} \
+        -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
+        -DWITH_MKLML=${WITH_MKLML:-ON} \
+        -DWITH_AVX=${WITH_AVX:-OFF} \
+        -DWITH_GOLANG=${WITH_GOLANG:-ON} \
+        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
+        -DWITH_C_API=${WITH_C_API:-OFF} \
+        -DWITH_PYTHON=${WITH_PYTHON:-ON} \
+        -DCUDNN_ROOT=/usr/ \
+        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
+        -DWITH_TESTING=${WITH_TESTING:-ON} \
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+}
+
+function run_build() {
     cat <<EOF
-========================================
-Building documentation ...
-   In /paddle/build_doc
-========================================
+    ============================================
+    Building in /paddle/build ...
+    ============================================
 EOF
-    mkdir -p /paddle/build_doc
-    pushd /paddle/build_doc
-    cmake .. \
-          -DWITH_DOC=ON \
-          -DWITH_GPU=OFF \
-          -DWITH_AVX=${WITH_AVX:-ON} \
-          -DWITH_SWIG_PY=ON \
-          -DWITH_STYLE_CHECK=OFF
-    make -j `nproc` gen_proto_py
-    make -j `nproc` paddle_docs paddle_docs_cn
-    popd
-fi
-
-
-if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
+    make -j `nproc`
+}
+
+function run_test() {
+    if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
     cat <<EOF
-========================================
-Converting C++ source code into HTML ...
-========================================
+    ========================================
+    Running unit tests ...
+    ========================================
 EOF
-    export WOBOQ_OUT=/paddle/build/woboq_out
-    mkdir -p $WOBOQ_OUT
-    cp -rv /woboq/data $WOBOQ_OUT/../data
-    /woboq/generator/codebrowser_generator \
-        -b /paddle/build \
-        -a \
-        -o $WOBOQ_OUT \
-        -p paddle:/paddle
-    /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-fi
-
-cat <<EOF
-========================================
-Generate /paddle/build/Dockerfile ...
-========================================
+        ctest --output-on-failure
+        # make install should also be test when unittest
+        make install -j `nproc`
+        pip install /usr/local/opt/paddle/share/wheels/*.whl
+        paddle version
+    fi
+}
+
+
+function gen_docs() {
+    if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
+        cat <<EOF
+    ========================================
+    Building documentation ...
+    In /paddle/build_doc
+    ========================================
 EOF
+        mkdir -p /paddle/build_doc
+        pushd /paddle/build_doc
+        cmake .. \
+            -DWITH_DOC=ON \
+            -DWITH_GPU=OFF \
+            -DWITH_AVX=${WITH_AVX:-ON} \
+            -DWITH_SWIG_PY=ON \
+            -DWITH_STYLE_CHECK=OFF
+        make -j `nproc` gen_proto_py
+        make -j `nproc` paddle_docs paddle_docs_cn
+        popd
+    fi
+
+
+    if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
+        cat <<EOF
+    ========================================
+    Converting C++ source code into HTML ...
+    ========================================
+EOF
+        export WOBOQ_OUT=/paddle/build/woboq_out
+        mkdir -p $WOBOQ_OUT
+        cp -rv /woboq/data $WOBOQ_OUT/../data
+        /woboq/generator/codebrowser_generator \
+            -b /paddle/build \
+            -a \
+            -o $WOBOQ_OUT \
+            -p paddle:/paddle
+        /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
+    fi
+}
+
 
-cat > /paddle/build/Dockerfile <<EOF
-FROM ${BASE_IMAGE}
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-ENV HOME /root
+function gen_dockerfile() {
+
+    cat <<EOF
+    ========================================
+    Generate /paddle/build/Dockerfile ...
+    ========================================
 EOF
 
-if [[ -n ${APT_MIRROR} ]]; then
-cat >> /paddle/build/Dockerfile <<EOF
-RUN sed -i '${APT_MIRROR}' /etc/apt/sources.list
+    cat > /paddle/build/Dockerfile <<EOF
+    FROM ${BASE_IMAGE}
+    MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+    ENV HOME /root
 EOF
-fi
-
-if [[ ${WITH_GPU} == "ON"  ]]; then
-  NCCL_DEPS="apt-get install -y libnccl-dev &&"
-else
-  NCCL_DEPS="" 
-fi
-
-cat >> /paddle/build/Dockerfile <<EOF
-ADD python/dist/*.whl /
-# run paddle version to install python packages first
-RUN apt-get update &&\
-    ${NCCL_DEPS}\
-    apt-get install -y wget python-pip && pip install -U pip && \
-    pip install /*.whl; apt-get install -f -y && \
-    apt-get clean -y && \
-    rm -f /*.whl && \
-    paddle version && \
-    ldconfig
-${DOCKERFILE_CUDNN_DSO}
-${DOCKERFILE_GPU_ENV}
-ADD go/cmd/pserver/pserver /usr/bin/
-ADD go/cmd/master/master /usr/bin/
-ADD paddle/pybind/print_operators_doc /usr/bin/
-# default command shows the paddle version and exit
-CMD ["paddle", "version"]
+
+    if [[ ${WITH_GPU} == "ON"  ]]; then
+        NCCL_DEPS="apt-get install -y libnccl-dev &&"
+    else
+        NCCL_DEPS="" 
+    fi
+
+    cat >> /paddle/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
+    # run paddle version to install python packages first
+    RUN apt-get update &&\
+        ${NCCL_DEPS}\
+        apt-get install -y wget python-pip && pip install -U pip && \
+        pip install /*.whl; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f /*.whl && \
+        paddle version && \
+        ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_GPU_ENV}
+    ADD go/cmd/pserver/pserver /usr/bin/
+    ADD go/cmd/master/master /usr/bin/
+    # default command shows the paddle version and exit
+    CMD ["paddle", "version"]
 EOF
+}
 
 set +xe
+
+cmake_gen
+run_build
+run_test
+gen_docs
+gen_dockerfile
+
 printf "If you need to install PaddlePaddle in develop docker image,"
 printf "please make install or pip install build/python/dist/*.whl.\n"
diff --git a/paddle/utils/Excepts.h b/paddle/utils/Excepts.h
index 0add66da7464293795927431daf0e90359f40b52..5c2c504f53a586f2991ccfae891991465fdb39b6 100644
--- a/paddle/utils/Excepts.h
+++ b/paddle/utils/Excepts.h
@@ -17,8 +17,7 @@ limitations under the License. */
 
 #include <fenv.h>
 
-#if (defined(__APPLE__) || defined(__OSX__)) && !defined(__arm__) && \
-    !defined(__aarch64__)
+#if defined(__APPLE__) || defined(__OSX__)
 
 int fegetexcept(void);
 int feenableexcept(unsigned int excepts);
diff --git a/paddle/utils/arch/osx/Excepts.cpp b/paddle/utils/arch/osx/Excepts.cpp
index 42ecaa06d256c9d259a20c648626605d77ce0308..ac444615786fa9f89f96504a31b2289eae7bb643 100644
--- a/paddle/utils/arch/osx/Excepts.cpp
+++ b/paddle/utils/arch/osx/Excepts.cpp
@@ -14,9 +14,13 @@ limitations under the License. */
 
 #include "paddle/utils/Excepts.h"
 
-#if (defined(__APPLE__) || defined(__OSX__)) && !defined(__arm__) && \
-    !defined(__aarch64__)
-
+#if defined(__APPLE__) || defined(__OSX__)
+#if defined(__arm__) || defined(__arm64__)
+// TODO(liuyiqun): implement the arm version
+int fegetexcept(void) { return -1; }
+int feenableexcept(unsigned int excepts) { return -1; }
+int fedisableexcept(unsigned int excepts) { return -1; }
+#else
 int fegetexcept(void) {
   static fenv_t fenv;
   return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
@@ -49,5 +53,5 @@ int fedisableexcept(unsigned int excepts) {
 
   return (fesetenv(&fenv) ? -1 : old_excepts);
 }
-
+#endif
 #endif
diff --git a/paddle/utils/tests/test_StringUtils.cpp b/paddle/utils/tests/test_StringUtils.cpp
index fdc914d1bcc3c74e0f05ef475069abc315bdc306..248f58a7f26e26e82b55110930964cee04fb558b 100644
--- a/paddle/utils/tests/test_StringUtils.cpp
+++ b/paddle/utils/tests/test_StringUtils.cpp
@@ -18,6 +18,6 @@ limitations under the License. */
 
 TEST(StringUtil, to) {
   ASSERT_NEAR(paddle::str::to<double>("12.45"), 12.45, 1e-5);
-  ASSERT_DEATH(paddle::str::to<double>("12.45x23"), ".*");
-  ASSERT_DEATH(paddle::str::to<int>(""), ".*");
+  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<double>("12.45x23"), ".*");
+  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<int>(""), ".*");
 }
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 7bd6d59b0096c23bb791b9b50702130057628879..32578ad7799c0a276972ccef7770c2eae8438069 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -44,6 +44,7 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/pad
 
 
 add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+    COMMAND touch stub.cc
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 6e8ac8838bf3012969bf4f597cdf4430693fe313..169e201046a0d7b8c3e85f60946d8c1c762c88f4 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -143,6 +143,7 @@ __all__ = [
     'scale_shift_layer',
     'img_conv3d_layer',
     'resize_layer',
+    'sub_seq_layer',
 ]
 
 
@@ -252,6 +253,7 @@ class LayerType(object):
     SCALE_SHIFT_LAYER = 'scale_shift'
 
     RESIZE = 'resize'
+    SUB_SEQ_LAYER = 'subseq'
 
     @staticmethod
     def is_layer_type(type_name):
@@ -6980,3 +6982,58 @@ def resize_layer(input, size, name=None):
     """
     Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
     return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size)
+
+
+@wrap_act_default(act=LinearActivation())
+@wrap_name_default('sub_seq')
+def sub_seq_layer(input, offsets, sizes, act=None, bias_attr=None, name=None):
+    """
+    sub_seq_layer will return sub-sequences from the input sequences. For each
+    sequence in the input sequence layer, sub_seq_layer will slice it by given
+    offset and size. Please notice that, number of offset value and size value
+    both are equal to the number of sequence in the input layer.
+
+    .. code-block:: python
+
+        sub_seq = sub_seq_layer(input=input_seq, offsets=offsets, sizes=sizes)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer, which should be sequence.
+    :type input: LayerOutput
+    :param offsets: offset indices to slice the input sequence, which should be
+                    sequence type.
+    :type offsets: LayerOutput
+    :param sizes: sizes of the sub-sequences, which should be sequence type.
+    :type sizes: LayerOutput
+    :param act: Layer activation, default is LinearActivation
+    :type act: BaseActivation.
+    :param bias_attr: The Bias Attribute. If the parameter is set to
+                      False or something not type of ParameterAttribute,
+                      no bias is defined. If the parameter is set to
+                      True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of sub_seq_layer layer must be a PaddlePaddle layer.')
+    assert isinstance(offsets, LayerOutput), (
+        'The offset indices for sub_seq_layer, '
+        'must be a PaddlePaddle layer.')
+    assert isinstance(sizes, LayerOutput), (
+        'The sizes of sub-sequences, must be a PaddlePaddle layer.')
+
+    Layer(
+        name=name,
+        type=LayerType.SUB_SEQ_LAYER,
+        inputs=[input.name, offsets.name, sizes.name],
+        active_type=act.name,
+        bias=ParamAttr.to_bias(bias_attr))
+
+    return LayerOutput(
+        name,
+        LayerType.SUB_SEQ_LAYER,
+        parents=[input, offsets, sizes],
+        size=input.size)
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index c3495ee110bfaf91a47637a52e88b3bb56dce7a9..c3cd4cf8c32e20f3ef86305489fc415397dec1b8 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -116,7 +116,7 @@ class AdamOptimizer(BaseSGDOptimizer):
 
         m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
         v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
 
     :param beta1: the :math:`\\beta_1` in equation.
     :type beta1: float
diff --git a/python/paddle/utils/merge_model.py b/python/paddle/utils/merge_model.py
index 48e5087cc281bd3a3d0b4a403372456ebbf39c62..421e953d2775f145800cf7179ec644697a265060 100644
--- a/python/paddle/utils/merge_model.py
+++ b/python/paddle/utils/merge_model.py
@@ -23,32 +23,32 @@ from paddle.v2.topology import Topology
 
 
 def merge_v2_model(net, param_file, output_file):
-    '''Integrate the model config and model parameters into one file.
-    
+    '''Merge the model config and parameters into one file.
+
     The model configuration file describes the model structure which
     ends with .py. The parameters file stores the parameters of the model
     which ends with .tar.gz.
-    
-    @param  net            The output layer of the network.
-    @param  param_file     Path of the model parameters(.tar.gz) which is stored by v2 api.
+
+    @param  net            The output layer of the network for inference.
+    @param  param_file     Path of the parameters (.tar.gz) which is stored by v2 api.
     @param  output_file    Path of the merged file which will be generated.
-    
+
     Usage:
 
-        from paddle.util.merge_model import merge_v2_model
+        from paddle.utils.merge_model import merge_v2_model
         # import your network configuration
-        from mobilenet import mobile_net
-        
-        net = mobile_net(3*224*224, 102)
+        from example_net import net_conf
+
+        net = net_conf(is_predict=True)
         param_file = './param_pass_00000.tar.gz'
         output_file = './output.paddle'
-        
+
         merge_v2_model(net, param_file, output_file)
 
     '''
 
     assert isinstance(net, LayerOutput), \
-            "The net should be the output of the network"
+            "The net should be the output of the network for inference"
     assert os.path.exists(param_file), \
             "The model parameters file %s does not exists " % (param_file)
 
diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py
index 0739b2d2e2964f7f3565baf068a80c89938d193a..b7e468fb51c19d33db1a68df641e3aff20f04f45 100644
--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/framework/layers.py
@@ -372,11 +372,13 @@ def sequence_pool(input, pool_type, **kwargs):
     helper = LayerHelper('sequence_pool', input=input, **kwargs)
     dtype = helper.input_dtype()
     pool_out = helper.create_tmp_variable(dtype)
+    max_index = helper.create_tmp_variable(dtype)
 
     helper.append_op(
         type="sequence_pool",
-        inputs={"X": [input]},
-        outputs={"Out": [pool_out]},
+        inputs={"X": input},
+        outputs={"Out": pool_out,
+                 "MaxIndex": max_index},
         attrs={"pooltype": pool_type.upper()})
 
     return pool_out
diff --git a/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py b/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
index 999a0bdc629010d96a8db31b317ba7a65bf35773..54349c018c4a53b8767d6cd4f94d99c719dc0237 100644
--- a/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
@@ -58,36 +58,37 @@ class TestConv2dTransposeOp(OpTest):
         print 'check output here for', self.op_type
         self.check_output()
 
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.dilations = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3]
-
-    def init_op_type(self):
-        self.op_type = "conv2d_transpose"
-
     def test_check_grad_no_input(self):
         self.check_grad(
             ['Filter'],
             'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.02,
             no_grad_set=set(['Input']))
 
     def test_check_grad_no_filter(self):
         self.check_grad(
             ['Input'],
             'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.02,
             no_grad_set=set(['Filter']))
 
     def test_check_grad(self):
         self.check_grad(
-            set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
+
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv2d_transpose"
 
 
+# ------------ test_cudnn ------------
 class TestCudnn(TestConv2dTransposeOp):
     def init_op_type(self):
         self.op_type = "conv2d_transpose_cudnn"
diff --git a/python/paddle/v2/framework/tests/test_conv3d_transpose_op.py b/python/paddle/v2/framework/tests/test_conv3d_transpose_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..132fe7931438a30cf02e4ad2894c0838e48ffc9f
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_conv3d_transpose_op.py
@@ -0,0 +1,97 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
+    # [2, 3, 5, 5, 5]
+    in_n, in_c, in_d, in_h, in_w = input_.shape
+    # [3, 6, 3, 3, 3]
+    f_c, out_c, f_d, f_h, f_w = filter_.shape
+    assert in_c == f_c
+
+    stride, pad = conv3dtranspose_param['stride'], conv3dtranspose_param['pad']
+    out_d = (in_d - 1) * stride[0] + f_d
+    out_h = (in_h - 1) * stride[1] + f_h
+    out_w = (in_w - 1) * stride[2] + f_w
+
+    out = np.zeros((in_n, out_c, out_d, out_h, out_w))
+
+    for n in range(in_n):
+        for d in range(in_d):
+            for i in range(in_h):
+                for j in range(in_w):
+                    input_masked = input_[n, :, d, i, j]  # (c)
+                    input_masked = np.reshape(input_masked, (in_c, 1, 1, 1))
+                    input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
+
+                    for k in range(out_c):
+                        tmp_out = np.sum(input_masked * filter_[:, k, :, :, :],
+                                         axis=0)
+                        d1, d2 = d * stride[0], d * stride[0] + f_d
+                        i1, i2 = i * stride[1], i * stride[1] + f_h
+                        j1, j2 = j * stride[2], j * stride[2] + f_w
+                        out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out
+
+    return out
+
+
+class TestConv3dTransposeOp(OpTest):
+    def setUp(self):
+        # init as conv transpose
+        self.init_op_type()
+
+        # [2, 3, 5, 5, 5] -> kernel [3, 6, 3, 3, 3] -> output [2, 6, 7, 7, 7]
+        self.init_test_case()
+
+        conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad}
+        input_ = np.random.random(self.input_size).astype("float32")
+        filter_ = np.random.random(self.filter_size).astype("float32")
+        output = conv3dtranspose_forward_naive(
+            input_, filter_, conv3dtranspose_param).astype("float32")
+        # print 'deconv output py', output, output.shape
+
+        self.inputs = {'Input': input_, 'Filter': filter_}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            # 'dilations': self.dilations
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        print 'check output here'
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
+
+    def test_check_grad_no_filter(self):
+        self.check_grad(
+            ['Input'],
+            'Output',
+            max_relative_error=0.02,
+            no_grad_set=set(['Filter']))
+
+    def test_check_grad_no_input(self):
+        self.check_grad(
+            ['Filter'],
+            'Output',
+            max_relative_error=0.02,
+            no_grad_set=set(['Input']))
+
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv3d_transpose"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_gru_op.py b/python/paddle/v2/framework/tests/test_gru_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2474cff94c6c71cc62bc8e69a5d83e38d51c511
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_gru_op.py
@@ -0,0 +1,156 @@
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+from test_lstm_op import identity, sigmoid, tanh, relu
+
+
+class TestGRUOp(OpTest):
+    batch_size = 9
+    frame_size = 5
+    activate = {
+        'identity': identity,
+        'sigmoid': sigmoid,
+        'tanh': tanh,
+        'relu': relu
+    }
+
+    @staticmethod
+    def seq_to_batch(lod, is_reverse):
+        idx_in_seq_list = []
+        seq_starts = lod[0]
+        seq_lens = []
+        for i in range(len(seq_starts) - 1):
+            seq_lens.append(seq_starts[i + 1] - seq_starts[i])
+        sorted_seqs = sorted(
+            range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x])
+        num_batch = seq_lens[sorted_seqs[0]]
+        for batch_idx in range(num_batch):
+            idx_in_seq = []
+            for i in range(len(seq_lens)):
+                if seq_lens[sorted_seqs[i]] <= batch_idx:
+                    break
+                idx = (seq_starts[sorted_seqs[i] + 1] - 1 - batch_idx
+                       ) if is_reverse else (
+                           seq_starts[sorted_seqs[i]] + batch_idx)
+                idx_in_seq.append(idx)
+            idx_in_seq_list.append(idx_in_seq)
+        return idx_in_seq_list
+
+    def gru_step(self, x, h_p, w, b):
+        batch_size = x.shape[0]
+        frame_size = w.shape[0]
+        g = x + np.tile(b, (batch_size, 1))
+        w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
+            (frame_size, frame_size * 2))
+        u_r = self.activate[self.attrs['gate_activation']](np.dot(
+            h_p, w_u_r) + g[:, :frame_size * 2])
+        u = u_r[:, :frame_size]
+        r = u_r[:, frame_size:frame_size * 2]
+        r_h_p = r * h_p
+        w_c = w.flatten()[frame_size * frame_size * 2:].reshape(
+            (frame_size, frame_size))
+        c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
+                                                    g[:, frame_size * 2:])
+        g = np.hstack((u_r, c))
+        h = u * c + (1 - u) * h_p
+        return g, r_h_p, h
+
+    def gru(self):
+        input, lod = self.inputs['Input']
+        w = self.inputs['Weight']
+        b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros(
+            (1, self.frame_size * 3))
+        batch_gate = self.outputs['BatchGate']
+        batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev']
+        batch_hidden = self.outputs['BatchHidden']
+        hidden = self.outputs['Hidden']
+        idx_in_seq_list = self.idx_in_seq_list
+        h_p = self.inputs['H0'] if self.inputs.has_key('H0') else np.zeros(
+            (len(idx_in_seq_list[0]), self.frame_size))
+        num_batch = len(idx_in_seq_list)
+        end_idx = 0
+        for batch_idx in range(num_batch):
+            x = input[idx_in_seq_list[batch_idx]]
+            g, r_h_p, h = self.gru_step(x, h_p, w, b)
+            if batch_idx < (num_batch - 1):
+                h_p = h[:len(idx_in_seq_list[batch_idx + 1])]
+            start_idx = end_idx
+            end_idx = start_idx + len(idx_in_seq_list[batch_idx])
+            batch_gate[start_idx:end_idx] = g
+            batch_reset_hidden_prev[start_idx:end_idx] = r_h_p
+            batch_hidden[start_idx:end_idx] = h
+            hidden[idx_in_seq_list[batch_idx]] = h
+        return batch_gate, batch_reset_hidden_prev, hidden
+
+    def set_data(self):
+        lod = [[0, 2, 6, self.batch_size]]
+        self.idx_in_seq_list = self.seq_to_batch(lod, self.is_reverse)
+        batch_size = self.batch_size
+        frame_size = self.frame_size
+        input = np.random.rand(batch_size, frame_size * 3).astype('float64')
+        h0 = np.random.rand(len(self.idx_in_seq_list[0]),
+                            frame_size).astype('float64')
+        weight = np.random.rand(frame_size, frame_size * 3).astype('float64')
+        bias = np.random.rand(1, frame_size * 3).astype('float64')
+
+        self.inputs = {
+            'Input': (input, lod),
+            'H0': h0,
+            'Weight': weight,
+            'Bias': bias
+        }
+
+        self.outputs = {
+            'BatchGate': np.zeros(
+                (batch_size, frame_size * 3), dtype='float64'),
+            'BatchResetHiddenPrev': np.zeros(
+                (batch_size, frame_size), dtype='float64'),
+            'BatchHidden': np.zeros(
+                (batch_size, frame_size), dtype='float64'),
+            'Hidden': np.zeros(
+                (batch_size, frame_size), dtype='float64')
+        }
+
+    def set_confs(self):
+        self.is_reverse = False
+        self.attrs = {
+            'activation': 'tanh',
+            'gate_activation': 'sigmoid',
+            'is_reverse': self.is_reverse
+        }
+
+    def setUp(self):
+        self.op_type = "gru"
+        self.set_confs()
+        self.set_data()
+        self.gru()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
+
+
+class TestGRUOpNoInitial(TestGRUOp):
+    def set_data(self):
+        super(TestGRUOpNoInitial, self).set_data()
+        self.inputs.pop('H0')
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden'])
+
+
+class TestGRUOpReverse(TestGRUOp):
+    def set_confs(self):
+        self.is_reverse = True
+        self.attrs = {
+            'activation': 'identity',
+            'gate_activation': 'sigmoid',
+            'is_reverse': self.is_reverse
+        }
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_huber_loss_op.py b/python/paddle/v2/framework/tests/test_huber_loss_op.py
index 003e7d7ed7ccdfc48b0aa8db0a6765b5c93e7c14..a24fcbec6cc4801118ce4ef97eb4692cd2351c28 100644
--- a/python/paddle/v2/framework/tests/test_huber_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_huber_loss_op.py
@@ -21,7 +21,8 @@ class TestHuberLossOp(OpTest):
             'Y': np.random.uniform(0, 1., (samples_num, 1)).astype('float32'),
         }
         residual = self.inputs['Y'] - self.inputs['X']
-        loss = np.vectorize(huber_loss_forward)(residual, delta)
+        loss = np.vectorize(huber_loss_forward)(residual,
+                                                delta).astype('float32')
         self.attrs = {'delta': delta}
         self.outputs = {
             'Residual': residual,
@@ -43,6 +44,5 @@ class TestHuberLossOp(OpTest):
             ['X'], 'Out', max_relative_error=0.008, no_grad_set=set('residual'))
 
 
-# TODO(typhoonzero): should add this back till we fix it
-#if __name__ == '__main__':
-#    unittest.main()
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_seq_pool.py b/python/paddle/v2/framework/tests/test_seq_pool.py
index efc4920124afb539017a3b3f211c7320da68ffef..512d8b315f29cecf79ae274dca491c240f3447a1 100644
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
@@ -29,6 +29,9 @@ class TestSeqAvgPool(OpTest):
         self.check_output()
 
     def test_check_grad(self):
+        # Remove MaxIndex after check_grad is refined.
+        self.outputs['MaxIndex'] = \
+            np.zeros(self.outputs['Out'].shape).astype('int32')
         self.check_grad(["X"], "Out")
 
 
@@ -85,31 +88,53 @@ class TestSeqSqrtPool2D(TestSeqAvgPool2D):
             out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(len), (3, 17))
 
     def test_check_grad(self):
+        # Remove MaxIndex after check_grad is refined.
+        self.outputs['MaxIndex'] = \
+            np.zeros(self.outputs['Out'].shape).astype('int32')
         self.check_grad(["X"], "Out", max_relative_error=0.06)
 
 
 class TestSeqMaxPool(TestSeqAvgPool):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
+        lod = [[0, 4, 5, 8, 13]]
+        for i in range(4):
+            l = lod[0][i + 1] - lod[0][i]
+            x[lod[0][i] + np.random.randint(l), :] += 2.0
+
+        self.inputs = {'X': (x, lod)}
+
+        out = np.zeros((4, 23)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+
     def compute(self, x, lod, out):
         self.attrs = {'pooltype': "MAX"}
         for i in range(4):
             sub_x = x[lod[0][i]:lod[0][i + 1], :]
             out[i] = np.amax(sub_x, axis=0)
 
-    def test_check_grad(self):
-        # Remove MaxPool2D from gradient check to confirm the success of CI.
-        return
-
 
 class TestSeqMaxPool2D(TestSeqAvgPool2D):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
+        lod = [[0, 4, 5, 8, 13]]
+        self.inputs = {'X': (x, lod)}
+        for i in range(4):
+            l = lod[0][i + 1] - lod[0][i]
+            x[lod[0][i] + np.random.randint(l), :] += 1.0
+
+        out = np.zeros((4, 3, 11)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+
     def compute(self, x, lod, out):
         self.attrs = {'pooltype': "MAX"}
         for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
-            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 17))
-
-    def test_check_grad(self):
-        # Remove MaxPool2D from gradient check to confirm the success of CI.
-        return
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 11))
+            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
 
 
 class TestSeqLastPool(TestSeqAvgPool):
diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
index f93feb20696f126423bc9412eab3b4aa41b19426..c2f07f9096c69f3d4977f9444bdd5dcda8028973 100644
--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
@@ -12,30 +12,30 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
 
     def setUp(self):
         self.op_type = "softmax_with_cross_entropy"
-        batch_size = 3
+        batch_size = 2
         class_num = 37
 
         logits = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float32")
+                                   [batch_size, class_num]).astype("float64")
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
-        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int32")
+        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
 
         cross_entropy = np.asmatrix(
             [[-np.log(softmax[i][labels[i][0]])]
              for i in range(softmax.shape[0])],
-            dtype="float32")
+            dtype="float64")
 
         self.inputs = {"Logits": logits, "Label": labels}
         self.outputs = {
-            "Softmax": softmax.astype('float32'),
-            "Loss": cross_entropy.astype('float32')
+            "Softmax": softmax.astype("float64"),
+            "Loss": cross_entropy.astype("float64")
         }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
+        self.check_grad(["Logits"], "Loss")
 
 
 class TestSoftmaxWithCrossEntropyOp2(OpTest):
@@ -49,19 +49,19 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
         class_num = 37
 
         logits = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float32")
+                                   [batch_size, class_num]).astype("float64")
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
         labels = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float32")
+                                   [batch_size, class_num]).astype("float64")
         labels /= np.sum(labels, axis=1, keepdims=True)
 
         cross_entropy = (-labels * np.log(softmax)).sum(
-            axis=1, keepdims=True).astype("float32")
+            axis=1, keepdims=True).astype("float64")
 
         self.inputs = {"Logits": logits, "Label": labels}
         self.outputs = {
-            "Softmax": softmax.astype('float32'),
-            "Loss": cross_entropy.astype('float32')
+            "Softmax": softmax.astype("float64"),
+            "Loss": cross_entropy.astype("float64")
         }
         self.attrs = {"soft_label": True}
 
@@ -69,9 +69,8 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
+        self.check_grad(["Logits"], "Loss")
 
 
 if __name__ == "__main__":
-    exit(0)  # FIXME: xe has bug
     unittest.main()
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index 29f0945eb4c88eab8fa9ee83f455190dfd473aa4..94d706b1d6289a7bffbdfb161c35d44c78fdf46f 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -11,11 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-Optimizers(update equation) for SGD method.
-
-TODO(yuyang18): Complete comments.
-"""
 
 import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
 import paddle.trainer_config_helpers.optimizers as v1_optimizers
@@ -101,32 +96,37 @@ class Optimizer(object):
 
 class Momentum(Optimizer):
     """
-    SGD Optimizer.
-
-    SGD is an optimization method, trying to find a neural network that
-    minimize the "cost/error" of it by iteration. In paddle's implementation
-    SGD Optimizer is synchronized, which means all gradients will be wait to
-    calculate and reduced into one gradient, then do optimize operation.
+    Momentum Optimizer.
 
-    The neural network consider the learning problem of minimizing an objective
-    function, that has the form of a sum
+    When sparse=False, the momentum update formula is as follows:
 
     ..  math::
 
-        Q(w) = \\sum_{i}^{n} Q_i(w)
+        v_{t} &= k * v_{t-1} - \\gamma_t / (g_{t} + \\lambda w_{t-1}) \\\\
+        w_{t} &= w_{t-1} + v_{t} \\\\
 
-    The value of function Q sometimes is the cost of neural network (Mean
-    Square Error between prediction and label for example). The function Q is
-    parametrised by w, the weight/bias of neural network. And weights is what to
-    be learned. The i is the i-th observation in (trainning) data.
+    where, :math:`k` is momentum, :math:`\\lambda` is decay rate,
+    :math:`\\gamma_t` is learning rate at the t'th iteration.
+    :math:`w_{t}` is the weight as the t'th iteration.
+    And the :math:`v_{t}` is the history momentum variable.
 
-    So, the SGD method will optimize the weight by
+    When sparse=True, the update scheme:
 
     ..  math::
 
-        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
-
-    where :math:`\\eta` is learning rate. And :math:`n` is batch size.
+        \\alpha_t &= \\alpha_{t-1} / k \\\\
+        \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\
+        u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\
+        v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\
+        \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t
+    
+    where :math:`k` is momentum, :math:`\\lambda` is decay rate, 
+    :math:`\\gamma_t` is learning rate at the t'th iteration.
+
+    :param momentum: the momentum factor.
+    :type momentum: float
+    :param sparse: with sparse support or not, False by default.
+    :type sparse: bool
     """
 
     def __init__(self, momentum=None, sparse=False, **kwargs):
@@ -146,7 +146,7 @@ class Adam(Optimizer):
 
         m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
         v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
 
     :param beta1: the :math:`\\beta_1` in equation.
     :type beta1: float
diff --git a/python/setup.py.in b/python/setup.py.in
index 87b3823e52604b889cdee76bc696a1ae9b9de802..5348c2d8d7e9b5adc5fe93e2943bef149ba047cc 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,4 +1,4 @@
-from setuptools import setup, Distribution
+from setuptools import setup, Distribution, Extension
 class BinaryDistribution(Distribution):
     def has_ext_modules(foo):
         return True
@@ -41,6 +41,7 @@ setup(name='paddlepaddle',
       description='Parallel Distributed Deep Learning',
       install_requires=setup_requires,
       packages=packages,
+      ext_modules=[Extension('_foo', ['stub.cc'])],
       package_data={
         'paddle.v2.master': ['libpaddle_master.so'],
         'paddle.v2.framework': ['core.so'],
@@ -54,6 +55,5 @@ setup(name='paddlepaddle',
           'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
       },
       scripts=paddle_bins,
-      distclass=BinaryDistribution,
       data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
 )