Merge remote-tracking branch 'upstream/develop' into resnet

945ad8d9 · tensor-tang · 30b57eef · 0a4a66f5 · 945ad8d9 · 945ad8d9
113 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,7 +126,7 @@ include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
-include(external/pybind11)    # download pybind11
+include(external/pybind11)  # download pybind11
 include(external/nccl)

 include(cudnn)              # set cudnn libraries, must before configure

--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
@@ -79,9 +79,8 @@ if(NOT DEFINED IOS_ARCH)
    # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
    set(IOS_ARCH "arm64")
  elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
-    set(IOS_ARCH "i386;x86_64")
-  elseif(IOS_PLATFORM STREQUAL "WATCHOS")
-    set(IOS_ARCH armv7k)
+    # FIXME(liuyiqun): support "i386;x86_64" future
+    set(IOS_ARCH "x86_64")
  endif()
 endif()
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string  "Build architecture for iOS")

--- a/cmake/external/nccl.cmake
+++ b/cmake/external/nccl.cmake
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT WITH_GPU)
+  return()
+endif()
+
 include(ExternalProject)

 set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
-INCLUDE(ExternalProject)
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

-SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+if(NOT WITH_PYTHON)
+    return()
+endif()
+
+include(ExternalProject)

-INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
+set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+
+include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)

 ExternalProject_Add(
        extern_pybind
@@ -17,14 +35,12 @@ ExternalProject_Add(
        TEST_COMMAND      ""
 )

-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+    file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
    add_library(pybind STATIC ${dummyfile})
 else()
    add_library(pybind INTERFACE)
 endif()

 add_dependencies(pybind extern_pybind)
-
-LIST(APPEND external_project_dependencies pybind)
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
 # This file is use to check all support level of AVX on your machine
 # so that PaddlePaddle can unleash the vectorization power of muticore.

-INCLUDE(CheckCXXSourceRuns)
-INCLUDE(CheckCXXSourceCompiles)
+include(CheckCXXSourceRuns)
+include(CheckCXXSourceCompiles)

-IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
    set(MMX_FLAG "-mmmx")
    set(SSE2_FLAG "-msse2")
    set(SSE3_FLAG "-msse3")
-    SET(AVX_FLAG "-mavx")
-    SET(AVX2_FLAG "-mavx2")
-ELSEIF(MSVC)
+    set(AVX_FLAG "-mavx")
+    set(AVX2_FLAG "-mavx2")
+elseif(MSVC)
    set(MMX_FLAG "/arch:MMX")
    set(SSE2_FLAG "/arch:SSE2")
    set(SSE3_FLAG "/arch:SSE3")
    SET(AVX_FLAG "/arch:AVX")
    SET(AVX2_FLAG "/arch:AVX2")
-ENDIF()
+endif()

 set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})

 # Check  MMX
 set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
+set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <mmintrin.h>
 int main()
@@ -32,6 +33,7 @@ int main()

 # Check SSE2
 set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
+set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <emmintrin.h>
 int main()
@@ -42,6 +44,7 @@ int main()

 # Check SSE3
 set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
+set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <pmmintrin.h>
 int main()
@@ -55,6 +58,7 @@ int main()

 # Check AVX
 set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
@@ -67,6 +71,7 @@ int main()

 # Check AVX 2
 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()

--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -145,7 +145,7 @@ PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以

 Jupyter Notebook是一个开源的web程序，大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。

-PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Nodebook。
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
 如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。

 我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：

--- a/doc/howto/usage/cmd_parameter/arguments_cn.md
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
@@ -63,7 +63,7 @@
 </tr>

 <tr>
-<td class="left" rowspan="15">训练</td><td class="left">dot_period</td>
+<td class="left" rowspan="14">训练</td><td class="left">dot_period</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>


--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -8,3 +8,4 @@ PaddlePaddle 文档
  howto/index_cn.rst
  api/index_cn.rst
  faq/index_cn.rst
+  mobile/index_cn.rst
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -7,3 +7,4 @@ PaddlePaddle Documentation
  getstarted/index_en.rst
  howto/index_en.rst
  api/index_en.rst
+  mobile/index_en.rst
--- a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
+++ b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
@@ -20,10 +20,32 @@ $ docker build -t username/paddle-android:dev . -f Dockerfile.android
 构建好开发镜像后，即可使用开发镜像来编译Android版PaddlePaddle C-API库。
 Android的Docker开发镜像向用户提供两个可配置的参数：

-| Argument        | Optional Values         | Default |
-|-----------------|-------------------------|---------|
-|`ANDROID_ABI`    |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
-|`ANDROID_API`    |`>= 21` | `21` |
+<table class="docutils">
+<colgroup>
+  <col width="25%" />
+  <col width="50%" />
+  <col width="25%" />
+</colgroup>
+<thead valign="bottom">
+  <tr class="row-odd">
+  <th class="head">Argument</th>
+  <th class="head">Optional Values</th>
+  <th class="head">Default</th>
+</tr>
+</thead>
+<tbody valign="top">
+  <tr class="row-even">
+  <td>ANDROID_ABI</td>
+  <td>armeabi-v7a, arm64-v8a</td>
+  <td>armeabi-v7a</td>
+</tr>
+<tr class="row-odd">
+  <td>ANDROID_API</td>
+  <td>>= 21</td>
+  <td>21</td>
+</tr>
+</tbody>
+</table>

 - 编译`armeabi-v7a`，`Android API 21`的PaddlePaddle库
  ```bash

--- a/doc/howto/cross_compiling/cross_compiling_for_android.md
+++ b/doc/howto/cross_compiling/cross_compiling_for_android.md
@@ -26,10 +26,32 @@ $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_A

 The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:

-| Argument        | Optional Values         | Default |
-|-----------------|-------------------------|---------|
-|`ANDROID_ABI`    |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
-|`ANDROID_API`    |`>= 21` | `21` |
+<table class="docutils">
+<colgroup>
+  <col width="25%" />
+  <col width="50%" />
+  <col width="25%" />
+</colgroup>
+<thead valign="bottom">
+  <tr class="row-odd">
+  <th class="head">Argument</th>
+  <th class="head">Optional Values</th>
+  <th class="head">Default</th>
+</tr>
+</thead>
+<tbody valign="top">
+  <tr class="row-even">
+  <td>ANDROID_ABI</td>
+  <td>armeabi-v7a, arm64-v8a</td>
+  <td>armeabi-v7a</td>
+</tr>
+<tr class="row-odd">
+  <td>ANDROID_API</td>
+  <td>>= 21</td>
+  <td>21</td>
+</tr>
+</tbody>
+</table>

 The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.


--- a/doc/howto/cross_compiling/cross_compiling_for_ios_cn.md
+++ b/doc/howto/cross_compiling/cross_compiling_for_ios_cn.md
@@ -27,10 +27,28 @@ iOS平台可选配置参数：
  - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
 - `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示：

-   | IOS_PLATFORM | IOS_ARCH             |
-   |--------------|----------------------|
-   |   OS         | armv7, armv7s, arm64 (默认) |
-   | SIMULATOR    | i386, x86_64 (默认)         |   
+    <table class="docutils">
+    <colgroup>
+      <col width="35%" />
+      <col width="65%" />
+    </colgroup>
+    <thead valign="bottom">
+      <tr class="row-odd">
+      <th class="head">IOS_PLATFORM</th>
+      <th class="head">IOS_ARCH</th>
+    </tr>
+    </thead>
+    <tbody valign="top">
+      <tr class="row-even">
+      <td>OS</td>
+      <td>armv7, armv7s, arm64 (默认)</td>
+    </tr>
+    <tr class="row-odd">
+      <td>SIMULATOR</td>
+      <td>i386, x86_64 (默认)</td>
+    </tr>
+    </tbody>
+    </table>

 - `IOS_DEPLOYMENT_TARGET`，最小的iOS部署版本，默认值为`7.0`。
 - `IOS_ENABLE_BITCODE`，是否使能[Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3)，可设置`ON/OFF`，默认值为`ON`。

--- a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md
+++ b/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md
--- a/doc/howto/cross_compiling/cross_compiling_for_raspberry_en.md
+++ b/doc/howto/cross_compiling/cross_compiling_for_raspberry_en.md
--- a/doc/mobile/index_cn.rst
+++ b/doc/mobile/index_cn.rst
+MOBILE
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_cn.md
+  cross_compiling_for_ios_cn.md
+  cross_compiling_for_raspberry_cn.md
--- a/doc/mobile/index_en.rst
+++ b/doc/mobile/index_en.rst
+MOBILE
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_en.md
+  cross_compiling_for_raspberry_en.md
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -29,32 +29,32 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
 add_dependencies(paddle_capi paddle_proto)

 # TODO: paddle_capi_whole will be removed.
+set(PADDLE_CAPI_LAYERS_LIBS
+    paddle_function
+    paddle_gserver)
 if(MOBILE_INFERENCE)
-    set(PADDLE_CAPI_INFER_LIBS
-        paddle_utils
-        paddle_parameter
-        paddle_math
-        paddle_cuda
-        paddle_function
-        paddle_gserver
-        paddle_proto)
+  set(PADDLE_CAPI_ENGINE_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_proto)
 else()
-    set(PADDLE_CAPI_INFER_LIBS
-        paddle_utils
-        paddle_parameter
-        paddle_math
-        paddle_cuda
-        paddle_function
-        paddle_gserver
-        paddle_proto
-        paddle_pserver
-        paddle_network)
+  set(PADDLE_CAPI_ENGINE_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_proto
+      paddle_pserver
+      paddle_network)
 endif()
+set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
 cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})

 # Link the static library for inference
-cc_library(paddle_capi_engine DEPS paddle_capi paddle_utils paddle_parameter paddle_math paddle_cuda paddle_proto)
-cc_library(paddle_capi_layers DEPS paddle_function paddle_gserver)
+cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
+cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})

 # Link the shared library for inference
 if(NOT IOS)

--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
@@ -140,19 +140,9 @@ Similarly, the lengths in the top level LoD
 are transformed into offsets of elements/words as follows:

 ```
-0 9     10  15
-  =     =   =
-  3+2+4 1+9 2+3+10
-```
-
-so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.
-
-The complete offset representation is as follows:
-
-```
-0           9 10       15
-0   3  5    9 10  12   15
- ||| || |||| |  ||  |||
+0 3 4   6
+  = =   =
+  3 3+1 4+2
 ```

 ## Slicing of LoD Tensors

--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -67,8 +67,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
                      out);
    in_var->SetLoDLevel(out_var->GetLodLevel());
  }
+  bool IsRuntime() const override;
+
+ protected:
+  VarDesc::VarType GetVarType(const std::string &name) const override;

- private:
  DDim GetDim(const std::string &name) const override;

  void SetDim(const std::string &name, const DDim &dim) override;
@@ -349,6 +352,9 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
    info.infer_var_type_(*this, block);
  } else {
    // all output type is LoDTensor by default
+    VLOG(10) << this->Type()
+             << " has not registered InferVarType. Set output variables to "
+                "LOD_TENSOR";
    for (auto &out_pair : this->outputs_) {
      for (auto &out_var_name : out_pair.second) {
        block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
@@ -448,6 +454,12 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
                                          const DDim &dim) {
  block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
 }
+bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
+
+VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
+    const std::string &name) const {
+  return block_.FindVarRecursive(name)->GetType();
+}

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/framework/operator.h"
 #include <algorithm>
 #include <atomic>
+#include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/shape_inference.h"
+#include "paddle/framework/var_type.h"

 namespace paddle {
 namespace framework {
@@ -365,7 +367,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
    out_tensor->set_lod(in_tensor.lod());
  }

- private:
+  bool IsRuntime() const override { return true; }
+
+ protected:
  DDim GetDim(const std::string& name) const override {
    Variable* var = scope_.FindVar(name);
    if (var->IsType<LoDTensor>()) {
@@ -388,6 +392,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
    }
  }

+  VarDesc::VarType GetVarType(const std::string& name) const override {
+    auto* var = scope_.FindVar(name);
+    return ToVarType(var->Type());
+  }
+
+ private:
  const OperatorBase& op_;
  const Scope& scope_;
 };

--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -46,6 +46,23 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
    SetDim(names[i], dims[i]);
  }
 }
+std::vector<VarDesc::VarType> InferShapeContext::GetInputsVarType(
+    const std::string &name) const {
+  return GetVarTypes(Inputs(name));
+}
+std::vector<VarDesc::VarType> InferShapeContext::GetOutputsVarType(
+    const std::string &name) const {
+  return GetVarTypes(Outputs(name));
+}
+std::vector<VarDesc::VarType> InferShapeContext::GetVarTypes(
+    const std::vector<std::string> &names) const {
+  std::vector<VarDesc::VarType> retv;
+  retv.resize(names.size());
+  std::transform(names.begin(), names.end(), retv.begin(),
+                 std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
+                           std::placeholders::_1));
+  return retv;
+}

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include "paddle/framework/attribute.h"
 #include "paddle/framework/ddim.h"
+#include "paddle/framework/framework.pb.h"

 namespace paddle {
 namespace framework {
@@ -26,6 +27,10 @@ class InferShapeContext {
  virtual bool HasInput(const std::string &name) const = 0;
  virtual bool HasOutput(const std::string &name) const = 0;

+  std::vector<VarDesc::VarType> GetInputsVarType(const std::string &name) const;
+  std::vector<VarDesc::VarType> GetOutputsVarType(
+      const std::string &name) const;
+
  virtual bool HasInputs(const std::string &name) const = 0;
  virtual bool HasOutputs(const std::string &name) const = 0;

@@ -46,6 +51,8 @@ class InferShapeContext {
  virtual void ShareLoD(const std::string &in, const std::string &out,
                        size_t i = 0, size_t j = 0) const = 0;

+  virtual bool IsRuntime() const = 0;
+
 protected:
  virtual framework::DDim GetDim(const std::string &name) const = 0;
  virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
@@ -55,6 +62,11 @@ class InferShapeContext {

  void SetDims(const std::vector<std::string> &names,
               const std::vector<framework::DDim> &dims);
+
+  std::vector<VarDesc::VarType> GetVarTypes(
+      const std::vector<std::string> &names) const;
+
+  virtual VarDesc::VarType GetVarType(const std::string &name) const = 0;
 };

 }  // namespace framework

--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/lod_tensor_array.h"
+
+namespace paddle {
+namespace framework {
+inline VarDesc::VarType ToVarType(std::type_index type) {
+  if (type.hash_code() == typeid(LoDTensor).hash_code()) {
+    return VarDesc_VarType_LOD_TENSOR;
+  } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
+    return VarDesc_VarType_LOD_RANK_TABLE;
+  } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
+    return VarDesc_VarType_LOD_TENSOR_ARRAY;
+  } else {
+    PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
@@ -48,6 +48,11 @@ class Variable {

  void Clear() { holder_.reset(); }

+  std::type_index Type() const {
+    PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory");
+    return holder_->Type();
+  }
+
 private:
  struct Placeholder {
    virtual ~Placeholder() {}

--- a/paddle/gserver/layers/ConvBaseProjection.cpp
+++ b/paddle/gserver/layers/ConvBaseProjection.cpp
@@ -17,7 +17,7 @@ limitations under the License. */

 namespace paddle {

-ThreadLocalD<std::vector<MemoryHandle *>> ConvBaseProjection::convMem_;
+ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;

 ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
                                       ParameterPtr parameter,
@@ -175,18 +175,18 @@ void ConvBaseProjection::reshape(int batchSize) {
 }

 void *ConvBaseProjection::getSpaceBytes(size_t size) {
-  std::vector<MemoryHandle *> &convMem = *convMem_;
+  std::vector<MemoryHandlePtr> &convMem = *convMem_;
  if (convMem.empty()) {
    int numDevices = hl_get_device_count();
    convMem.resize(numDevices);
  }

  int devId = hl_get_device();
-  MemoryHandle **localMem = &(convMem[devId]);
-  if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
-    *localMem = new GpuMemoryHandle(size);
+  MemoryHandlePtr localMem = convMem[devId];
+  if (NULL == localMem || size > localMem->getAllocSize()) {
+    localMem = std::make_shared<GpuMemoryHandle>(size);
  }
-  return (*localMem)->getBuf();
+  return localMem->getBuf();
 }

 ConvBaseProjection::~ConvBaseProjection() {

--- a/paddle/gserver/layers/ConvBaseProjection.h
+++ b/paddle/gserver/layers/ConvBaseProjection.h
@@ -105,7 +105,7 @@ protected:
  bool bias_;

  std::unique_ptr<Weight> weight_;
-  static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
+  static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
 };

 }  // namespace paddle
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
@@ -98,8 +98,19 @@ void SubSequenceLayer::forward(PassType passType) {
  CHECK_EQ(numSequences2, numSequences3);

  MatrixPtr inputValue = input.value;
-  IVectorPtr offsetValue = offsetSeq.ids;
-  IVectorPtr sizeValue = sizeSeq.ids;
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }

  CHECK_EQ(offsetValue->getSize(), numSequences1);
  CHECK_EQ(sizeValue->getSize(), numSequences1);
@@ -176,8 +187,21 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) {
  size_t numSequences1 = startPositions1->getSize() - 1;
  const int* starts1 = startPositions1->getData();

-  IVectorPtr offsetValue = getInput(1).ids;
-  IVectorPtr sizeValue = getInput(2).ids;
+  const Argument& offsetSeq = getInput(1);
+  const Argument& sizeSeq = getInput(2);
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }

  int* offsets = offsetValue->getData();
  int* sizes = sizeValue->getData();

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -69,6 +69,20 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
    endif()

+    # conv_op contains several operators
+    if ("${TARGET}" STREQUAL "conv_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
+    endif()
+
+    # conv_transpose_op contains several operators
+    if ("${TARGET}" STREQUAL "conv_transpose_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
+    endif()
+    
    # pool_cudnn_op contains several operators
    if ("${TARGET}" STREQUAL "pool_cudnn_op")
        set(pybind_flag 1)
@@ -96,7 +110,7 @@ function(op_library TARGET)
        # It's enough to just adding one operator to pybind
        file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
    endif()
-    
+
    # reduce_op contains several operators
    if ("${TARGET}" STREQUAL "reduce_op")
        set(pybind_flag 1)
@@ -104,6 +118,11 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
    endif()

+    if ("${TARGET}" STREQUAL "tensor_array_read_write_op")
+        set(pybind_flag 1)
+        file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n")
+    endif()
+
    # pybind USE_NO_KERNEL_OP
    # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
    file(READ ${TARGET}.cc TARGET_CONTENT)
@@ -139,26 +158,38 @@ set(DEPS_OPS
    sum_op
    pool_op
    pool_with_index_op
+    conv_op
+    lstm_op
+    conv_transpose_op
    nccl_op
    sequence_conv_op
+    sequence_pool_op
    lod_rank_table_op
-    lstm_op)
+    lstm_op
+    tensor_array_read_write_op
+    gru_op)

 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
+op_library(conv_op DEPS vol2col)
 op_library(sum_op DEPS net_op selected_rows_functor)
 op_library(pool_op DEPS pooling)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
+op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
 if(WITH_GPU)
 op_library(nccl_op DEPS nccl_common)
 endif()
 op_library(sequence_conv_op DEPS context_project)
+op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(conv_transpose_op DEPS vol2col)
+op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
        DEPS net_op tensor_array)
 op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
+
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
    op_library(${src})

--- a/paddle/operators/conv2d_transpose_cudnn_op.cc
+++ b/paddle/operators/conv2d_transpose_cudnn_op.cc
@@ -12,7 +12,7 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/operators/conv2d_transpose_op.h"
+#include "paddle/operators/conv_transpose_op.h"

 namespace paddle {
 namespace operators {
@@ -38,13 +38,13 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(conv2d_transpose_cudnn, ops::Conv2DTransposeOp,
+REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp,
            ops::CudnnConv2DTransposeOpMaker, conv2d_transpose_cudnn_grad,
-            ops::Conv2DTransposeOpGrad);
+            ops::ConvTransposeOpGrad);

 REGISTER_OP_CPU_KERNEL(
    conv2d_transpose_cudnn,
-    ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
    conv2d_transpose_cudnn_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/conv2d_transpose_cudnn_op.cu
+++ b/paddle/operators/conv2d_transpose_cudnn_op.cu
@@ -15,7 +15,7 @@
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memory.h"
-#include "paddle/operators/conv2d_transpose_op.h"
+#include "paddle/operators/conv_transpose_op.h"
 #include "paddle/platform/assert.h"
 #include "paddle/platform/cudnn_helper.h"


--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -12,7 +12,7 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"

 namespace paddle {
 namespace operators {
@@ -38,10 +38,11 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(conv_cudnn, ops::Conv2DOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
-            ops::Conv2DOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    conv_cudnn, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
+            ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(conv_cudnn,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
    conv_cudnn_grad,
-    ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/conv_cudnn_op.cu
+++ b/paddle/operators/conv_cudnn_op.cu
@@ -15,7 +15,7 @@
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memory.h"
-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"
 #include "paddle/platform/assert.h"
 #include "paddle/platform/cudnn_helper.h"


--- a/paddle/operators/conv2d_op.cc
+++ b/paddle/operators/conv2d_op.cc
@@ -12,18 +12,18 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"

 namespace paddle {
 namespace operators {

-void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
+void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of Conv2DOp should not be null.");
+                 "Input(Input) of ConvOp should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                 "Input(Filter) of Conv2DOp should not be null.");
+                 "Input(Filter) of ConvOp should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                 "Output(Output) of Conv2DOp should not be null.");
+                 "Output(Output) of ConvOp should not be null.");

  auto in_dims = ctx->GetInputDim("Input");
  auto filter_dims = ctx->GetInputDim("Filter");
@@ -33,8 +33,17 @@ void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
  int input_channels = in_dims[1];
  int output_channels = filter_dims[0];

-  PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D.");
-  PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D.");
+  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+                 "Conv intput should be 4-D or 5-D tensor.");
+  PADDLE_ENFORCE_EQ(
+      in_dims.size(), filter_dims.size(),
+      "Conv input dimension and filter dimension should be the same.");
+  PADDLE_ENFORCE(
+      in_dims.size() - strides.size() == 2U,
+      "Conv input dimension and strides dimension should be consistent.");
+  PADDLE_ENFORCE_EQ(
+      paddings.size(), strides.size(),
+      "Conv paddings dimension and Conv strides dimension should be the same.");
  PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
                    "The number of input channels should be equal to filter "
                    "channels * groups.");
@@ -42,12 +51,12 @@ void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
      output_channels % groups, 0,
      "The number of output channels should be divided by groups.");

-  auto output_height =
-      OutputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
-  auto output_width =
-      OutputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
-  ctx->SetOutputDim("Output",
-                    {in_dims[0], filter_dims[0], output_height, output_width});
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                      paddings[i], strides[i]));
+  }
+  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }

 Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
@@ -55,19 +64,19 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput(
      "Input",
-      "The input tensor of convolution operator. "
+      "(Tensor) The input tensor of convolution operator. "
      "The format of input tensor is NCHW, where N is batch size, C is the "
-      "number of channels, H is the height of the image, "
-      "and W is the width of the image.");
+      "number of channels, H is the height of the feature, "
+      "and W is the width of the feature.");
  AddInput("Filter",
-           "The filter tensor of convolution operator. "
+           "(Tensor) The filter tensor of convolution operator. "
           "The format of the filter tensor is MCHW, where M is the number of "
           "output image channels, C is the number of input image channels, "
           "H is the height of the filter, and W is the width of the filter. "
           "If the groups attribute is greater than 1, C equals the number of "
           "input image channels divided by the groups.");
  AddOutput("Output",
-            "The output tensor of convolution operator. "
+            "(Tensor) The output tensor of convolution operator. "
            "The format of output tensor is also NCHW.");
  AddAttr<std::vector<int>>("strides", "strides of convolution operator.")
      .SetDefault({1, 1});
@@ -75,7 +84,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
      .SetDefault({0, 0});
  AddAttr<int>(
      "groups",
-      "Group size of convolution operator. "
+      "(int default:1), the group size of convolution operator. "
      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
      "when group=2, the first half of the filters is only connected to the "
      "first half of the input channels, while the second half of the filters "
@@ -84,14 +93,91 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
  AddComment(R"DOC(
 Convolution Operator.

-The convolution operation calculates the output based on the input, filter, 
-strides, paddings, and groups parameters. The size of each dimension of the
-parameters is checked in the infer-shape method.
+The convolution operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
+size, C is the number of channels, H is the height of the feature, and W is
+the width of the feature. Parameters(ksize, strides, paddings) are two elements.
+These two elements represent height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       Input shape: (N, C_in, H_in, W_in)
+       Filter shape: (C_out, C_in, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, H_out, W_out)
+  where
+       H_out = (H_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
+       W_out = (W_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
+)DOC");
+}

+Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
+                             framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution operator. "
+      "The format of input tensor is NCDHW. Where N is batch size, C is the "
+      "number of channels, D is the depth of the feature, H is the height of "
+      "the feature, "
+      "and W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution operator. "
+           "The format of the filter tensor is MCDHW, where M is the number of "
+           "output image channels, C is the number of input image channels, "
+           "D is the depth of the filter, H is the height of the filter, and W "
+           "is the width of the filter."
+           "If the groups attribute is greater than 1, C equals the number of "
+           "input image channels divided by the groups.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution operator."
+            "The format of output tensor is also NCDHW.");
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector, default:{0, 0, 0}), the strides of convolution operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector, default:{0, 0, 0}), the paddings of convolution operator.")
+      .SetDefault({0, 0, 0});
+  AddAttr<int>(
+      "groups",
+      "(int default:1), the group size of convolution operator. "
+      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
+      "when group=2, the first half of the filters is only connected to the "
+      "first half of the input channels, while the second half of the filters "
+      "is only connected to the second half of the input channels.")
+      .SetDefault(1);
+
+  AddComment(R"DOC(
+Convolution3D Operator.
+
+The convolution operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
+size, C is the number of channels,D is the depth of the feature, H is the height of
+the feature, and W is the width of the feature. Parameters(ksize, strides, paddings)
+are three elements. These three elements represent depth, height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       Input shape: (N, C_in, D_in, H_in, W_in)
+       Filter shape: (C_out, C_in, D_f, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, D_out, H_out, W_out)
+  where
+       D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
+       H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
+       W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1;
 )DOC");
 }

-void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
+void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
  auto in_dims = ctx->GetInputDim("Input");
  auto filter_dims = ctx->GetInputDim("Filter");
  if (ctx->HasOutput(framework::GradVarName("Input"))) {
@@ -106,10 +192,18 @@ void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(conv2d, ops::Conv2DOp, ops::Conv2DOpMaker, conv2d_grad,
-            ops::Conv2DOpGrad);
+REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
+            ops::ConvOpGrad);
+namespace ops = paddle::operators;
+REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
+            ops::ConvOpGrad);

+REGISTER_OP_CPU_KERNEL(conv2d,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
+
+REGISTER_OP_CPU_KERNEL(conv3d,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/conv2d_op.cu
+++ b/paddle/operators/conv2d_op.cu
@@ -12,11 +12,16 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"

 namespace ops = paddle::operators;

+REGISTER_OP_GPU_KERNEL(conv2d,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    conv2d, ops::GemmConv2DKernel<paddle::platform::GPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
+
+REGISTER_OP_GPU_KERNEL(conv3d,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::GPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/conv2d_op.h
+++ b/paddle/operators/conv2d_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/im2col.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/vol2col.h"

 namespace paddle {
 namespace operators {
@@ -40,14 +41,20 @@ class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
                framework::OpAttrChecker* op_checker);
 };

-class Conv2DOp : public framework::OperatorWithKernel {
+class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv3DOpMaker(framework::OpProto* proto,
+                framework::OpAttrChecker* op_checker);
+};
+
+class ConvOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override;
 };

-class Conv2DOpGrad : public framework::OperatorWithKernel {
+class ConvOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

@@ -55,7 +62,7 @@ class Conv2DOpGrad : public framework::OperatorWithKernel {
 };

 template <typename Place, typename T>
-class GemmConv2DKernel : public framework::OpKernel<T> {
+class GemmConvKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* input = context.Input<Tensor>("Input");
@@ -70,51 +77,78 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    int groups = context.Attr<int>("groups");

-    int batch_size = input->dims()[0];
-    int input_channels = input->dims()[1];
-    int filter_height = filter.dims()[filter.dims().size() - 2];
-    int filter_width = filter.dims()[filter.dims().size() - 1];
-    int output_channels = output->dims()[1];
-    int output_height = output->dims()[2];
-    int output_width = output->dims()[3];
-
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    filter_shape_vec.erase(filter_shape_vec.begin(),
+                           filter_shape_vec.begin() + 2);
+
+    // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+    output_shape_vec.erase(output_shape_vec.begin(),
+                           output_shape_vec.begin() + 2);
+
    // use col_shape in the im2col calculation
-    framework::DDim col_shape = {input_channels / groups, filter_height,
-                                 filter_width, output_height, output_width};
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+    // o_h, o_w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(input->dims()[1] / groups);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
+                         output_shape_vec.end());
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
    // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
-        input_channels / groups * filter_height * filter_width,
-        output_height * output_width};
+    // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d *
+    // o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
+
    Tensor col;
    col.mutable_data<T>(col_shape, context.GetPlace());
    // col_matrix shares the same piece of data with col,
    // but will be reshaped into a two-dimensional matrix shape
    // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
+    Tensor col_matrix;
+    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);

-    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
-                                   input->dims()[3]};
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));
+
    framework::DDim filter_matrix_shape = {filter.dims()[0],
                                           filter.numel() / filter.dims()[0]};
    filter.Resize(filter_matrix_shape);

-    framework::DDim output_matrix_shape = {output_channels,
-                                           output_height * output_width};
-    // convolution operator: im2col + gemm
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
+    framework::DDim output_matrix_shape = {
+        output->dims()[1],
+        output->numel() / (output->dims()[0] * output->dims()[1])};
+
+    // convolution operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output->dims()[1]) / groups;
+
    for (int i = 0; i < batch_size; i++) {
      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
      for (int g = 0; g < groups; g++) {
-        // im2col
        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-        im2col(context.device_context(), in_slice, col, strides[0], strides[1],
-               paddings[0], paddings[0], paddings[1], paddings[1]);
+
+        if (filter_shape_vec.size() == 2) {
+          // im2col
+          math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+          im2col(context.device_context(), in_slice, col, strides[0],
+                 strides[1], paddings[0], paddings[0], paddings[1],
+                 paddings[1]);
+        } else if (filter_shape_vec.size() == 3) {
+          // vol2col
+          math::Vol2ColFunctor<Place, T> vol2col;
+          vol2col(context.device_context(), in_slice, col, strides[0],
+                  strides[1], strides[2], paddings[0], paddings[1],
+                  paddings[2]);
+        }

        // gemm
        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
@@ -127,7 +161,7 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
 };

 template <typename Place, typename T>
-class GemmConvGrad2DKernel : public framework::OpKernel<T> {
+class GemmConvGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* input = context.Input<Tensor>("Input");
@@ -137,64 +171,79 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
        context.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad =
        context.Output<Tensor>(framework::GradVarName("Filter"));
-
    // The filter and filter_grad will be reshaped in the calculations,
    // so here use an assignment operation,
    // that avoids modifying the variable in the Scope.
    Tensor filter = *context.Input<Tensor>("Filter");

+    if (!input_grad && !filter_grad) return;
+
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    int groups = context.Attr<int>("groups");

-    int batch_size = input->dims()[0];
-    int input_channels = input->dims()[1];
-    int filter_height = filter.dims()[filter.dims().size() - 2];
-    int filter_width = filter.dims()[filter.dims().size() - 1];
-    int output_channels = output_grad->dims()[1];
-    int output_height = output_grad->dims()[2];
-    int output_width = output_grad->dims()[3];
-
-    paddle::operators::math::Col2ImFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        col2im;
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
-    // use col_shape in the im2col and col2im calculation
-    framework::DDim col_shape = {input_channels / groups, filter_height,
-                                 filter_width, output_height, output_width};
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    filter_shape_vec.erase(filter_shape_vec.begin(),
+                           filter_shape_vec.begin() + 2);
+
+    // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(
+        framework::vectorize(output_grad->dims()));
+    output_shape_vec.erase(output_shape_vec.begin(),
+                           output_shape_vec.begin() + 2);
+
+    // use col_shape in the im2col calculation
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+    // o_h, o_w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(input->dims()[1] / groups);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
+                         output_shape_vec.end());
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
    // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
-        input_channels / groups * filter_height * filter_width,
-        output_height * output_width};
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
-    col_matrix.Resize(col_matrix_shape);
+    // size: (i_c/g * k_h * k_w, o_h * o_w)
+    // or
+    // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);

-    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
-                                   input->dims()[3]};
-    framework::DDim output_matrix_shape = {
-        output_grad->dims()[1],
-        output_grad->dims()[2] * output_grad->dims()[3]};
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));

    framework::DDim filter_matrix_shape = {filter.dims()[0],
                                           filter.numel() / filter.dims()[0]};
    filter.Resize(filter_matrix_shape);

-    // convolution backward input operator:  gemm + col2im
-    // convolution backward weight operator: im2col + gemm
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
+    framework::DDim output_matrix_shape = {
+        output_grad->dims()[1],
+        output_grad->numel() /
+            (output_grad->dims()[0] * output_grad->dims()[1])};
+
+    // convolution backward input operator:  gemm + col2im(or col2vol)
+    // convolution backward weight operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output_grad->dims()[1]) / groups;
+
+    Tensor col;
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    col.mutable_data<T>(col_shape, context.GetPlace());
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+
+    math::SetConstant<Place, T> set_zero;

    if (input_grad) {
      input_grad->mutable_data<T>(context.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      set_zero(context.device_context(), input_grad, static_cast<T>(0));

      for (int i = 0; i < batch_size; i++) {
        Tensor out_grad_batch =
@@ -208,13 +257,22 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
          math::matmul<Place, T>(context.device_context(), filter_slice, true,
                                 out_grad_slice, false, T(1.0), &col_matrix,
                                 T(0.0));
-
          // col2im
          Tensor in_grad_slice =
              in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
-          col2im(context.device_context(), in_grad_slice, col, strides[0],
-                 strides[1], paddings[0], paddings[0], paddings[1],
-                 paddings[1]);
+
+          if (filter_shape_vec.size() == 2) {
+            math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
+            col2im(context.device_context(), in_grad_slice, col, strides[0],
+                   strides[1], paddings[0], paddings[0], paddings[1],
+                   paddings[1]);
+
+          } else if (filter_shape_vec.size() == 3) {
+            math::Col2VolFunctor<Place, T> col2vol;
+            col2vol(context.device_context(), in_grad_slice, col, strides[0],
+                    strides[1], strides[2], paddings[0], paddings[1],
+                    paddings[2]);
+          }
        }
      }
    }
@@ -223,8 +281,7 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
      filter_grad->mutable_data<T>(context.GetPlace());
      Tensor filter_grad_ = *filter_grad;
      filter_grad_.Resize(filter_matrix_shape);
-      auto t = framework::EigenVector<T>::Flatten(filter_grad_);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      set_zero(context.device_context(), filter_grad, static_cast<T>(0));

      for (int i = 0; i < batch_size; i++) {
        Tensor out_grad_batch =
@@ -235,9 +292,18 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
          Tensor out_grad_slice =
              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
          Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-          im2col(context.device_context(), in_slice, col, strides[0],
-                 strides[1], paddings[0], paddings[0], paddings[1],
-                 paddings[1]);
+
+          if (filter_shape_vec.size() == 2) {
+            math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+            im2col(context.device_context(), in_slice, col, strides[0],
+                   strides[1], paddings[0], paddings[0], paddings[1],
+                   paddings[1]);
+          } else if (filter_shape_vec.size() == 3) {
+            math::Vol2ColFunctor<Place, T> vol2col;
+            vol2col(context.device_context(), in_slice, col, strides[0],
+                    strides[1], strides[2], paddings[0], paddings[1],
+                    paddings[2]);
+          }

          // gemm
          Tensor filter_grad_slice =
@@ -250,6 +316,5 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
    }
  }
 };
-
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/conv2d_transpose_op.cc
+++ b/paddle/operators/conv2d_transpose_op.cc
@@ -12,18 +12,18 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/operators/conv2d_transpose_op.h"
+#include "paddle/operators/conv_transpose_op.h"

 namespace paddle {
 namespace operators {

-void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
+void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of Conv2DTransposeOp should not be null.");
+                 "Input(Input) of ConvTransposeOp should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                 "Input(Filter) of Conv2DTransposeOp should not be null.");
+                 "Input(Filter) of ConvTransposeOp should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                 "Output(Output) of Conv2DTransposeOp should not be null.");
+                 "Output(Output) of ConvTransposeOp should not be null.");

  auto in_dims = ctx->GetInputDim("Input");
  auto filter_dims = ctx->GetInputDim("Filter");
@@ -35,17 +35,27 @@ void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
                      "No Padding allowed in conv transpose op.");
  }

-  PADDLE_ENFORCE_EQ(in_dims.size(), 4,
-                    "Conv2DTransposeOp input should be 4-D tensor.");
-  PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
-                    "Conv2DTransposeOp filter should be 4-D tensor.");
+  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+                 "ConvTransposeOp intput should be 4-D or 5-D tensor.");
+  PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
+                    "ConvTransposeOp input dimension and filter dimension "
+                    "should be the same.");
+  PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U,
+                 "ConvTransposeOp input dimension and strides dimension should "
+                 "be consistent.");
+  PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
+                    "ConvTransposeOp paddings dimension and Conv strides "
+                    "dimension should be the same.");
  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
-                    "input and kernel input dimension should be equal.");
+                    "In ConvTransposeOp, The input channel should be the same "
+                    "as the number of filters.");

-  auto output_height = (in_dims[2] - 1) * strides[0] + filter_dims[2];
-  auto output_width = (in_dims[3] - 1) * strides[1] + filter_dims[3];
-  ctx->SetOutputDim("Output",
-                    {in_dims[0], filter_dims[1], output_height, output_width});
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] +
+                           filter_dims[i + 2]);
+  }
+  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }

 Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
@@ -54,37 +64,109 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
  AddInput(
      "Input",
      "(Tensor) The input tensor of convolution transpose operator. "
-      "The format of input tensor is NCHW, where N is batch size, C is the "
-      "number of input channels, H is the height of the image, and "
-      "W is the width of the image.");
+      "The format of input tensor is NCHW. Where N is batch size, C is the "
+      "number of input channels, H is the height of the feature, and "
+      "W is the width of the feature.");
  AddInput("Filter",
-           "(Tensor) The filter tensor of convolution transpose operator."
+           "(Tensor) The filter tensor of convolution transpose operator. "
           "The format of the filter tensor is CMHW, where C is the number of "
           "output image channels, M is the number of input image channels, "
           "H is the height of the filter, and W is the width of the filter. "
           "We enforce groups number == 1 and padding == 0 in "
           "the convolution transpose scenario.");
  AddOutput("Output",
-            "(Tensor) The output tensor of convolution transpose operator."
+            "(Tensor) The output tensor of convolution transpose operator. "
            "The format of output tensor is also NCHW.");
-  AddAttr<std::vector<int>>("strides",
-                            "strides of convolution transpose operator.")
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector defalut:{1, 1}), strides of convolution transpose operator.")
      .SetDefault({1, 1});
-  AddAttr<std::vector<int>>("paddings",
-                            "paddings of convolution transpose operator.")
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector defalut:{0, 0}), paddings of convolution transpose operator.")
      .SetDefault({0, 0});
  AddComment(R"DOC(
-Convolution Transpose Operator.
-
-The convolution transpose operation calculates the output based on the input, 
-filter, strides, paddings, and groups parameters. The size of each dimension 
-of the parameters is checked in the infer-shape method.
+Convolution2D Transpose Operator.
+
+The convolution transpose operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+
+Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
+size, C is the number of channels, H is the height of the feature, and 
+W is the width of the feature. Parameters(ksize, strides, paddings) are two elements.
+These two elements represent height and width, respectively.
+The input(X) size and output(Out) size may be different.
+Example:
+  Input:
+       Input shape: (N, C_in, H_in, W_in)
+       Filter shape: (C_in, C_out, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, H_out, W_out)
+  where
+       H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
+       W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
+)DOC");
+}

+Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
+    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput("Input",
+           "(Tensor) The input tensor of convolution transpose operator."
+           "The format of input tensor is NCDHW. Where N is batch size, C is "
+           "the number of channels, D is the depth of the feature, H is the "
+           "height of the feature, and "
+           "W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution transpose operator."
+           "The format of the filter tensor is CMDHW, where C is the number of "
+           "output image channels, M is the number of input image channels, D "
+           "is the depth of the filter, H is the height of the filter, and "
+           "W is the width of the filter."
+           "We enforce groups number == 1 and padding == 0 in "
+           "the convolution3d transpose scenario.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution transpose operator."
+            "The format of output tensor is also NCDHW."
+            "Where N is batch size, C is "
+            "the number of channels, D is the depth of the feature, H is the "
+            "height of the feature, and W is the width of the feature.");
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector defalut:{1, 1, 1}), strides of convolution transpose operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector defalut:{0, 0, 0}), paddings of convolution transpose operator.")
+      .SetDefault({0, 0, 0});
+  AddComment(R"DOC(
+Convolution3D Transpose Operator.
+
+The convolution transpose operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+
+Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
+size, C is the number of channels, D is the depth of the feature, 
+H is the height of the feature, and W is the width of the feature. 
+Parameters(ksize, strides, paddings) are three elements.
+These three elements represent depth, height and width, respectively.
+The input(X) size and output(Out) size may be different.
+Example:
+  Input:
+       Input shape: (N, C_in, D_in, H_in, W_in)
+       Filter shape: (C_in, C_out, D_f, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, D_out, H_out, W_out)
+  where
+       D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
+       H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
+       W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2];
 )DOC");
 }

-void Conv2DTransposeOpGrad::InferShape(
-    framework::InferShapeContext* ctx) const {
+void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const {
  auto in_dims = ctx->GetInputDim("Input");
  auto filter_dims = ctx->GetInputDim("Filter");
  if (ctx->HasOutput(framework::GradVarName("Input"))) {
@@ -99,13 +181,23 @@ void Conv2DTransposeOpGrad::InferShape(
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(conv2d_transpose, ops::Conv2DTransposeOp,
-            ops::Conv2DTransposeOpMaker, conv2d_transpose_grad,
-            ops::Conv2DTransposeOpGrad);
+
+REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
+            conv2d_transpose_grad, ops::ConvTransposeOpGrad);

 REGISTER_OP_CPU_KERNEL(
    conv2d_transpose,
-    ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
    conv2d_transpose_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+
+REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
+            conv3d_transpose_grad, ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/conv2d_transpose_op.cu
+++ b/paddle/operators/conv2d_transpose_op.cu
@@ -12,13 +12,20 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "paddle/operators/conv2d_transpose_op.h"
+#include "paddle/operators/conv_transpose_op.h"

 namespace ops = paddle::operators;

 REGISTER_OP_GPU_KERNEL(
    conv2d_transpose,
-    ops::GemmConv2DTransposeKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
    conv2d_transpose_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
+
+REGISTER_OP_GPU_KERNEL(
+    conv3d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    conv3d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/conv2d_transpose_op.h
+++ b/paddle/operators/conv2d_transpose_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/im2col.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/vol2col.h"

 namespace paddle {
 namespace operators {
@@ -33,7 +34,13 @@ class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
                         framework::OpAttrChecker* op_checker);
 };

-class Conv2DTransposeOp : public framework::OperatorWithKernel {
+class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv3DTransposeOpMaker(framework::OpProto* proto,
+                         framework::OpAttrChecker* op_checker);
+};
+
+class ConvTransposeOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

@@ -41,7 +48,7 @@ class Conv2DTransposeOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext* ctx) const override;
 };

-class Conv2DTransposeOpGrad : public framework::OperatorWithKernel {
+class ConvTransposeOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

@@ -50,41 +57,44 @@ class Conv2DTransposeOpGrad : public framework::OperatorWithKernel {
 };

 template <typename Place, typename T>
-class GemmConv2DTransposeKernel : public framework::OpKernel<T> {
+class GemmConvTransposeKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* input = context.Input<Tensor>("Input");
    // The filter will be reshaped, so it should not be constant pointer
    Tensor filter = *context.Input<Tensor>("Filter");
-
    Tensor* output = context.Output<Tensor>("Output");

    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-
    // TODO(Zhuoyuan): Paddings can be added in future.
-    // groups will alway be disabled in conv2d_transpose.
-
-    const int batch_size = input->dims()[0];
-    const int m = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-
-    const int k_h = filter.dims()[2];
-    const int k_w = filter.dims()[3];
-
-    const int c = output->dims()[1];  // output channels
-    const int o_h = output->dims()[2];
-    const int o_w = output->dims()[3];
-
-    paddle::operators::math::Col2ImFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        col2im;
-
-    // use col_shape in the im2col and col2im calculation
-    DDim col_shape = {c, k_h, k_w, h, w};
+    // groups will alway be disabled in conv2dtranspose.
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // input_shape_vec: {h, w} or {d, h, w}
+    std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
+    input_shape_vec.erase(input_shape_vec.begin(), input_shape_vec.begin() + 2);
+
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
+    filter_shape_vec.erase(filter_shape_vec.begin(),
+                           filter_shape_vec.begin() + 2);
+
+    // use col_shape in the im2col and col2im (or vol2col and col2vol)
+    // calculation
+    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(output->dims()[1]);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), input_shape_vec.begin(),
+                         input_shape_vec.end());
+    DDim col_shape(framework::make_ddim(col_shape_vec));

    // use col_matrix_shape in the gemm calculation
-    DDim col_matrix_shape = {c * k_h * k_w, h * w};
+    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+    DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);

    Tensor col;
    col.mutable_data<T>(col_shape, context.GetPlace());
@@ -95,160 +105,189 @@ class GemmConv2DTransposeKernel : public framework::OpKernel<T> {
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);

-    DDim output_shape = {c, o_h, o_w};
-    DDim input_matrix_shape = {m, h * w};
+    // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
+    DDim output_shape =
+        framework::slice_ddim(output->dims(), 1, output->dims().size());

-    DDim filter_matrix_shape = {m, c * k_h * k_w};
-    filter.Resize(filter_matrix_shape);
+    // input matrix size: (m, h * w) or (m, d * h * w)
+    DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};

-    // convolution transpose: gemm + col2im (similar to conv-backward on input)
+    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
+    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
+    filter.Resize(filter_matrix_shape);

    output->mutable_data<T>(context.GetPlace());
-    auto t = framework::EigenVector<T>::Flatten(*output);
-    t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+    math::SetConstant<Place, T> set_zero;
+    set_zero(context.device_context(), output, static_cast<T>(0));

+    // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
+    // on input)
    for (int i = 0; i < batch_size; i++) {
-      // batch with size (M, h * w)
+      // batch with size (m, h * w) or (m, d * h * w)
      Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
-      // filter size: (M, c * k_h * k_w)

-      // output size: (c, o_h, o_w)
+      // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
      Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);

      // col_matrix = filter * input_batch
-      // of shape (c * k_h * k_w, h * w)
+      // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
      math::matmul<Place, T>(context.device_context(), filter, true,
-                             input_batch, false, T(1.0), &col_matrix, T(0.0));
-      col2im(context.device_context(), output_batch, col, strides[0],
-             strides[1], 0, 0, 0, 0);
+                             input_batch, false, static_cast<T>(1.0),
+                             &col_matrix, static_cast<T>(0.0));
+
+      if (filter_shape_vec.size() == 2) {
+        // col2im: col_matrix -> dy
+        // from (c * k_h * k_w, h * w) to (c, o_h, o_w)
+        math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
+
+        col2im(context.device_context(), output_batch, col, strides[0],
+               strides[1], 0, 0, 0, 0);
+      } else if (filter_shape_vec.size() == 3) {
+        // col2vol: col_matrix -> dy
+        // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
+        math::Col2VolFunctor<Place, T> col2vol;
+        col2vol(context.device_context(), output_batch, col, strides[0],
+                strides[1], strides[2], 0, 0, 0);
+      }
    }
  }
 };

 template <typename Place, typename T>
-class GemmConv2DTransposeGradKernel : public framework::OpKernel<T> {
+class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* input = context.Input<Tensor>("Input");
    const Tensor* output_grad =
        context.Input<Tensor>(framework::GradVarName("Output"));
-
    // For filter, we do not use const pointer b/c we will do reshape,
    // but we should avoid modifying its value.
    Tensor filter = *context.Input<Tensor>("Filter");
-
    Tensor* input_grad =
        context.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad =
        context.Output<Tensor>(framework::GradVarName("Filter"));

+    if ((!input_grad) && (!filter_grad)) return;
+
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    // Actually, no paddings and groups allowed in conv transpose.
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");

-    const int batch_size = input->dims()[0];
-    const int m = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
+    const int batch_size = static_cast<int>(input->dims()[0]);

-    const int k_h = filter.dims()[2];
-    const int k_w = filter.dims()[3];
+    // input_shape_vec: {h, w} or {d, h, w}
+    std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
+    input_shape_vec.erase(input_shape_vec.begin(), input_shape_vec.begin() + 2);

-    const int c = output_grad->dims()[1];  // output channels
-    const int o_h = output_grad->dims()[2];
-    const int o_w = output_grad->dims()[3];
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
+    filter_shape_vec.erase(filter_shape_vec.begin(),
+                           filter_shape_vec.begin() + 2);

-    // Only im2col functor required for bp to get to the right shape
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
-
-    // use col_shape in the im2col and col2im calculation
-    DDim col_shape = {c, k_h, k_w, h, w};
+    // use col_shape in the im2col and col2im (or vol2col and col2vol)
+    // calculation
+    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(output_grad->dims()[1]);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), input_shape_vec.begin(),
+                         input_shape_vec.end());
+    DDim col_shape(framework::make_ddim(col_shape_vec));

    // use col_matrix_shape in the gemm calculation
-    DDim col_matrix_shape_f = {c * h * w, k_h * k_w};
+    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+    DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);

-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
+    // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
+    DDim output_shape = framework::slice_ddim(output_grad->dims(), 1,
+                                              output_grad->dims().size());

-    DDim output_shape = {c, o_h, o_w};
-    DDim input_matrix_shape = {m, h * w};
+    // input matrix size: (m, h * w) or (m, d * h * w)
+    DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};

-    DDim filter_matrix_shape = {m, c * k_h * k_w};
+    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
+    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
    filter.Resize(filter_matrix_shape);

    // convolution transpose grad on input:
    // im2col + gemm (similar to conv-forward)
    // input need to compute gradient
-    if (input_grad) {
+    if (input_grad || filter_grad) {
+      Tensor col;
+      col.mutable_data<T>(col_shape, context.GetPlace());
+      // col_matrix shares the same piece of data with col,
+      // but will be reshaped into a two-dimensional matrix shape
+      // to call the matrix multiplication interface.
      Tensor col_matrix;
      col_matrix.ShareDataWith(col);
-      DDim col_matrix_shape = {c * k_h * k_w, h * w};
      col_matrix.Resize(col_matrix_shape);

-      input_grad->mutable_data<T>(context.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      Tensor filter_grad_;
+      math::SetConstant<Place, T> set_zero;
+
+      if (input_grad) {
+        input_grad->mutable_data<T>(context.GetPlace());
+        set_zero(context.device_context(), input_grad, static_cast<T>(0));
+      }
+      if (filter_grad) {  // filter size (m, c, k_h, k_w)
+        filter_grad->mutable_data<T>(context.GetPlace());
+        set_zero(context.device_context(), filter_grad, static_cast<T>(0));
+        filter_grad_ = *filter_grad;
+        filter_grad_.Resize(filter_matrix_shape);
+      }

      for (int i = 0; i < batch_size; i++) {
        // batch with size (c, o_h * o_w)
        Tensor output_grad_batch =
            output_grad->Slice(i, i + 1).Resize(output_shape);
-        // filter of size (m, c * k_h * k_w)

-        // batch with size (m, h, w)
-        Tensor input_grad_batch =
-            input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
-
-        // im2col: dy from (c, o_h, o_w) -> (c * k_h * k_w, h * w)
-        im2col(context.device_context(), output_grad_batch, col, strides[0],
-               strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
-
-        // gemm: dx = filter * dy
-        // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, c, h)
-        math::matmul<Place, T>(context.device_context(), filter, false,
-                               col_matrix, false, T(1.0), &input_grad_batch,
-                               T(0.0));
-      }
-    }
-
-    // filter gradient required
-    if (filter_grad) {
-      Tensor col_matrix_f;
-      col_matrix_f.ShareDataWith(col);
-      DDim col_matrix_shape_f = {c * h * w, k_h * k_w};
-      col_matrix_f.Resize(col_matrix_shape_f);
-
-      filter_grad->mutable_data<T>(context.GetPlace());
-      Tensor filter_grad_ = *filter_grad;
-      filter_grad_.Resize(filter_matrix_shape);
-      auto t = framework::EigenVector<T>::Flatten(filter_grad_);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
-
-      for (int i = 0; i < batch_size; ++i) {
-        // batch with size (c, o_h, o_w)
-        Tensor output_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_shape);
-        // input batch
-        Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
-
-        // im2col: (c * h * w, k_h * k_w)
-        im2col(context.device_context(), output_grad_batch, col, strides[0],
-               strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
-
-        // gemm: d_filter = x * y_grad^T
-        // (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, c, h)
-        math::matmul<Place, T>(context.device_context(), in_batch, false,
-                               col_matrix_f, true, T(1.0), &filter_grad_,
-                               T(1.0));
+        if (filter_shape_vec.size() == 2) {
+          // im2col: dy -> col matrix
+          // from (c, o_h, o_w) to (c * k_h * k_w, h * w)
+          math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+          im2col(context.device_context(), output_grad_batch, col, strides[0],
+                 strides[1], paddings[0], paddings[0], paddings[1],
+                 paddings[1]);
+        } else if (filter_shape_vec.size() == 3) {
+          // vol2col: dy -> col_matrix
+          // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w)
+          math::Vol2ColFunctor<Place, T> vol2col;
+          vol2col(context.device_context(), output_grad_batch, col, strides[0],
+                  strides[1], strides[2], paddings[0], paddings[1],
+                  paddings[2]);
+        }
+
+        if (input_grad) {
+          // batch with size (m, h, w)
+          Tensor input_grad_batch =
+              input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
+          // gemm: dx = filter * dy
+          // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, h * w)
+          // or
+          // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
+          // d, h, w)
+          math::matmul<Place, T>(context.device_context(), filter, false,
+                                 col_matrix, false, static_cast<T>(1.0),
+                                 &input_grad_batch, static_cast<T>(0.0));
+        }
+        if (filter_grad) {
+          // input batch
+          Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
+          // gemm: d_filter = x * dy^T
+          // (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, k_h * k_w)
+          // or
+          // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
+          // k_h * k_w)
+          math::matmul<Place, T>(context.device_context(), in_batch, false,
+                                 col_matrix, true, static_cast<T>(1.0),
+                                 &filter_grad_, static_cast<T>(1.0));
+        }
      }
    }
  }
 };
-
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -114,21 +114,17 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
             "where N is the batch size and D is the number of classes. "
             "This input is a probability computed by the previous operator, "
             "which is almost always the result of a softmax operator.");
-    AddInput(
-        "Label",
-        "(Tensor, default Tensor<int>), the ground truth which is "
-        "a 2-D tensor. "
-        "When soft_label is set to false, Label is a Tensor<int> with shape "
-        "[N x 1]. "
-        "When soft_label is set to true, Label is a Tensor<float/double> "
-        "with shape [N x K].");
+    AddInput("Label",
+             "(Tensor), the ground truth which is a 2-D tensor. When "
+             "soft_label is set to false, Label is a Tensor<int64> with shape "
+             "[N x 1]. When soft_label is set to true, Label is a "
+             "Tensor<float/double> with shape [N x K].");
    AddOutput("Y",
-              "(Tensor, default Tensor<float>), a 2-D tensor "
-              "with shape [N x 1]. The cross entropy loss.");
-    AddAttr<bool>(
-        "soft_label",
-        "(bool, default false), a flag to indicate whether to interpretate "
-        "the given labels as soft labels.")
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
+              "[N x 1]. The cross entropy loss.");
+    AddAttr<bool>("soft_label",
+                  "(bool, default false), a flag indicating whether to "
+                  "interpretate the given labels as soft labels.")
        .SetDefault(false);
    AddComment(R"DOC(
 CrossEntropy Operator.

--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -34,15 +34,18 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
    std::vector<int64_t> shape_int64(shape.size(), 0);
    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
                   [](int a) { return static_cast<int64_t>(a); });
-    auto dims = framework::make_ddim(shape_int64);
+    auto output_dim = framework::make_ddim(shape_int64);

-    int dim_idx = ctx->Attrs().Get<int>("dim_idx");
-    PADDLE_ENFORCE_GE(dim_idx, 0);
-    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), dim_idx);
-    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), dim_idx);
+    int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
+    PADDLE_ENFORCE_GE(input_dim_idx, 0);
+    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);

-    dims[dim_idx] = ctx->GetInputDim("Input")[dim_idx];
-    ctx->SetOutputDim("Out", dims);
+    int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
+    PADDLE_ENFORCE_GE(output_dim_idx, 0);
+    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
+
+    output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
+    ctx->SetOutputDim("Out", output_dim);
  }

 protected:
@@ -69,8 +72,11 @@ class FillConstantBatchSizeLikeOpMaker
              "(Tensor) Tensor of specified shape will be filled "
              "with the specified value");
    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
-    AddAttr<int>("dim_idx",
-                 "(int, default 0) The index of batch size dimension")
+    AddAttr<int>("input_dim_idx",
+                 "(int, default 0) the index of input's batch size dimension")
+        .SetDefault(0);
+    AddAttr<int>("output_dim_idx",
+                 "(int, default 0) the index of output's batch size dimension")
        .SetDefault(0);
    AddAttr<float>("value", "(float, default 0) The value to be filled")
        .SetDefault(0.0f);
@@ -86,9 +92,10 @@ Fill up a variable with specified constant value.
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(fill_constant_batch_size_like,
-                             ops::FillConstantBatchSizeLikeOp,
-                             ops::FillConstantBatchSizeLikeOpMaker);
+REGISTER_OPERATOR(fill_constant_batch_size_like,
+                  ops::FillConstantBatchSizeLikeOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::FillConstantBatchSizeLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
    fill_constant_batch_size_like,
    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>,

--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -35,7 +35,9 @@ class FillConstantOp : public framework::OperatorWithKernel {
 protected:
  framework::DataType IndicateDataType(
      const framework::ExecutionContext &ctx) const override {
-    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+    int data_type = ctx.Attr<int>("data_type");
+    VLOG(10) << " FillConstant data_type = " << data_type;
+    return static_cast<framework::DataType>(data_type);
  }
 };

@@ -71,4 +73,5 @@ REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
 REGISTER_OP_CPU_KERNEL(
    fill_constant, ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>,
    ops::FillConstantOpKernel<paddle::platform::CPUPlace, double>,
-    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>);
+    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>,
+    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int64_t>);
--- a/paddle/operators/fill_constant_op.cu
+++ b/paddle/operators/fill_constant_op.cu
@@ -20,4 +20,5 @@ namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
    fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>,
    ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>);
+    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>,
+    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int64_t>);
--- a/paddle/operators/gru_op.cc
+++ b/paddle/operators/gru_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/gru_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class GRUOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
+                   "Output(%s) of GRUOp should not be null.", "BatchGate");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
+                   "Output(%s) of GRUOp should not be null.",
+                   "BatchResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
+                   "Output(%s) of GRUOp should not be null.", "BatchHidden");
+    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
+                   "Output(%s) of GRUOp should not be null.", "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int input_size = input_dims[1];
+    int frame_size = weight_dims[0];
+    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
+                      "The input_size must be 3 times of frame_size in GRUOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_dims[1], frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("H0")) {
+      auto h0_dims = ctx->GetInputDim("H0");
+      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
+                        "The width of H0 must be equal to frame_size.");
+    }
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+    }
+    ctx->SetOutputDim("BatchGate", input_dims);
+    ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size});
+    ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size});
+    ctx->SetOutputDim("Hidden", {input_dims[0], frame_size});
+    ctx->ShareLoD("Input", "Hidden");
+  }
+};
+
+class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GRUOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LoDTensor) The first input is a LodTensor, which supports "
+             "variable-time length input sequence. The underlying tensor in "
+             "this LoDTenosr is a matrix with shape (T X 3D), where, T is the "
+             "total time steps in this mini-batch, D is the hidden size.");
+    AddInput("H0",
+             "(Tensor, optional) The initial hidden state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size, D is the hidden size.")
+        .AsDispensable();
+    AddInput(
+        "Weight",
+        "(Tensor) The learnable hidden-hidden weight matrix with shape "
+        "(D x 3D), where D is the hidden size. The elements continuous in "
+        "memory can be divided into two parts. The first part are weights of "
+        "the update gate and reset gate with shape (D x 2D), and the second "
+        "part are weights of output candidate with shape (D x D).");
+    AddInput("Bias",
+             "(Tensor, optional) Bias vector with shape (1 x 3D) concating "
+             "bias of the update gate, reset gate and output candidate.")
+        .AsDispensable();
+    AddOutput("BatchGate",
+              "(LoDTensor) To compute with batches, sequence data will be "
+              "reorganized into several successive batches each containing "
+              "data from the same time step. The LoDTensor BatchGate contains "
+              "the update gate, reset gate and output candidate values "
+              "organized in batches. The LoD size is 2. The first LoD contains "
+              "the batch offsets and the second LoD contains the indexes in "
+              "the raw sequence data.")
+        .AsIntermediate();
+    AddOutput(
+        "BatchResetHiddenPrev",
+        "(LoDTensor) The reseted hidden state LoDTensor organized in batches. "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.")
+        .AsIntermediate();
+    AddOutput(
+        "BatchHidden",
+        "(LoDTensor) The hidden state LoDTensor organized in batches.  "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.")
+        .AsIntermediate();
+    AddOutput(
+        "Hidden",
+        "(LoDTensor) the hidden state LoDTensor organized in sequences. "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.");
+    AddAttr<std::string>("activation",
+                         "(string, default tanh) "
+                         "The activation type used for output candidate {h}_t.")
+        .SetDefault("tanh");
+    AddAttr<std::string>(
+        "gate_activation",
+        "(string, default sigmoid) "
+        "The activation type used in update gate and reset gate.")
+        .SetDefault("sigmoid");
+    AddAttr<bool>("is_reverse",
+                  "(bool, defalut: False) "
+                  "whether to compute reversed GRU.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+GRU Operator implements part calculations of the complete GRU as following:
+
+\f[
+update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
+\f]
+
+@note To implement the complete GRU, fully-connected operator must be used  
+before to feed xu, xr and xc as the Input of GRU operator.
+)DOC");
+  }
+};
+
+class GRUGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUGradOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUGradOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
+                   "Input(%s) of GRUGradOp should not be null.", "BatchGate");
+    PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"),
+                   "Input(%s) of GRUGradOp should not be null.",
+                   "BatchResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasInput("BatchHidden"),
+                   "Input(%s) of GRUOp should not be null.", "BatchHidden");
+    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
+                   "Input(%s) of GRUGradOp should not be null.", "Hidden");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
+                   "Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int input_size = input_dims[1];
+    int frame_size = weight_dims[0];
+    int weight_height = weight_dims[0];
+    int weight_width = weight_dims[1];
+    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
+                      "The input_size must be 3 times of frame_size in GRUOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_height, frame_size,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    PADDLE_ENFORCE_EQ(
+        weight_width, frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("H0")) {
+      auto h0_dims = ctx->GetInputDim("H0");
+      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
+                        "The width of H0 must be equal to frame_size.");
+      auto h0_grad_name = framework::GradVarName("H0");
+      if (ctx->HasOutput(h0_grad_name))
+        ctx->SetOutputDim(h0_grad_name, h0_dims);
+    }
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name))
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+    auto input_grad_name = framework::GradVarName("Input");
+    if (ctx->HasOutput(input_grad_name))
+      ctx->SetOutputDim(input_grad_name, input_dims);
+    auto weight_grad_name = framework::GradVarName("Weight");
+    if (ctx->HasOutput(weight_grad_name))
+      ctx->SetOutputDim(weight_grad_name, weight_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
+REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel<paddle::platform::CPUPlace, float>,
+                       ops::GRUKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(gru_grad,
+                       ops::GRUGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::GRUGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/gru_op.cu
+++ b/paddle/operators/gru_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/gru_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel<paddle::platform::GPUPlace, float>,
+                       ops::GRUKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(gru_grad,
+                       ops::GRUGradKernel<paddle::platform::GPUPlace, float>,
+                       ops::GRUGradKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence2batch.h"
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class GRUKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    auto* input = context.Input<LoDTensor>("Input");
+    auto* h0 = context.Input<Tensor>("H0");
+    const T* h0_data = h0 ? h0->data<T>() : nullptr;
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* bias = context.Input<Tensor>("Bias");
+    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(context.GetPlace());
+    auto* batch_reset_hidden_prev =
+        context.Output<LoDTensor>("BatchResetHiddenPrev");
+    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
+    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
+    batch_hidden->mutable_data<T>(context.GetPlace());
+    auto* hidden = context.Output<LoDTensor>("Hidden");
+    hidden->mutable_data<T>(context.GetPlace());
+
+    context.ShareLoD("Input", "Hidden");
+
+    auto hidden_dims = hidden->dims();
+
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    math::LoDTensor2BatchFunctor<Place, T> to_batch;
+    to_batch(context.device_context(), *input, *batch_gate, true, is_reverse);
+
+    int frame_size = hidden_dims[1];
+    int batch_size = hidden_dims[0];
+    auto g = EigenMatrix<T>::From(*batch_gate);
+    auto place = context.GetEigenDevice<Place>();
+    if (bias) {
+      auto b = EigenMatrix<T>::From(*bias);
+      g.device(place) = g +
+                        b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
+                            .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
+    }
+
+    math::hl_gru_value<T> gru_value;
+    gru_value.gateWeight = const_cast<T*>(weight_data);
+    gru_value.stateWeight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+    gru_value.prevOutValue = const_cast<T*>(h0_data);
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    for (size_t n = 0; n < num_batch; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+      int cur_batch_size = bend - bstart;
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
+      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+      gru_value.outputValue = hidden_t.data<T>();
+      gru_value.gateValue = gate_t.data<T>();
+      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
+      math::GRUUnitFunctor<Place, T>::compute(
+          context.device_context(), gru_value, frame_size, cur_batch_size,
+          math::ActiveType(context.Attr<std::string>("activation")),
+          math::ActiveType(context.Attr<std::string>("gate_activation")));
+      gru_value.prevOutValue = gru_value.outputValue;
+    }
+
+    math::Batch2LoDTensorFunctor<Place, T> to_seq;
+    batch_hidden->set_lod(batch_gate->lod());
+    to_seq(context.device_context(), *batch_hidden, *hidden);
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
+
+template <typename Place, typename T>
+class GRUGradKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    auto* h0 = context.Input<Tensor>("H0");
+    const T* h0_data = h0 ? h0->data<T>() : nullptr;
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* batch_gate = context.Input<LoDTensor>("BatchGate");
+    auto* batch_reset_hidden_prev =
+        context.Input<LoDTensor>("BatchResetHiddenPrev");
+    auto* batch_hidden = context.Input<LoDTensor>("BatchHidden");
+    auto* hidden = context.Input<LoDTensor>("Hidden");
+    auto* hidden_grad =
+        context.Input<LoDTensor>(framework::GradVarName("Hidden"));
+    auto* input_grad =
+        context.Output<LoDTensor>(framework::GradVarName("Input"));
+    auto* h0_grad = context.Output<Tensor>(framework::GradVarName("H0"));
+    auto* weight_grad =
+        context.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto gate_dims = batch_gate->dims();
+    auto hidden_dims = hidden->dims();
+    int frame_size = hidden_dims[1];
+
+    math::LoDTensor2BatchFunctor<Place, T> to_batch;
+    LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad;
+    batch_hidden_grad.mutable_data<T>(hidden_dims, context.GetPlace());
+    batch_gate_grad.mutable_data<T>(gate_dims, context.GetPlace());
+    batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims,
+                                                 context.GetPlace());
+    math::SetConstant<Place, T> zero;
+    zero(context.device_context(), &batch_hidden_grad, static_cast<T>(0.0));
+    zero(context.device_context(), &batch_gate_grad, static_cast<T>(0.0));
+    zero(context.device_context(), &batch_reset_hidden_prev_grad,
+         static_cast<T>(0.0));
+
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    batch_hidden_grad.set_lod(batch_hidden->lod());
+    to_batch(context.device_context(), *hidden_grad, batch_hidden_grad, false,
+             is_reverse);
+
+    math::hl_gru_value<T> gru_value;
+    gru_value.gateWeight = const_cast<T*>(weight_data);
+    gru_value.stateWeight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+
+    math::hl_gru_grad<T> gru_grad;
+    if (weight_grad) {
+      gru_grad.gateWeightGrad =
+          weight_grad->mutable_data<T>(context.GetPlace());
+      zero(context.device_context(), weight_grad, static_cast<T>(0.0));
+      gru_grad.stateWeightGrad =
+          weight_grad->data<T>() + 2 * frame_size * frame_size;
+    } else {
+      gru_grad.gateWeightGrad = nullptr;
+      gru_grad.stateWeightGrad = nullptr;
+    }
+
+    auto batch_starts = batch_hidden_grad.lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+      int cur_batch_size = bend - bstart;
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      gru_value.gateValue = gate_t.data<T>();
+      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
+      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
+
+      Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
+      gru_grad.outputGrad = hidden_grad_t.data<T>();
+      Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
+      gru_grad.gateGrad = gate_grad_t.data<T>();
+      Tensor reset_hidden_prev_grad_t =
+          batch_reset_hidden_prev_grad.Slice(bstart, bend);
+      gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
+      if (n == 0) {
+        gru_value.prevOutValue = const_cast<T*>(h0_data);
+        if (h0_grad) {
+          T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace());
+          zero(context.device_context(), h0_grad, static_cast<T>(0.0));
+          gru_grad.prevOutGrad = h0_grad_data;
+        } else {
+          gru_grad.prevOutGrad = nullptr;
+        }
+      } else {
+        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
+        Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
+        gru_value.prevOutValue = hidden_prev_t.data<T>();
+        Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
+        gru_grad.prevOutGrad = hidden_prev_grad_t.data<T>();
+      }
+
+      math::GRUUnitGradFunctor<Place, T>::compute(
+          context.device_context(), gru_value, gru_grad, frame_size,
+          cur_batch_size,
+          math::ActiveType(context.Attr<std::string>("activation")),
+          math::ActiveType(context.Attr<std::string>("gate_activation")));
+    }
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      math::Batch2LoDTensorFunctor<Place, T> to_seq;
+      batch_gate_grad.set_lod(batch_gate->lod());
+      to_seq(context.device_context(), batch_gate_grad, *input_grad);
+    }
+    if (bias_grad) {
+      bias_grad->mutable_data<T>(context.GetPlace());
+      auto d_b = EigenMatrix<T>::From(*bias_grad);
+      auto d_g = EigenMatrix<T>::From(batch_gate_grad);
+      auto place = context.GetEigenDevice<Place>();
+      d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
@@ -31,7 +31,6 @@ class IncrementOp : public framework::OperatorWithKernel {
  }
 };

-template <typename AttrType>
 class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  IncrementOpMaker(framework::OpProto *proto,
@@ -39,10 +38,10 @@ class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input tensor of increment operator");
    AddOutput("Out", "(Tensor) The output tensor of increment operator.");
-    AddAttr<AttrType>("step",
-                      "(float, default 1.0) "
-                      "The step size by which the "
-                      "input tensor will be incremented.")
+    AddAttr<float>("step",
+                   "(float, default 1.0) "
+                   "The step size by which the "
+                   "input tensor will be incremented.")
        .SetDefault(1.0);
    AddComment(R"DOC(
 Increment Operator.
@@ -73,7 +72,10 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {

 namespace ops = paddle::operators;

-REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker<float>,
+REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker,
                  ops::IncrementGradOpMaker);
-REGISTER_OP_CPU_KERNEL(increment,
-                       ops::IncrementKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    increment, ops::IncrementKernel<paddle::platform::CPUPlace, float>,
+    ops::IncrementKernel<paddle::platform::CPUPlace, double>,
+    ops::IncrementKernel<paddle::platform::CPUPlace, int>,
+    ops::IncrementKernel<paddle::platform::CPUPlace, int64_t>);
--- a/paddle/operators/increment_op.cu
+++ b/paddle/operators/increment_op.cu
@@ -16,4 +16,7 @@

 REGISTER_OP_GPU_KERNEL(
    increment,
-    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>);
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>,
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, double>,
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int>,
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int64_t>);
--- a/paddle/operators/increment_op.h
+++ b/paddle/operators/increment_op.h
@@ -19,7 +19,7 @@

 namespace paddle {
 namespace operators {
-template <typename Place, typename T, typename AttrType = T>
+template <typename Place, typename T>
 class IncrementKernel : public framework::OpKernel<T> {
 public:
  virtual void Compute(const framework::ExecutionContext& context) const {
@@ -27,7 +27,7 @@ class IncrementKernel : public framework::OpKernel<T> {
    auto* in = context.Input<framework::Tensor>("X");
    tensor->mutable_data<T>(in->place());

-    auto step = static_cast<T>(context.Attr<AttrType>("step"));
+    auto step = static_cast<T>(context.Attr<float>("step"));

    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
    auto eigen_in = framework::EigenVector<T>::Flatten(*in);

--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -8,20 +8,24 @@ if(WITH_GPU)
    nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
    nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
    nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
+    nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
    nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context)
    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
    nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
+    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions)
 else()
    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
    cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
    cc_library(softmax SRCS softmax.cc DEPS operator)
    cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
    cc_library(pooling SRCS pooling.cc DEPS device_context)
+    cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
    cc_library(vol2col SRCS vol2col.cc DEPS device_context)
    cc_library(context_project SRCS context_project.cc DEPS device_context)
    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
    cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
+    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
 endif()

 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)

--- a/paddle/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/operators/math/detail/gru_cpu_kernel.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/gru_compute.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+#ifndef __NVCC__
+
+template <class OpResetOutput, typename T>
+void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
+                                       T *gateValue, T *resetOutputValue,
+                                       T *prevOutputValue, int frameSize,
+                                       activation_mode_t active_gate) {
+  T rValueUpdateGate;
+  T rValueResetGate;
+  T rValueResetOutput;
+  T rPrevOut = 0;
+  T *updateGate = gateValue;
+  T *resetGate = gateValue + frameSize;
+
+  for (int i = 0; i < frameSize; i++) {
+    rValueUpdateGate = updateGate[i];
+    rValueResetGate = resetGate[i];
+    if (prevOutputValue) {
+      rPrevOut = prevOutputValue[i];
+    }
+
+    opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
+                  rValueResetOutput, active_gate);
+
+    updateGate[i] = rValueUpdateGate;
+    resetGate[i] = rValueResetGate;
+    resetOutputValue[i] = rValueResetOutput;
+  }
+}
+
+template <class OpFinalOutput, typename T>
+void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput,
+                                       T *gateValue, T *prevOutputValue,
+                                       T *outputValue, int frameSize,
+                                       activation_mode_t active_node) {
+  T rValueUpdateGate;
+  T rValueFrameState;
+  T rPrevOut = 0;
+  T rOutput;
+  T *updateGate = gateValue;
+  T *frameState = gateValue + frameSize * 2;
+
+  for (int i = 0; i < frameSize; i++) {
+    rValueUpdateGate = updateGate[i];
+    rValueFrameState = frameState[i];
+    if (prevOutputValue) {
+      rPrevOut = prevOutputValue[i];
+    }
+
+    opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
+                  active_node);
+
+    frameState[i] = rValueFrameState;
+    outputValue[i] = rOutput;
+  }
+}
+
+template <class OpResetOutput, typename T>
+void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue,
+                                     T *resetOutputValue, T *prevOutputValue,
+                                     int frameSize,
+                                     activation_mode_t active_gate) {
+#ifdef __AVX__
+  __m256 rValueUpdateGate;
+  __m256 rValueResetGate;
+  __m256 rValueResetOutput;
+  __m256 rPrevOut = _mm256_set1_ps(0.0f);
+  __m256 *updateGate = (__m256 *)gateValue;
+  __m256 *resetGate = (__m256 *)(gateValue + frameSize);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rValueUpdateGate = updateGate[i];
+    rValueResetGate = resetGate[i];
+    if (prevOutputValue) {
+      rPrevOut = ((__m256 *)prevOutputValue)[i];
+    }
+
+    opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
+                  rValueResetOutput, active_gate);
+
+    updateGate[i] = rValueUpdateGate;
+    resetGate[i] = rValueResetGate;
+    ((__m256 *)resetOutputValue)[i] = rValueResetOutput;
+  }
+#endif
+}
+
+template <class OpFinalOutput, typename T>
+void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue,
+                                     T *prevOutputValue, T *outputValue,
+                                     int frameSize,
+                                     activation_mode_t active_node) {
+#ifdef __AVX__
+  __m256 rValueUpdateGate;
+  __m256 rValueFrameState;
+  __m256 rPrevOut = _mm256_set1_ps(0.0f);
+  __m256 rOutput;
+  __m256 *updateGate = (__m256 *)gateValue;
+  __m256 *frameState = (__m256 *)(gateValue + frameSize * 2);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rValueUpdateGate = updateGate[i];
+    rValueFrameState = frameState[i];
+    if (prevOutputValue) {
+      rPrevOut = ((__m256 *)prevOutputValue)[i];
+    }
+
+    opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
+                  active_node);
+
+    frameState[i] = rValueFrameState;
+    ((__m256 *)outputValue)[i] = rOutput;
+  }
+#endif
+}
+
+template <class OpResetOutput, typename T>
+inline void forward_reset_output(OpResetOutput opResetOutput,
+                                 hl_gru_value<T> value, int frameSize,
+                                 int batchSize, activation_mode_t active_gate) {
+  for (int b = 0; b < batchSize; b++) {
+    if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_forward_reset_output(
+          opResetOutput, value.gateValue, value.resetOutputValue,
+          value.prevOutValue, frameSize, active_gate);
+    } else {
+      hl_naive_gru_forward_reset_output(
+          opResetOutput, value.gateValue, value.resetOutputValue,
+          value.prevOutValue, frameSize, active_gate);
+    }
+
+    value.gateValue += frameSize * 3;
+    value.resetOutputValue += frameSize;
+    if (value.prevOutValue) {
+      value.prevOutValue += frameSize;
+    }
+  }
+}
+
+template <class OpFinalOutput, typename T>
+inline void forward_final_output(OpFinalOutput opFinalOutput,
+                                 hl_gru_value<T> value, int frameSize,
+                                 int batchSize, activation_mode_t active_node) {
+  for (int b = 0; b < batchSize; b++) {
+    if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue,
+                                      value.prevOutValue, value.outputValue,
+                                      frameSize, active_node);
+    } else {
+      hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue,
+                                        value.prevOutValue, value.outputValue,
+                                        frameSize, active_node);
+    }
+
+    value.gateValue += frameSize * 3;
+    value.outputValue += frameSize;
+    if (value.prevOutValue) {
+      value.prevOutValue += frameSize;
+    }
+  }
+}
+
+template <class OpStateGrad, typename T>
+void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
+                                      T *gateGrad, T *prevOutValue,
+                                      T *prevOutGrad, T *outputGrad,
+                                      int frameSize,
+                                      activation_mode_t active_node) {
+  T rUpdateGateValue;
+  T rUpdateGateGrad;
+  T rFrameStateValue;
+  T rFrameStateGrad;
+  T rOutGrad;
+  T rPrevOutValue = 0;
+  T rPrevOutGrad = 0;
+  T *updateGateValue = gateValue;
+  T *updateGateGrad = gateGrad;
+  T *frameStateValue = gateValue + frameSize * 2;
+  T *frameStateGrad = gateGrad + frameSize * 2;
+
+  for (int i = 0; i < frameSize; i++) {
+    rUpdateGateValue = updateGateValue[i];
+    rFrameStateValue = frameStateValue[i];
+    rOutGrad = outputGrad[i];
+    if (prevOutValue) {
+      rPrevOutValue = prevOutValue[i];
+    }
+    if (prevOutGrad) {
+      rPrevOutGrad = prevOutGrad[i];
+    }
+
+    opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
+                rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
+                active_node);
+
+    updateGateGrad[i] = rUpdateGateGrad;
+    frameStateGrad[i] = rFrameStateGrad;
+    if (prevOutGrad) {
+      prevOutGrad[i] = rPrevOutGrad;
+    }
+  }
+}
+
+template <class OpResetGrad, typename T>
+void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
+                                      T *gateGrad, T *prevOutValue,
+                                      T *prevOutGrad, T *resetOutputGrad,
+                                      int frameSize,
+                                      activation_mode_t active_gate) {
+  T rUpdateGateValue;
+  T rUpdateGateGrad;
+  T rResetGateValue;
+  T rResetGateGrad;
+  T rResetOutputGrad = 0;
+  T rPrevOutValue = 0;
+  T rPrevOutGrad = 0;
+  T *updateGateValue = gateValue;
+  T *updateGateGrad = gateGrad;
+  T *resetGateValue = gateValue + frameSize;
+  T *resetGateGrad = gateGrad + frameSize;
+
+  for (int i = 0; i < frameSize; i++) {
+    rUpdateGateValue = updateGateValue[i];
+    rUpdateGateGrad = updateGateGrad[i];
+    rResetGateValue = resetGateValue[i];
+
+    if (prevOutValue && prevOutGrad) {
+      rResetOutputGrad = resetOutputGrad[i];
+    }
+    if (prevOutValue) {
+      rPrevOutValue = prevOutValue[i];
+    }
+    if (prevOutGrad) {
+      rPrevOutGrad = prevOutGrad[i];
+    }
+
+    opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
+                rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
+                active_gate);
+
+    updateGateGrad[i] = rUpdateGateGrad;
+    resetGateGrad[i] = rResetGateGrad;
+    if (prevOutGrad) {
+      prevOutGrad[i] = rPrevOutGrad;
+    }
+  }
+}
+
+template <class OpStateGrad, typename T>
+void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
+                                    T *gateGrad, T *prevOutValue,
+                                    T *prevOutGrad, T *outputGrad,
+                                    int frameSize,
+                                    activation_mode_t active_node) {
+#ifdef __AVX__
+  __m256 rUpdateGateValue;
+  __m256 rUpdateGateGrad;
+  __m256 rFrameStateValue;
+  __m256 rFrameStateGrad;
+  __m256 rOutGrad;
+  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
+  __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
+  __m256 *updateGateValue = (__m256 *)gateValue;
+  __m256 *updateGateGrad = (__m256 *)gateGrad;
+  __m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2);
+  __m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rUpdateGateValue = updateGateValue[i];
+    rFrameStateValue = frameStateValue[i];
+    rOutGrad = ((__m256 *)outputGrad)[i];
+    if (prevOutValue) {
+      rPrevOutValue = ((__m256 *)prevOutValue)[i];
+    }
+    if (prevOutGrad) {
+      rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+    }
+
+    opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
+                rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
+                active_node);
+
+    updateGateGrad[i] = rUpdateGateGrad;
+    frameStateGrad[i] = rFrameStateGrad;
+    if (prevOutGrad) {
+      ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+    }
+  }
+#endif
+}
+
+template <class OpResetGrad, typename T>
+void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
+                                    T *gateGrad, T *prevOutValue,
+                                    T *prevOutGrad, T *resetOutputGrad,
+                                    int frameSize,
+                                    activation_mode_t active_gate) {
+#ifdef __AVX__
+  __m256 rUpdateGateValue;
+  __m256 rUpdateGateGrad;
+  __m256 rResetGateValue;
+  __m256 rResetGateGrad;
+  __m256 rResetOutputGrad = _mm256_set1_ps(0.0f);
+  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
+  __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
+  __m256 *updateGateValue = (__m256 *)gateValue;
+  __m256 *updateGateGrad = (__m256 *)gateGrad;
+  __m256 *resetGateValue = (__m256 *)(gateValue + frameSize);
+  __m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rUpdateGateValue = updateGateValue[i];
+    rUpdateGateGrad = updateGateGrad[i];
+    rResetGateValue = resetGateValue[i];
+
+    if (prevOutValue && prevOutGrad) {
+      rResetOutputGrad = ((__m256 *)resetOutputGrad)[i];
+    }
+    if (prevOutValue) {
+      rPrevOutValue = ((__m256 *)prevOutValue)[i];
+    }
+    if (prevOutGrad) {
+      rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+    }
+
+    opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
+                rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
+                active_gate);
+
+    updateGateGrad[i] = rUpdateGateGrad;
+    resetGateGrad[i] = rResetGateGrad;
+    if (prevOutGrad) {
+      ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+    }
+  }
+#endif
+}
+
+template <class OpStateGrad, typename T>
+inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value<T> value,
+                                hl_gru_grad<T> grad, int frameSize,
+                                int batchSize, activation_mode_t active_node) {
+  for (int b = 0; b < batchSize; b++) {
+    if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_backward_state_grad(
+          opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+    } else {
+      hl_naive_gru_backward_state_grad(
+          opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+    }
+
+    value.gateValue += frameSize * 3;
+    if (value.prevOutValue) {
+      value.prevOutValue += frameSize;
+    }
+
+    grad.gateGrad += frameSize * 3;
+    grad.outputGrad += frameSize;
+    if (grad.prevOutGrad) {
+      grad.prevOutGrad += frameSize;
+    }
+  }
+}
+
+template <class OpResetGrad, typename T>
+inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value<T> value,
+                                hl_gru_grad<T> grad, int frameSize,
+                                int batchSize, activation_mode_t active_gate) {
+  for (int b = 0; b < batchSize; b++) {
+    if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_backward_reset_grad(
+          opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+    } else {
+      hl_naive_gru_backward_reset_grad(
+          opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+    }
+
+    value.gateValue += frameSize * 3;
+    if (value.prevOutValue) {
+      value.prevOutValue += frameSize;
+    }
+
+    grad.gateGrad += frameSize * 3;
+    grad.resetOutputGrad += frameSize;
+    if (grad.prevOutGrad) {
+      grad.prevOutGrad += frameSize;
+    }
+  }
+}
+
+#endif
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/operators/math/detail/gru_gpu_kernel.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/platform/cuda_helper.h"
+#include "paddle/platform/device_context.h"
+
+#include <glog/logging.h>
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template <class OpResetOutput, bool isBatch, typename T>
+__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
+                                        T *gateValue, T *resetOutputValue,
+                                        T *prevOutputValue, int frameSize,
+                                        int batchSize,
+                                        activation_mode_t active_gate) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    resetOutputValue += batchIdx * frameSize;
+  }
+
+  T rPrevOut = 0;
+  T rValueResetOutput;
+  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
+  T rValueResetGate = gateValue[frameIdx + frameSize * 1];
+
+  if (prevOutputValue) {
+    if (isBatch) prevOutputValue += batchIdx * frameSize;
+    rPrevOut = prevOutputValue[frameIdx];
+  }
+
+  opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput,
+                active_gate);
+
+  gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
+  gateValue[frameIdx + frameSize * 1] = rValueResetGate;
+  resetOutputValue[frameIdx] = rValueResetOutput;
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template <class OpFinalOutput, bool isBatch, typename T>
+__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
+                                        T *gateValue, T *prevOutputValue,
+                                        T *outputValue, int frameSize,
+                                        int batchSize,
+                                        activation_mode_t active_node) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    outputValue += batchIdx * frameSize;
+  }
+
+  T rOutput;
+  T rPrevOut = 0;
+  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
+  T rValueFrameState = gateValue[frameIdx + frameSize * 2];
+
+  if (prevOutputValue) {
+    if (isBatch) prevOutputValue += batchIdx * frameSize;
+    rPrevOut = prevOutputValue[frameIdx];
+  }
+
+  opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
+                active_node);
+
+  gateValue[frameIdx + frameSize * 2] = rValueFrameState;
+  outputValue[frameIdx] = rOutput;
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template <class OpStateGrad, bool isBatch, typename T>
+__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue,
+                                       T *gateGrad, T *prevOutValue,
+                                       T *prevOutGrad, T *outputGrad,
+                                       int frameSize, int batchSize,
+                                       activation_mode_t active_node) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    gateGrad += batchIdx * 3 * frameSize;
+    outputGrad += batchIdx * frameSize;
+  }
+
+  T rUpdateGateGrad;
+  T rFrameStateGrad;
+  T rPrevOutValue = 0;
+  T rPrevOutGrad = 0;
+  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
+  T rFrameStateValue = gateValue[frameIdx + frameSize * 2];
+  T rOutGrad = outputGrad[frameIdx];
+
+  if (prevOutValue && prevOutGrad) {
+    if (isBatch) prevOutValue += batchIdx * frameSize;
+    rPrevOutValue = prevOutValue[frameIdx];
+
+    if (isBatch) prevOutGrad += batchIdx * frameSize;
+    rPrevOutGrad = prevOutGrad[frameIdx];
+  }
+
+  opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
+              rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
+              active_node);
+
+  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
+  gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
+  if (prevOutGrad) {
+    prevOutGrad[frameIdx] = rPrevOutGrad;
+  }
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template <class OpResetGrad, bool isBatch, typename T>
+__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue,
+                                       T *gateGrad, T *prevOutValue,
+                                       T *prevOutGrad, T *resetOutputGrad,
+                                       int frameSize, int batchSize,
+                                       activation_mode_t active_gate) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    gateGrad += batchIdx * 3 * frameSize;
+    resetOutputGrad += batchIdx * frameSize;
+  }
+
+  T rResetGateGrad;
+  T rPrevOutValue = 0;
+  T rPrevOutGrad = 0;
+  T rResetOutputGrad = 0;
+  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
+  T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0];
+  T rResetGateValue = gateValue[frameIdx + frameSize * 1];
+
+  if (prevOutValue && prevOutGrad) {
+    if (isBatch) prevOutValue += batchIdx * frameSize;
+    if (isBatch) prevOutGrad += batchIdx * frameSize;
+    rPrevOutValue = prevOutValue[frameIdx];
+    rPrevOutGrad = prevOutGrad[frameIdx];
+    rResetOutputGrad = resetOutputGrad[frameIdx];
+  }
+
+  opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
+              rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
+              active_gate);
+
+  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
+  gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
+  if (prevOutGrad) {
+    prevOutGrad[frameIdx] = rPrevOutGrad;
+  }
+}
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/detail/gru_kernel.h
+++ b/paddle/operators/math/detail/gru_kernel.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/platform/hostdevice.h"
+
+#include <type_traits>
+
+// TODO(guosheng): refine code style in gru_kernel
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+namespace forward {
+
+template <typename T>
+class gru_resetOutput {
+ public:
+  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut,
+                             T &valueResetOutput, activation_mode_t actGate) {
+    valueUpdateGate = activation(valueUpdateGate, actGate);
+    valueResetGate = activation(valueResetGate, actGate);
+    valueResetOutput = prevOut * valueResetGate;
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate,
+                             __m256 &prevOut, __m256 &valueResetOutput,
+                             activation_mode_t actGate) {
+    valueUpdateGate = activation(valueUpdateGate, actGate);
+    valueResetGate = activation(valueResetGate, actGate);
+    valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate);
+  }
+#endif
+#endif
+};
+
+template <typename T>
+class gru_finalOutput {
+ public:
+  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut,
+                             T &valueOutput, activation_mode_t actInput) {
+    valueFrameState = activation(valueFrameState, actInput);
+    valueOutput = prevOut - (valueUpdateGate * prevOut) +
+                  (valueUpdateGate * valueFrameState);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState,
+                             __m256 &prevOut, __m256 &valueOutput,
+                             activation_mode_t actInput) {
+    valueFrameState = activation(valueFrameState, actInput);
+    valueOutput = _mm256_add_ps(
+        _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)),
+        _mm256_mul_ps(valueUpdateGate, valueFrameState));
+  }
+#endif
+#endif
+};
+}  // namespace forward
+
+namespace backward {
+
+template <typename T>
+class gru_stateGrad {
+ public:
+  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
+                             T &valueFrameState, T &gradFrameState,
+                             T &valuePrevOut, T &gradPrevOut, T &gradOutput,
+                             activation_mode_t actInput) {
+    gradUpdateGate = (gradOutput * valueFrameState);
+    gradUpdateGate -= (gradOutput * valuePrevOut);
+    gradPrevOut -= (gradOutput * valueUpdateGate);
+    gradPrevOut += gradOutput;
+    gradFrameState =
+        activation(gradOutput * valueUpdateGate, valueFrameState, actInput);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
+                             __m256 &valueFrameState, __m256 &gradFrameState,
+                             __m256 &valuePrevOut, __m256 &gradPrevOut,
+                             __m256 &gradOutput, activation_mode_t actInput) {
+    gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState);
+    gradUpdateGate =
+        _mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut));
+    gradPrevOut = _mm256_add_ps(
+        _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)),
+        gradOutput);
+    gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate),
+                                valueFrameState, actInput);
+  }
+#endif
+#endif
+};
+
+template <typename T>
+class gru_resetGrad {
+ public:
+  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
+                             T &valueResetGate, T &gradResetGate,
+                             T &valuePrevOut, T &gradPrevOut,
+                             T &gradResetOutput, activation_mode_t actGate) {
+    gradResetGate = (gradResetOutput * valuePrevOut);
+    gradPrevOut += (gradResetOutput * valueResetGate);
+    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
+    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
+                             __m256 &valueResetGate, __m256 &gradResetGate,
+                             __m256 &valuePrevOut, __m256 &gradPrevOut,
+                             __m256 &gradResetOutput,
+                             activation_mode_t actGate) {
+    gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut);
+    gradPrevOut = _mm256_add_ps(gradPrevOut,
+                                _mm256_mul_ps(gradResetOutput, valueResetGate));
+    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
+    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+  }
+#endif
+#endif
+};
+
+}  // namespace backward
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/gru_compute.cc
+++ b/paddle/operators/math/gru_compute.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/detail/gru_cpu_kernel.h"
+#include "paddle/operators/math/detail/gru_kernel.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct GRUUnitFunctor<platform::CPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+#ifndef __NVCC__
+    if (value.prevOutValue) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, false, batchSize, frameSize * 2, frameSize, 1,
+          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
+          value.gateValue, frameSize * 3);
+    }
+
+    detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
+                                 frameSize, batchSize, active_gate);
+
+    if (value.prevOutValue) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, false, batchSize, frameSize, frameSize, 1,
+          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
+          value.gateValue + frameSize * 2, frameSize * 3);
+    }
+
+    detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
+                                 frameSize, batchSize, active_node);
+#endif
+  }
+};
+
+template <typename T>
+struct GRUUnitGradFunctor<platform::CPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
+                      int batchSize, activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+#ifndef __NVCC__
+    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
+                                grad, frameSize, batchSize, active_node);
+
+    if (value.prevOutValue && grad.prevOutGrad) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, true, batchSize, frameSize, frameSize, 1,
+          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
+          frameSize, 0, grad.resetOutputGrad, frameSize);
+
+      if (grad.stateWeightGrad) {
+        math::gemm<platform::CPUPlace, T>(
+            context, true, false, frameSize, frameSize, batchSize, 1,
+            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
+            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
+      }
+    }
+
+    detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
+                                grad, frameSize, batchSize, active_gate);
+
+    if (grad.prevOutGrad && value.prevOutValue) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, true, batchSize, frameSize, frameSize * 2, 1,
+          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
+          grad.prevOutGrad, frameSize);
+
+      if (grad.gateWeightGrad) {
+        math::gemm<platform::CPUPlace, T>(
+            context, true, false, frameSize, frameSize * 2, batchSize, 1,
+            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
+            grad.gateWeightGrad, frameSize * 2);
+      }
+    }
+#endif
+  }
+};
+
+template struct GRUUnitFunctor<platform::CPUPlace, float>;
+template struct GRUUnitFunctor<platform::CPUPlace, double>;
+template struct GRUUnitGradFunctor<platform::CPUPlace, float>;
+template struct GRUUnitGradFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/gru_compute.cu
+++ b/paddle/operators/math/gru_compute.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/gru_gpu_kernel.h"
+#include "paddle/operators/math/detail/gru_kernel.h"
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct GRUUnitFunctor<platform::GPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
+    dim3 threads;
+    dim3 grid;
+    if (batchSize == 1) {
+      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+      int frameBlocks = (frameSize + 1024 - 1) / 1024;
+      threads = dim3(framePerBlock, 1);
+      grid = dim3(frameBlocks, 1);
+    } else {
+      threads = dim3(32, 32);
+      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+    }
+
+    if (value.prevOutValue) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, false, batchSize, frameSize * 2, frameSize, 1,
+          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
+          value.gateValue, frameSize * 3);
+    }
+
+    if (batchSize == 1) {
+      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
+                                      /* isBatch= */ false,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_resetOutput<T>(), value.gateValue,
+          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
+          active_gate);
+    } else {
+      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
+                                      /* isBatch= */ true,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_resetOutput<T>(), value.gateValue,
+          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
+          active_gate);
+    }
+
+    if (value.prevOutValue) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, false, batchSize, frameSize, frameSize, 1,
+          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
+          value.gateValue + frameSize * 2, frameSize * 3);
+    }
+
+    if (batchSize == 1) {
+      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
+                                      /* isBatch= */ false,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_finalOutput<T>(), value.gateValue,
+          value.prevOutValue, value.outputValue, frameSize, batchSize,
+          active_node);
+    } else {
+      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
+                                      /* isBatch= */ true,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_finalOutput<T>(), value.gateValue,
+          value.prevOutValue, value.outputValue, frameSize, batchSize,
+          active_node);
+    }
+  }
+};
+
+template <typename T>
+struct GRUUnitGradFunctor<platform::GPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
+                      int batchSize, activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
+    dim3 threads;
+    dim3 grid;
+    if (batchSize == 1) {
+      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+      int frameBlocks = (frameSize + 1024 - 1) / 1024;
+      threads = dim3(framePerBlock, 1);
+      grid = dim3(frameBlocks, 1);
+    } else {
+      threads = dim3(32, 32);
+      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+    }
+
+    if (batchSize == 1) {
+      detail::KeGruBackwardStateGrad<
+          detail::backward::gru_stateGrad<T>,
+          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
+          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
+          batchSize, active_node);
+    } else {
+      detail::KeGruBackwardStateGrad<
+          detail::backward::gru_stateGrad<T>,
+          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
+          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
+          batchSize, active_node);
+    }
+
+    if (value.prevOutValue && grad.prevOutGrad) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, true, batchSize, frameSize, frameSize, 1,
+          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
+          frameSize, 0, grad.resetOutputGrad, frameSize);
+
+      if (grad.stateWeightGrad) {
+        math::gemm<platform::GPUPlace, T>(
+            context, true, false, frameSize, frameSize, batchSize, 1,
+            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
+            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
+      }
+    }
+
+    if (batchSize == 1) {
+      detail::KeGruBackwardResetGrad<
+          detail::backward::gru_resetGrad<T>,
+          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
+          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
+          batchSize, active_gate);
+    } else {
+      detail::KeGruBackwardResetGrad<
+          detail::backward::gru_resetGrad<T>,
+          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
+          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
+          batchSize, active_gate);
+    }
+
+    if (grad.prevOutGrad && value.prevOutValue) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, true, batchSize, frameSize, frameSize * 2, 1,
+          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
+          grad.prevOutGrad, frameSize);
+
+      if (grad.gateWeightGrad) {
+        math::gemm<platform::GPUPlace, T>(
+            context, true, false, frameSize, frameSize * 2, batchSize, 1,
+            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
+            grad.gateWeightGrad, frameSize * 2);
+      }
+    }
+  }
+};
+
+template struct GRUUnitFunctor<platform::GPUPlace, float>;
+template struct GRUUnitFunctor<platform::GPUPlace, double>;
+template struct GRUUnitGradFunctor<platform::GPUPlace, float>;
+template struct GRUUnitGradFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/gru_compute.h
+++ b/paddle/operators/math/gru_compute.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// TODO(guosheng): refine code style in gru_compute
+template <typename T>
+struct hl_gru_value {
+  T *gateWeight;
+  T *stateWeight;
+  T *gateValue;
+  T *resetOutputValue;
+  T *outputValue;
+  T *prevOutValue;
+};
+
+template <typename T>
+struct hl_gru_grad {
+  T *gateWeightGrad;
+  T *stateWeightGrad;
+  T *gateGrad;
+  T *resetOutputGrad;
+  T *outputGrad;
+  T *prevOutGrad;
+};
+
+template <typename Place, typename T>
+struct GRUUnitFunctor {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate);
+};
+
+template <typename Place, typename T>
+struct GRUUnitGradFunctor {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
+                      int batchSize, activation_mode_t active_node,
+                      activation_mode_t active_gate);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/sequence_pooling.cc
+++ b/paddle/operators/math/sequence_pooling.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence_pooling.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+class MaxSeqPoolFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    auto idx_dims = index->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), 1);
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
+
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+    int* max_index = index->data<int>();
+
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      for (int64_t k = 0; k < dim; ++k) {
+        out_data[i * dim + k] = in_data[starts[i] * dim + k];
+        max_index[i * dim + k] = starts[i];
+      }
+      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
+        for (int64_t k = 0; k < dim; ++k) {
+          if (in_data[j * dim + k] > out_data[i * dim + k]) {
+            out_data[i * dim + k] = in_data[j * dim + k];
+            max_index[i * dim + k] = j;
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class MaxSeqPoolGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad) {
+    auto og_dims = out_grad.dims();
+    auto ig_dims = in_grad->dims();
+    auto idx_dims = index.dims();
+    PADDLE_ENFORCE_GT(og_dims.size(), 1);
+    PADDLE_ENFORCE_GT(ig_dims.size(), 1);
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, og_dims);
+
+    const T* og_data = out_grad.data<T>();
+    const int* max_index = index.data<int>();
+    T* ig_data = in_grad->data<T>();
+
+    SetConstant<platform::CPUPlace, T> set_zero;
+    set_zero(context, in_grad, static_cast<T>(0.0));
+    int64_t num_seq = og_dims[0];
+    int64_t dim = out_grad.numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      for (int64_t j = 0; j < dim; ++j) {
+        int step_id = max_index[i * dim + j];
+        ig_data[step_id * dim + j] = og_data[i * dim + j];
+      }
+    }
+  }
+};
+
+template class MaxSeqPoolFunctor<platform::CPUPlace, float>;
+template class MaxSeqPoolFunctor<platform::CPUPlace, double>;
+template class MaxSeqPoolGradFunctor<platform::CPUPlace, float>;
+template class MaxSeqPoolGradFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/sequence_pooling.cu
+++ b/paddle/operators/math/sequence_pooling.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence_pooling.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#define FLT_MAX __FLT_MAX__
+
+template <typename T>
+__global__ void KeMaxSequencePool(const T* input, const size_t* starts,
+                                  T* output, int* index, int64_t num_seq,
+                                  int64_t dim) {
+  int dim_idx = threadIdx.x;
+  int seq_id = blockIdx.x;
+  if (seq_id >= num_seq) return;
+  size_t start = starts[seq_id];
+  size_t end = starts[seq_id + 1];
+
+  for (int64_t i = dim_idx; i < dim; i += blockDim.x) {
+    T max_val = static_cast<T>(-FLT_MAX);
+    int max_id = -1;
+    for (size_t step_id = start; step_id < end; step_id++) {
+      if (max_val < input[step_id * dim + i]) {
+        max_val = input[step_id * dim + i];
+        max_id = step_id;
+      }
+    }
+    output[seq_id * dim + i] = max_val;
+    index[seq_id * dim + i] = max_id;
+  }
+}
+
+template <typename T>
+class MaxSeqPoolFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    auto idx_dims = index->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), static_cast<int64_t>(1));
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
+
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+    int* max_index = index->data<int>();
+
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+
+    dim3 threads(256, 1);
+    dim3 grid(num_seq, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
+        in_data, starts.data(), out_data, max_index, num_seq, dim);
+  }
+};
+
+template <typename T>
+__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index,
+                                      T* in_grad, int64_t num_seq,
+                                      int64_t dim) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int col_idx = idx % dim;
+  if (idx < num_seq * dim) {
+    int step_id = max_index[idx];
+    in_grad[step_id * dim + col_idx] = out_grad[idx];
+  }
+}
+
+template <typename T>
+class MaxSeqPoolGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad) {
+    auto og_dims = out_grad.dims();
+    auto idx_dims = index.dims();
+    auto ig_dims = in_grad->dims();
+    PADDLE_ENFORCE_GT(og_dims.size(), static_cast<int64_t>(1));
+    PADDLE_ENFORCE_GT(ig_dims.size(), static_cast<int64_t>(1));
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, og_dims);
+
+    const T* og_data = out_grad.data<T>();
+    const int* max_index = index.data<int>();
+    T* ig_data = in_grad->data<T>();
+
+    SetConstant<platform::GPUPlace, T> set_zero;
+    set_zero(context, in_grad, static_cast<T>(0.0));
+    int64_t num_seq = og_dims[0];
+    int64_t dim = out_grad.numel() / num_seq;
+
+    unsigned int blocks = (num_seq * dim + 128 - 1) / 128;
+    dim3 threads(128, 1);
+    dim3 grid(blocks, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    KeMaxSequencePoolGrad<T><<<grid, threads, 0, stream>>>(
+        og_data, max_index, ig_data, num_seq, dim);
+  }
+};
+
+template class MaxSeqPoolFunctor<platform::GPUPlace, float>;
+template class MaxSeqPoolFunctor<platform::GPUPlace, double>;
+template class MaxSeqPoolGradFunctor<platform::GPUPlace, float>;
+template class MaxSeqPoolGradFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/sequence_pooling.h
+++ b/paddle/operators/math/sequence_pooling.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#define FLT_MAX __FLT_MAX__
+
+template <typename Place, typename T>
+class MaxSeqPoolFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index);
+};
+
+template <typename Place, class T>
+class MaxSeqPoolGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/positive_negative_pair_op.cc
+++ b/paddle/operators/positive_negative_pair_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/positive_negative_pair_op.h"
+
+namespace paddle {
+namespace operators {
+
+class PositiveNegativePairOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("Score"),
+        "Input(Score) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Label"),
+        "Input(Label) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("QueryID"),
+        "Input(QueryID) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PositivePair"),
+        "Output(PositivePair) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NegativePair"),
+        "Output(NegativePair) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NeutralPair"),
+        "Output(NeutralPair) of PositiveNegativePairOp should not be null.");
+    auto scalar_dim = framework::make_ddim({1});
+    if (ctx->HasInput("AccumulatePositivePair") ||
+        ctx->HasInput("AccumulateNegativePair") ||
+        ctx->HasInput("AccumulateNeutralPair")) {
+      PADDLE_ENFORCE(ctx->HasInput("AccumulatePositivePair") &&
+                         ctx->HasInput("AccumulateNegativePair") &&
+                         ctx->HasInput("AccumulateNeutralPair"),
+                     "All optional inputs(AccumulatePositivePair, "
+                     "AccumulateNegativePair, AccumulateNeutralPair) of "
+                     "PositiveNegativePairOp are required if one of them is "
+                     "specified.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulatePositivePair"), scalar_dim,
+                        "Shape of AccumulatePositivePair should be {1}.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNegativePair"), scalar_dim,
+                        "Shape of AccumulateNegativePair should be {1}.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNeutralPair"), scalar_dim,
+                        "Shape of AccumulateNeutralPair should be {1}.");
+    }
+
+    auto score_dim = ctx->GetInputDim("Score");
+    auto label_dim = ctx->GetInputDim("Label");
+    auto query_dim = ctx->GetInputDim("QueryID");
+    PADDLE_ENFORCE_EQ(score_dim.size(), 2, "Score should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(label_dim.size(), 2, "Label should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        label_dim[0], score_dim[0],
+        "Tensor Score and Label should have the same height (batch size).");
+    PADDLE_ENFORCE_EQ(label_dim[1], 1,
+                      "The width of Label should be 1, i.e. each item should "
+                      "have a scalar label.");
+    PADDLE_ENFORCE(query_dim == label_dim,
+                   "QueryID should have the same shape as Label.");
+    if (ctx->HasInput("Weight")) {
+      PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim,
+                     "Weight should have the same shape as Label.");
+    }
+    int column = ctx->Attrs().Get<int>("column");
+    auto depth = score_dim[1];
+    PADDLE_ENFORCE(column < depth && column >= -depth,
+                   "Attribute column should be in the range of [-%l, %l)",
+                   depth, depth);
+
+    ctx->SetOutputDim("PositivePair", scalar_dim);
+    ctx->SetOutputDim("NegativePair", scalar_dim);
+    ctx->SetOutputDim("NeutralPair", scalar_dim);
+  }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("Score")->type());
+  }
+};
+
+class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PositiveNegativePairOpMaker(framework::OpProto *proto,
+                              framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Score",
+             "(Tensor, float) Model Score on an item (with "
+             "respect to QueryID). It's a 2-D tensor with shape [batch_size, "
+             "depth], where the column specified by the attribute \"column\" "
+             "is used as item score.");
+    AddInput("Label",
+             "(Tensor, float) Label of an item (with repsect to "
+             "QueryId). It's a 2-D tensor with shape [batch_size, 1].");
+    AddInput("QueryID",
+             "(Tensor, int64) Query ID that indicates the context. Its shape "
+             "should be the same as Label.");
+    AddInput(
+        "AccumulatePositivePair",
+        "(float) Optional. The accumulated number of positive pairs over a "
+        "stream of data. If provided, the output PositivePair will be "
+        "initialized with this number rather than 0. it won't be modified "
+        "in place.")
+        .AsDispensable();
+    AddInput(
+        "AccumulateNegativePair",
+        "(float) Optional. The accumulated number of negative pairs over a "
+        "stream of data. If provided, the output NegativePair will be "
+        "initialized with this number rather than 0. it won't be modified "
+        "in place.")
+        .AsDispensable();
+    AddInput("AccumulateNeutralPair",
+             "(float) Optional. The accumulated number of neutral pairs over a "
+             "stream of data. If provided, the output NeutralPair will be "
+             "initialized with this number rather than 0. it won't be modified "
+             "in place.")
+        .AsDispensable();
+    AddInput("Weight",
+             "(float) Optional. Weight of current item. If specified, its "
+             "shape should be the same as Label, and the meaning of the output "
+             "changes from numbers of pairs to the total sum of pairs' "
+             "weights. Weight of a pair of items is the average of their "
+             "weights.")
+        .AsDispensable();
+    AddOutput("PositivePair",
+              "(float) Number of positive pairs, i.e. the pairs of "
+              "items that are ranked correctly.");
+    AddOutput("NegativePair",
+              "(float) Number of negative pairs, i.e. the pairs of "
+              "items that are ranked incorrectly.");
+    AddOutput("NeutralPair",
+              "(float) Number of neutral pairs, i.e. the pairs of items "
+              "that have the same score.")
+        .AsDispensable();
+    AddAttr<int>(
+        "column",
+        "(int, default -1) The column position of Score used to rank items in "
+        "descending order. It must be in the range of [-rank(Score), "
+        "rank(Score)). "
+        "If `dim < 0`, the dim to reduce is `rank + dim`. "
+        "Noting that reducing on the first dim will make the LoD info lost.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+        PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) 
+        model performance. 
+        Within some context, e.g. the "query", a LTR model generates scores
+        for a list of items, which gives a partial order of the items.
+        PositiveNegativePairOp takes a list of reference rank order 
+        (Input("Label")) and the model generated scores (Input(Score)) as 
+        inputs and counts the pairs that ranked correctly and incorrectly.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(positive_negative_pair,
+                             ops::PositiveNegativePairOp,
+                             ops::PositiveNegativePairOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    positive_negative_pair,
+    ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, float>,
+    ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/positive_negative_pair_op.h
+++ b/paddle/operators/positive_negative_pair_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <unordered_map>
+#include <vector>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename Place, typename T>
+class PositiveNegativePairKernel : public framework::OpKernel<T> {
+ public:
+  struct PredictionResult {
+    PredictionResult(T score, T label, T weight)
+        : score(score), label(label), weight(weight) {}
+    T score;
+    T label;
+    T weight;
+  };
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto score_t = context.Input<Tensor>("Score");
+    auto label_t = context.Input<Tensor>("Label");
+    auto query_t = context.Input<Tensor>("QueryID");
+    auto acc_positive_t = context.Input<Tensor>("AccumulatePositivePair");
+    auto acc_negative_t = context.Input<Tensor>("AccumulateNegativePair");
+    auto acc_neutral_t = context.Input<Tensor>("AccumulateNeutralPair");
+    auto positive_t = context.Output<Tensor>("PositivePair");
+    auto negative_t = context.Output<Tensor>("NegativePair");
+    auto neutral_t = context.Output<Tensor>("NeutralPair");
+    auto weight_t = context.Input<Tensor>("Weight");
+
+    auto score = score_t->data<T>();
+    auto label = label_t->data<T>();
+    auto query = query_t->data<int64_t>();
+    const T* weight = nullptr;
+    if (weight_t != nullptr) {
+      weight = weight_t->data<T>();
+    }
+    T* positive = positive_t->mutable_data<T>(context.GetPlace());
+    T* negative = negative_t->mutable_data<T>(context.GetPlace());
+    T* neutral = neutral_t->mutable_data<T>(context.GetPlace());
+
+    auto score_dim = score_t->dims();
+    auto batch_size = score_dim[0];
+    auto width = score_dim[1];
+    auto column = context.Attr<int32_t>("column");
+    if (column < 0) {
+      column += width;
+    }
+
+    // construct document instances for each query: Query => List[<score#0,
+    // label#0, weight#0>, ...]
+    std::unordered_map<int64_t, std::vector<PredictionResult>> predictions;
+    for (auto i = 0; i < batch_size; ++i) {
+      if (predictions.find(query[i]) == predictions.end()) {
+        predictions.emplace(
+            std::make_pair(query[i], std::vector<PredictionResult>()));
+      }
+      predictions[query[i]].emplace_back(score[i * width + column], label[i],
+                                         weight_t != nullptr ? weight[i] : 1.0);
+    }
+
+    // for each query, accumulate pair counts
+    T pos = 0, neg = 0, neu = 0;
+    if (acc_positive_t != nullptr && acc_negative_t != nullptr &&
+        acc_neutral_t != nullptr) {
+      pos = acc_positive_t->data<T>()[0];
+      neg = acc_negative_t->data<T>()[0];
+      neu = acc_neutral_t->data<T>()[0];
+    }
+    auto evaluate_one_list = [&pos, &neg,
+                              &neu](std::vector<PredictionResult> vec) {
+      for (auto ite1 = vec.begin(); ite1 != vec.end(); ++ite1) {
+        for (auto ite2 = ite1 + 1; ite2 != vec.end(); ++ite2) {
+          if (ite1->label == ite2->label) {  // labels are equal, ignore.
+            continue;
+          }
+          T w = (ite1->weight + ite2->weight) * 0.5;
+          if (ite1->score == ite2->score) {
+            neu += w;
+          }
+          (ite1->score - ite2->score) * (ite1->label - ite2->label) > 0.0
+              ? pos += w
+              : neg += w;
+        }
+      }
+    };
+    for (auto prediction : predictions) {
+      evaluate_one_list(prediction.second);
+    }
+    *positive = pos;
+    *negative = neg;
+    *neutral = neu;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -27,6 +27,11 @@ class SequencePoolOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of SequencePoolOp should not be null.");
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    if (ctx->Attrs().Get<std::string>("pooltype") == "MAX") {
+      PADDLE_ENFORCE(ctx->HasOutput("MaxIndex"),
+                     "Output(MaxIndex) of SequencePoolOp should not be null.");
+      ctx->SetOutputDim("MaxIndex", ctx->GetInputDim("X"));
+    }
  }
 };

@@ -35,10 +40,14 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
  SequencePoolOpMaker(framework::OpProto* proto,
                      framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(LoDTensor), the variable-length input of SequencePoolOp");
+    AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
    AddOutput("Out",
-              "(Tensor), output of SequencePoolOp, which does not contain LoD "
+              "(Tensor) The output of SequencePoolOp does not contain LoD "
              "infomation.");
+    AddOutput("MaxIndex",
+              "(Tensor<int>) This tensor is used for the sequence max-pooling "
+              "to record the max indexes.")
+        .AsIntermediate();
    AddAttr<std::string>(
        "pooltype",
        "(int, default AVERAGE) the pooling pooltype of SequencePoolOp.")
@@ -96,6 +105,12 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
    }
    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
  }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };

 }  // namespace operators

--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence_pooling.h"

 namespace paddle {
 namespace operators {
@@ -34,7 +35,7 @@ class SequencePoolKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
+    auto* out = context.Output<Tensor>("Out");
    std::string pooltype = context.Attr<std::string>("pooltype");

    auto dims = in->dims();
@@ -53,6 +54,16 @@ class SequencePoolKernel : public framework::OpKernel<T> {
    auto lod_level_0 = lod[0];

    out->mutable_data<T>(context.GetPlace());
+
+    if (pooltype == "MAX") {
+      math::MaxSeqPoolFunctor<Place, T> max_pool;
+      auto* index = context.Output<Tensor>("MaxIndex");
+      index->Resize({dims});
+      index->mutable_data<int>(context.GetPlace());
+      max_pool(context.device_context(), *in, out, index);
+      return;
+    }
+
    auto place = context.GetEigenDevice<Place>();
    for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
      Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
@@ -69,8 +80,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
      } else if (pooltype == "SQRT") {
        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                              std::sqrt(static_cast<T>(h));
-      } else if (pooltype == "MAX") {
-        out_e.device(place) = in_e.maximum(Eigen::array<int, 1>({{0}}));
      } else if (pooltype == "LAST") {
        out_e.device(place) = in_e.chip(h - 1, 0);
      } else if (pooltype == "FIRST") {
@@ -87,8 +96,8 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<LoDTensor>("X");
+    auto* out_g = context.Input<Tensor>(framework::GradVarName("Out"));
    auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
    std::string pooltype = context.Attr<std::string>("pooltype");

    auto dims = in->dims();
@@ -96,6 +105,14 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
    int64_t w = in->numel() / dims[0];

    in_g->mutable_data<T>(context.GetPlace());
+
+    if (pooltype == "MAX") {
+      math::MaxSeqPoolGradFunctor<Place, T> max_pool_grad;
+      auto* index = context.Input<Tensor>("MaxIndex");
+      max_pool_grad(context.device_context(), *out_g, *index, in_g);
+      return;
+    }
+
    if (pooltype == "LAST" || pooltype == "FIRST") {
      // set X@Grad be zero at first when pooltype is LAST/FIRST
      math::SetConstant<Place, T> functor;
@@ -118,20 +135,6 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
      } else if (pooltype == "SQRT") {
        in_g_e.device(place) =
            (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
-      } else if (pooltype == "MAX") {
-        auto in_t =
-            in->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
-        Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
-            in_t_map(in_t.data<T>(), h, w);
-        int row_id;
-        Eigen::array<int, 2> extents{{1, 1}};
-        for (int col_id = 0; col_id < w; col_id++) {
-          in_t_map.col(col_id).maxCoeff(&row_id);
-          Eigen::array<int, 2> in_offsets{{row_id, col_id}};
-          Eigen::array<int, 2> out_offsets{{0, col_id}};
-          in_g_e.slice(in_offsets, extents).device(place) =
-              out_g_e.slice(out_offsets, extents);
-        }
      } else if (pooltype == "LAST") {
        in_g_e.chip(h - 1, 0).device(place) = out_g_e;
      } else if (pooltype == "FIRST") {

--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -4,13 +4,13 @@
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0

-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

 #include "paddle/operators/softmax_with_cross_entropy_op.h"
 #include <paddle/function/TensorType.h>
@@ -30,12 +30,10 @@ class SoftmaxWithCrossEntropyOpMaker
             "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
             "and K is the class number.");
    AddInput("Label",
-             "(Tensor, default: Tensor<int>), The ground truth which is a 2-D "
-             "tensor. "
-             "If softLabel is set to false, Label is a Tensor<int> with shape "
-             "[N x 1]."
-             "If softLabel is set to true, Label is a Tensor<float/double> "
-             "with shape [N x K].");
+             "(Tensor) The ground truth which is a 2-D tensor. If soft_label "
+             "is set to false, Label is a Tensor<int64> with shape [N x 1]. If "
+             "soft_label is set to true, Label is a Tensor<float/double> with "
+             "shape [N x K].");
    AddOutput(
        "Softmax",
        "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N x K]. "
@@ -62,7 +60,7 @@ Because this operator performs a softmax on logits internally, it expects
 unscaled logits. This operator should not be used with the output of
 softmax operator since that would produce incorrect results.

-When the attribute softLabel is set false, this operators expects mutually
+When the attribute soft_label is set false, this operators expects mutually
 exclusive hard labels, each sample in a batch is in exactly one class with a
 probability of 1.0. Each sample in the batch will have a single label.

@@ -198,6 +196,8 @@ REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
 REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
                  ops::SoftmaxWithCrossEntropyOpGrad);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
-                       ops::SoftmaxWithCrossEntropyKernel<float>);
+                       ops::SoftmaxWithCrossEntropyKernel<float>,
+                       ops::SoftmaxWithCrossEntropyKernel<double>);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad,
-                       ops::SoftmaxWithCrossEntropyGradKernel<float>);
+                       ops::SoftmaxWithCrossEntropyGradKernel<float>,
+                       ops::SoftmaxWithCrossEntropyGradKernel<double>);
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
@@ -4,13 +4,13 @@
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0

-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

 #define EIGEN_USE_GPU

@@ -24,7 +24,7 @@ using Tensor = framework::Tensor;
 namespace {
 template <typename T>
 __global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad,
-                                 const int* labels, const int batch_size,
+                                 const int64_t* labels, const int batch_size,
                                 const int class_num) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int sample_idx = tid / class_num;
@@ -50,7 +50,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
  int ids = blockIdx.x * blockDim.x + threadIdx.x;
  if (ids < batch_size * class_num) {
    int row_ids = ids / class_num;
-    logit_grad[ids] = logit_grad[ids] * (loss_grad[row_ids] - labels[ids]);
+    logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]);
  }
 }
 }  // namespace
@@ -104,7 +104,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
                              .stream()>>>(logit_grad_data, loss_grad_data,
                                           label_data, batch_size, class_num);
    } else {
-      const int* label_data = labels->data<int>();
+      const int64_t* label_data = labels->data<int64_t>();
      CrossEntropyGrad<T><<<
          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
                              context.device_context())
@@ -119,6 +119,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {

 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy,
-                       ops::SoftmaxWithCrossEntropyCUDAKernel<float>);
+                       ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
+                       ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
 REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad,
-                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>);
+                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
+                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
@@ -4,13 +4,13 @@
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0

-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

 #pragma once
 #include "paddle/framework/eigen.h"
@@ -60,25 +60,25 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
    logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));

    const int class_num = logit_grad->dims()[1];
+    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
+    auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
+
    if (context.Attr<bool>("soft_label")) {
-      auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
-      auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
      auto lbl_mat = EigenMatrix<T>::From(*labels);
-
      logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
-          logit_grad_mat *
-          (out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) -
-           lbl_mat);
+          out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) *
+          (logit_grad_mat - lbl_mat);
    } else {
+      logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
+          logit_grad_mat *
+          out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num));
+
      const int batch_size = logit_grad->dims()[0];
-      const int* label_data = labels->data<int>();
-      const T* out_grad_data = out_grad->data<T>();
+      const int64_t* label_data = labels->data<int64_t>();
      T* logit_grad_data = logit_grad->data<T>();
-
+      const T* out_grad_data = out_grad->data<T>();
      for (int i = 0; i < batch_size; ++i) {
-        int index = i * class_num + label_data[i];
-        logit_grad_data[index] =
-            out_grad_data[i] * (logit_grad_data[index] - 1.);
+        logit_grad_data[i * class_num + label_data[i]] -= out_grad_data[i];
      }
    }
  }

--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -24,10 +24,16 @@ class SumOp : public framework::OperatorWithKernel {

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
-    auto x_dims = ctx->GetInputsDim("X");
+
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of SumOp should not be null.");
+    if (ctx->IsRuntime() &&
+        ctx->GetOutputsVarType("Out")[0] ==
+            framework::VarDesc::LOD_TENSOR_ARRAY) {
+      return;  // skip runtime infershape when is tensor array;
+    }

+    auto x_dims = ctx->GetInputsDim("X");
    size_t N = x_dims.size();
    PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");

@@ -39,6 +45,28 @@ class SumOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Out", in_dim);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    auto x_vars = ctx.MultiInputVar("X");
+    if (x_vars[0]->IsType<framework::LoDTensor>()) {
+      return framework::ToDataType(
+          x_vars[0]->Get<framework::LoDTensor>().type());
+    } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
+      return framework::ToDataType(
+          x_vars[0]->Get<framework::SelectedRows>().value().type());
+    } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
+      auto& array = x_vars[0]->Get<framework::LoDTensorArray>();
+      for (auto& each : array) {
+        if (each.numel() != 0) {
+          return framework::ToDataType(each.type());
+        }
+      }
+    }
+    PADDLE_THROW("Unexpected branch. Input type is %s",
+                 x_vars[0]->Type().name());
+  }
 };

 class SumOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -63,18 +91,32 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
  void operator()(const framework::OpDescBind& op_desc,
                  framework::BlockDescBind* block) const override {
    auto& inputs = op_desc.Input("X");
-    auto default_var_type = framework::VarDesc::SELECTED_ROWS;
+    auto var_type = framework::VarDesc::SELECTED_ROWS;

    bool any_input_is_lod_tensor = std::any_of(
        inputs.begin(), inputs.end(), [block](const std::string& name) {
          return block->Var(name)->GetType() == framework::VarDesc::LOD_TENSOR;
        });
-    if (any_input_is_lod_tensor) {
-      default_var_type = framework::VarDesc::LOD_TENSOR;
+
+    auto is_tensor_array = [block](const std::string& name) {
+      return block->Var(name)->GetType() ==
+             framework::VarDesc::LOD_TENSOR_ARRAY;
+    };
+
+    bool any_input_is_tensor_array =
+        std::any_of(inputs.begin(), inputs.end(), is_tensor_array);
+    bool all_inputs_are_tensor_array =
+        std::all_of(inputs.begin(), inputs.end(), is_tensor_array);
+
+    if (any_input_is_tensor_array) {
+      PADDLE_ENFORCE(all_inputs_are_tensor_array);
+      var_type = framework::VarDesc::LOD_TENSOR_ARRAY;
+    } else if (any_input_is_lod_tensor) {
+      var_type = framework::VarDesc::LOD_TENSOR;
    }

    auto out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetType(default_var_type);
+    block->Var(out_var_name)->SetType(var_type);
  }
 };


--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -11,6 +11,7 @@ limitations under the License. */

 #pragma once
 #include "paddle/framework/eigen.h"
+#include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/selected_rows_functor.h"
@@ -28,7 +29,7 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename Place, typename T>
 class SumKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext &context) const override {
    auto in_vars = context.MultiInputVar("X");
    int N = in_vars.size();
    auto out_var = context.OutputVar("Out");
@@ -36,7 +37,7 @@ class SumKernel : public framework::OpKernel<T> {
    bool in_place = out_var == in_vars[0];

    if (out_var->IsType<framework::LoDTensor>()) {
-      auto* out = context.Output<Tensor>("Out");
+      auto *out = context.Output<Tensor>("Out");
      out->mutable_data<T>(context.GetPlace());

      auto result = EigenVector<T>::Flatten(*out);
@@ -51,11 +52,11 @@ class SumKernel : public framework::OpKernel<T> {
      // If in_place, just skip the first tensor
      for (int i = in_place ? 1 : 0; i < N; i++) {
        if (in_vars[i]->IsType<framework::LoDTensor>()) {
-          auto& in_t = in_vars[i]->Get<framework::LoDTensor>();
+          auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
          auto in = EigenVector<T>::Flatten(in_t);
          result.device(place) = result + in;
        } else if (in_vars[i]->IsType<framework::SelectedRows>()) {
-          auto& in_t = in_vars[i]->Get<framework::SelectedRows>();
+          auto &in_t = in_vars[i]->Get<framework::SelectedRows>();
          functor(context.device_context(), in_t, out);
        } else {
          PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
@@ -63,8 +64,8 @@ class SumKernel : public framework::OpKernel<T> {
      }
    } else if (out_var->IsType<framework::SelectedRows>()) {
      PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now");
-      auto* out = context.Output<SelectedRows>("Out");
-      auto* out_value = out->mutable_value();
+      auto *out = context.Output<SelectedRows>("Out");
+      auto *out_value = out->mutable_value();

      // Runtime InferShape
      size_t first_dim = 0;
@@ -88,9 +89,36 @@ class SumKernel : public framework::OpKernel<T> {
                offset, out);
        offset += in_vars[i]->Get<SelectedRows>().value().numel();
      }
+    } else if (out_var->IsType<framework::LoDTensorArray>()) {
+      auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
+      for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
+        PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
+                       "Only support all inputs are TensorArray");
+        auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
+
+        for (size_t i = 0; i < in_array.size(); ++i) {
+          if (in_array[i].numel() != 0) {
+            if (i >= out_array.size()) {
+              out_array.resize(i + 1);
+            }
+            if (out_array[i].numel() == 0) {
+              out_array[i].CopyFrom(in_array[i], in_array[i].place(),
+                                    context.device_context());
+              out_array[i].set_lod(in_array[i].lod());
+            } else {
+              PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
+              auto in = EigenVector<T>::Flatten(in_array[i]);
+              auto result = EigenVector<T>::Flatten(out_array[i]);
+              result.device(context.GetEigenDevice<Place>()) = result + in;
+            }
+          }
+        }
+      }
+    } else {
+      PADDLE_THROW("Unexpected branch, output variable type is %s",
+                   out_var->Type().name());
    }
  }
 };
-
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+class ArrayOpBase : public framework::OperatorBase {
+ public:
+  ArrayOpBase(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {}
+
+ protected:
+  size_t GetOffset(const framework::Scope &scope,
+                   const platform::DeviceContext &dev_ctx) const {
+    auto *i = scope.FindVar(Input("I"));
+    PADDLE_ENFORCE(i != nullptr, "I must be set");
+    auto &i_tensor = i->Get<framework::LoDTensor>();
+    PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
+    size_t offset;
+    if (platform::is_gpu_place(i_tensor.place())) {
+      // FIXME: Avoid copy from GPU to CPU
+      framework::Tensor t;
+      t.CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx);
+      dev_ctx.Wait();
+      offset = static_cast<size_t>(*t.data<int64_t>());
+    } else {
+      offset = static_cast<size_t>(*i_tensor.data<int64_t>());
+    }
+    return offset;
+  }
+};
+
+class WriteToArrayOp : public ArrayOpBase {
+ public:
+  WriteToArrayOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : ArrayOpBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto *x = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE(x != nullptr, "X must be set");
+    auto &x_tensor = x->Get<framework::LoDTensor>();
+    size_t offset = GetOffset(scope, dev_ctx);
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
+    if (offset >= out->size()) {
+      out->resize(offset + 1);
+    }
+    auto *out_tensor = &out->at(offset);
+    out_tensor->CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx);
+    out_tensor->set_lod(x_tensor.lod());
+  }
+};
+
+class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  WriteToArrayOpProtoMaker(framework::OpProto *proto,
+                           framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) the tensor will be written to tensor array");
+    AddInput(
+        "I",
+        "(Tensor) the subscript index in tensor array. The number of element "
+        "should be 1");
+    AddOutput("Out", "(TensorArray) the tensor array will be written");
+    AddComment(R"DOC(Write a LoDTensor to a LoDTensor array.
+
+Assume T is LoDTensor, i is the subscript of the array, and A is the array. The
+equation is
+
+A[i] = T
+)DOC");
+  }
+};
+
+class WriteToArrayInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index");
+    PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1,
+                      "The number of element of subscript index must be 1");
+    PADDLE_ENFORCE(context->HasInput("X"), NotHasXError());
+    PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError());
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+  }
+
+ protected:
+  virtual const char *NotHasXError() const { return "Must set the lod tensor"; }
+
+  virtual const char *NotHasOutError() const {
+    return "Must set the lod tensor array";
+  }
+};
+
+class WriteToArrayInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDescBind &op_desc,
+                  framework::BlockDescBind *block) const override {
+    for (auto &out_var : op_desc.OutputArgumentNames()) {
+      VLOG(10) << "Set Variable " << out_var << " as LOD_TENSOR_ARRAY";
+      block->Var(out_var)->SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
+    }
+  }
+};
+
+class ReadFromArrayOp : public ArrayOpBase {
+ public:
+  ReadFromArrayOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : ArrayOpBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto *x = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE(x != nullptr, "X must be set");
+    auto &x_array = x->Get<framework::LoDTensorArray>();
+    auto *out = scope.FindVar(Output("Out"));
+    PADDLE_ENFORCE(out != nullptr, "Out must be set");
+    auto *out_tesnor = out->GetMutable<framework::LoDTensor>();
+    size_t offset = GetOffset(scope, dev_ctx);
+    PADDLE_ENFORCE_LT(offset, x_array.size());
+    out_tesnor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx);
+    out_tesnor->set_lod(x_array[offset].lod());
+  }
+};
+
+class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReadFromArrayProtoMaker(framework::OpProto *proto,
+                          framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(TensorArray) the array will be read from.");
+    AddInput("I",
+             "(Tensor) the subscript index in tensor array. The number of "
+             "element should be 1");
+    AddOutput("Out", "(LoDTensor) the tensor will be read from.");
+    AddComment(R"DOC(Read a LoDTensor from a LoDTensor Array
+
+Assume T is LoDTensor, i is th e subscript of the array, and A is the array. The
+equation is
+
+T = A[i]
+)DOC");
+  }
+};
+
+class ReadFromArrayInferShape : public WriteToArrayInferShape {
+ protected:
+  const char *NotHasXError() const override {
+    return "The input array X must be set";
+  }
+  const char *NotHasOutError() const override {
+    return "The output tensor out must be set";
+  }
+};
+
+class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("read_from_array");
+    grad_op->SetInput("I", Input("I"));
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("write_to_array");
+    grad_op->SetInput("I", Input("I"));
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(write_to_array, ops::WriteToArrayOp,
+                  ops::WriteToArrayInferShape, ops::WriteToArrayOpProtoMaker,
+                  ops::WriteToArrayGradMaker, ops::WriteToArrayInferVarType);
+REGISTER_OPERATOR(read_from_array, ops::ReadFromArrayOp,
+                  ops::ReadFromArrayInferShape, ops::ReadFromArrayProtoMaker,
+                  ops::ReadFromArrayGradMaker);
--- a/paddle/optimizer/CMakeLists.txt
+++ b/paddle/optimizer/CMakeLists.txt
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
-
 set(OPITMIZER_SRCS
    adadelta_optimizer.cc
    adagrad_optimizer.cc
@@ -9,11 +7,6 @@ set(OPITMIZER_SRCS
    sgd_optimizer.cc
  )

-add_library(paddle_optimizer STATIC ${OPITMIZER_SRCS})
-add_dependencies(paddle_optimizer paddle_proto ${external_project_dependencies})
-
-
-if(WITH_TESTING)
-  add_simple_unittest(serialization_test)
-  add_simple_unittest(parameter_optimizer_test)
-endif()
+cc_library(paddle_optimizer STATIC SRCS ${OPITMIZER_SRCS} DEPS paddle_proto glog)
+cc_test(serialization_test SRCS serialization_test.cc DEPS paddle_proto)
+cc_test(parameter_optimizer_test SRCS parameter_optimizer_test.cc DEPS paddle_optimizer)
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "adadelta_optimizer.h"
 #include <algorithm>
 #include <cmath>

--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once

 #include "parameter_optimizer.h"

--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include <cmath>

 #include "adagrad_optimizer.h"

--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once

 #include "parameter_optimizer.h"

--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "adam_optimizer.h"
 #include <cmath>


--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once

 #include "parameter_optimizer.h"

--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "optimizer.h"
 #include <glog/logging.h>
 #include <cstdlib>
@@ -6,8 +20,8 @@

 #include "parameter_optimizer.h"

-using namespace paddle;
-using namespace paddle::optimizer;
+using paddle::optimizer::ParameterOptimizer;
+using paddle::optimizer::Tensor;

 template <paddle_element_type VALUE>
 struct EnumToType {};
@@ -15,22 +29,21 @@ struct EnumToType {};
 template <class T>
 struct TypeToEnum {};

-#define MATCH_ENUM_TYPE(TYPE, ENUM)                  \
-  template <>                                        \
-  struct TypeToEnum<TYPE> {                          \
-    static paddle_element_type v() { return ENUM; }; \
-    static constexpr TYPE value = ENUM;              \
-  };                                                 \
-  template <>                                        \
-  struct EnumToType<ENUM> {                          \
-    typedef TYPE Type;                               \
+#define MATCH_ENUM_TYPE(TYPE, ENUM)                 \
+  template <>                                       \
+  struct TypeToEnum<TYPE> {                         \
+    static paddle_element_type v() { return ENUM; } \
+    static constexpr TYPE value = ENUM;             \
+  };                                                \
+  template <>                                       \
+  struct EnumToType<ENUM> {                         \
+    typedef TYPE Type;                              \
  }

 MATCH_ENUM_TYPE(int32_t, PADDLE_ELEMENT_TYPE_INT32);
 MATCH_ENUM_TYPE(uint32_t, PADDLE_ELEMENT_TYPE_UINT32);
 MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64);
 MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64);
-// TODO(zhihong): only implement below type, need to fix
 MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32);
 MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64);


--- a/paddle/optimizer/optimizer.h
+++ b/paddle/optimizer/optimizer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once

 #include <stdbool.h>

--- a/paddle/optimizer/parameter_optimizer.cc
+++ b/paddle/optimizer/parameter_optimizer.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include <glog/logging.h>
 #include "adadelta_optimizer.h"
 #include "adagrad_optimizer.h"

--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once

 #include <glog/logging.h>

--- a/paddle/optimizer/parameter_optimizer_test.cpp
+++ b/paddle/optimizer/parameter_optimizer_test.cpp
@@ -110,7 +110,7 @@ public:

      int s = 0;
      float* newp = (float*)opts_[i]->get_weight(&s);
-      EXPECT_EQ(s, kSize);
+      EXPECT_EQ(static_cast<size_t>(s), kSize);
      for (size_t j = 0; j < kSize; ++j) {
        EXPECT_EQ(newp[j], (*p)[j]);
      }

--- a/paddle/optimizer/serialization_test.cpp
+++ b/paddle/optimizer/serialization_test.cpp
--- a/paddle/optimizer/sgd_optimizer.cc
+++ b/paddle/optimizer/sgd_optimizer.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "sgd_optimizer.h"
 #include "serialization.h"


--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once

 #include "parameter_optimizer.h"
@@ -15,7 +29,6 @@ public:
        nesterov_(n) {
    if (momentum_ != 0.0) {
      size_t size = parameter->size();
-      // TODO: fix it with align aware allocator bind to Tensor
      momentums_ = new Tensor(size);
    }
  }

--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -97,6 +97,15 @@ namespace pybind {

 using namespace paddle::framework;  // NOLINT

+template <typename T>
+static py::bytes SerializeMessage(T &self) {
+  // Check IsInitialized in Python
+  std::string retv;
+  PADDLE_ENFORCE(self.Proto()->SerializePartialToString(&retv),
+                 "Cannot serialize message");
+  return retv;
+}
+
 // Bind Methods
 void BindProgramDesc(py::module &m) {
  py::class_<ProgramDescBind>(m, "ProgramDesc", "")
@@ -132,17 +141,7 @@ void BindProgramDesc(py::module &m) {
      .def("block", &ProgramDescBind::MutableBlock,
           py::return_value_policy::reference)
      .def("num_blocks", &ProgramDescBind::Size)
-      .def("serialize_to_string",
-           [](ProgramDescBind &program_desc) -> py::bytes {
-             const ProgramDesc *desc = program_desc.Proto();
-             PADDLE_ENFORCE(desc->IsInitialized(),
-                            "ProgramDesc has not been initialized.");
-             std::string res;
-             PADDLE_ENFORCE(
-                 desc->SerializeToString(&res),
-                 "Serialize ProgramDesc Error. This could be a bug of Paddle.");
-             return res;
-           })
+      .def("serialize_to_string", SerializeMessage<ProgramDescBind>)
      .def("parse_from_string",
           [](ProgramDescBind &program_desc, const std::string &data) {
             ProgramDesc *desc = program_desc.Proto();
@@ -181,16 +180,7 @@ void BindBlockDesc(py::module &m) {
           py::return_value_policy::reference)
      .def("op_size", &BlockDescBind::OpSize)
      .def("op", &BlockDescBind::Op, py::return_value_policy::reference)
-      .def("serialize_to_string", [](BlockDescBind &block_desc) -> py::bytes {
-        const BlockDesc *desc = block_desc.Proto();
-        PADDLE_ENFORCE(desc->IsInitialized(),
-                       "BlockDesc has not been initialized.");
-        std::string res;
-        PADDLE_ENFORCE(
-            desc->SerializeToString(&res),
-            "Serialize BlockDesc Error. This could be a bug of Paddle.");
-        return res;
-      });
+      .def("serialize_to_string", SerializeMessage<BlockDescBind>);
 }

 void BindVarDsec(py::module &m) {
@@ -219,17 +209,7 @@ void BindVarDsec(py::module &m) {
      .def("set_lod_level", &VarDescBind::SetLoDLevel)
      .def("type", &VarDescBind::GetType)
      .def("set_type", &VarDescBind::SetType)
-      .def("serialize_to_string",
-           [](VarDescBind &var_desc) -> py::bytes {
-             const VarDesc *desc = var_desc.Proto();
-             PADDLE_ENFORCE(desc->IsInitialized(),
-                            "VarDesc has not been initialized.");
-             std::string res;
-             PADDLE_ENFORCE(
-                 desc->SerializeToString(&res),
-                 "Serialize VarDesc Error. This could be a bug of Paddle.");
-             return res;
-           })
+      .def("serialize_to_string", SerializeMessage<VarDescBind>)
      .def("persistable", &VarDescBind::Persistable)
      .def("set_persistable", &VarDescBind::SetPersistable);

@@ -274,16 +254,7 @@ void BindOpDesc(py::module &m) {
      .def("check_attrs", &OpDescBind::CheckAttrs)
      .def("infer_shape", &OpDescBind::InferShape)
      .def("infer_var_type", &OpDescBind::InferVarType)
-      .def("serialize_to_string", [](OpDescBind &op_desc) -> py::bytes {
-        const OpDesc *desc = op_desc.Proto();
-        PADDLE_ENFORCE(desc->IsInitialized(),
-                       "OpDesc has not been initialized.");
-        std::string res;
-        PADDLE_ENFORCE(
-            desc->SerializeToString(&res),
-            "Serialize OpDesc Error. This could be a bug of Paddle.");
-        return res;
-      });
+      .def("serialize_to_string", SerializeMessage<OpDescBind>);
 }

 }  // namespace pybind

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -2,171 +2,185 @@

 set -xe

-# Set BASE_IMAGE according to env variables
-if [[ ${WITH_GPU} == "ON" ]]; then
-  BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
-else
-  BASE_IMAGE="ubuntu:16.04"
-fi
-
-DOCKERFILE_GPU_ENV=""
-DOCKERFILE_CUDNN_DSO=""
-if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-    DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
-    DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
-fi
-
-mkdir -p /paddle/build
-cd /paddle/build
-
-# build script will not fail if *.deb does not exist
-rm *.deb 2>/dev/null || true
-# delete previous built whl packages
-rm -rf /paddle/paddle/dist 2>/dev/null || true
-
-cat <<EOF
-========================================
-Configuring cmake in /paddle/build ...
-      -DCMAKE_BUILD_TYPE=Release
-      -DWITH_DOC=OFF
-      -DWITH_GPU=${WITH_GPU:-OFF}
-      -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
-      -DWITH_MKLML=${WITH_MKLML:-ON}
-      -DWITH_AVX=${WITH_AVX:-OFF}
-      -DWITH_GOLANG=${WITH_GOLANG:-ON}
-      -DWITH_SWIG_PY=ON
-      -DWITH_C_API=${WITH_C_API:-OFF}
-      -DWITH_PYTHON=${WITH_PYTHON:-ON}
-      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-      -DCUDNN_ROOT=/usr/
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-      -DWITH_TESTING=${WITH_TESTING:-ON}
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-========================================
-EOF

-# Disable UNITTEST_USE_VIRTUALENV in docker because
-# docker environment is fully controlled by this script.
-# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
-cmake .. \
-      -DCMAKE_BUILD_TYPE=Release \
-      -DWITH_DOC=OFF \
-      -DWITH_GPU=${WITH_GPU:-OFF} \
-      -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
-      -DWITH_MKLML=${WITH_MKLML:-ON} \
-      -DWITH_AVX=${WITH_AVX:-OFF} \
-      -DWITH_GOLANG=${WITH_GOLANG:-ON} \
-      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-      -DWITH_C_API=${WITH_C_API:-OFF} \
-      -DWITH_PYTHON=${WITH_PYTHON:-ON} \
-      -DCUDNN_ROOT=/usr/ \
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-      -DWITH_TESTING=${WITH_TESTING:-ON} \
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-
-cat <<EOF
-============================================
-Building in /paddle/build ...
-============================================
-EOF
-make -j `nproc`
+function cmake_gen() {
+    # Set BASE_IMAGE according to env variables
+    if [[ ${WITH_GPU} == "ON" ]]; then
+    BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
+    else
+    BASE_IMAGE="ubuntu:16.04"
+    fi
+
+    DOCKERFILE_GPU_ENV=""
+    DOCKERFILE_CUDNN_DSO=""
+    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
+        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
+        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
+    fi
+
+    mkdir -p /paddle/build
+    cd /paddle/build
+
+    # build script will not fail if *.deb does not exist
+    rm *.deb 2>/dev/null || true
+    # delete previous built whl packages
+    rm -rf /paddle/paddle/dist 2>/dev/null || true

-if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
-cat <<EOF
-========================================
-Running unit tests ...
-========================================
+    cat <<EOF
+    ========================================
+    Configuring cmake in /paddle/build ...
+        -DCMAKE_BUILD_TYPE=Release
+        -DWITH_DOC=OFF
+        -DWITH_GPU=${WITH_GPU:-OFF}
+        -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
+        -DWITH_MKLML=${WITH_MKLML:-ON}
+        -DWITH_AVX=${WITH_AVX:-OFF}
+        -DWITH_GOLANG=${WITH_GOLANG:-ON}
+        -DWITH_SWIG_PY=ON
+        -DWITH_C_API=${WITH_C_API:-OFF}
+        -DWITH_PYTHON=${WITH_PYTHON:-ON}
+        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
+        -DCUDNN_ROOT=/usr/
+        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
+        -DWITH_TESTING=${WITH_TESTING:-ON}
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+    ========================================
 EOF
-    ctest --output-on-failure
-    # make install should also be test when unittest
-    make install -j `nproc`
-    pip install /usr/local/opt/paddle/share/wheels/*.whl
-    paddle version
-fi
-

-if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
+    # Disable UNITTEST_USE_VIRTUALENV in docker because
+    # docker environment is fully controlled by this script.
+    # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
+    cmake .. \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DWITH_DOC=OFF \
+        -DWITH_GPU=${WITH_GPU:-OFF} \
+        -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
+        -DWITH_MKLML=${WITH_MKLML:-ON} \
+        -DWITH_AVX=${WITH_AVX:-OFF} \
+        -DWITH_GOLANG=${WITH_GOLANG:-ON} \
+        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
+        -DWITH_C_API=${WITH_C_API:-OFF} \
+        -DWITH_PYTHON=${WITH_PYTHON:-ON} \
+        -DCUDNN_ROOT=/usr/ \
+        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
+        -DWITH_TESTING=${WITH_TESTING:-ON} \
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+}
+
+function run_build() {
    cat <<EOF
-========================================
-Building documentation ...
-   In /paddle/build_doc
-========================================
+    ============================================
+    Building in /paddle/build ...
+    ============================================
 EOF
-    mkdir -p /paddle/build_doc
-    pushd /paddle/build_doc
-    cmake .. \
-          -DWITH_DOC=ON \
-          -DWITH_GPU=OFF \
-          -DWITH_AVX=${WITH_AVX:-ON} \
-          -DWITH_SWIG_PY=ON \
-          -DWITH_STYLE_CHECK=OFF
-    make -j `nproc` gen_proto_py
-    make -j `nproc` paddle_docs paddle_docs_cn
-    popd
-fi
-
-
-if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
+    make -j `nproc`
+}
+
+function run_test() {
+    if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
    cat <<EOF
-========================================
-Converting C++ source code into HTML ...
-========================================
+    ========================================
+    Running unit tests ...
+    ========================================
 EOF
-    export WOBOQ_OUT=/paddle/build/woboq_out
-    mkdir -p $WOBOQ_OUT
-    cp -rv /woboq/data $WOBOQ_OUT/../data
-    /woboq/generator/codebrowser_generator \
-        -b /paddle/build \
-        -a \
-        -o $WOBOQ_OUT \
-        -p paddle:/paddle
-    /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-fi
-
-cat <<EOF
-========================================
-Generate /paddle/build/Dockerfile ...
-========================================
+        ctest --output-on-failure
+        # make install should also be test when unittest
+        make install -j `nproc`
+        pip install /usr/local/opt/paddle/share/wheels/*.whl
+        paddle version
+    fi
+}
+
+
+function gen_docs() {
+    if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
+        cat <<EOF
+    ========================================
+    Building documentation ...
+    In /paddle/build_doc
+    ========================================
 EOF
+        mkdir -p /paddle/build_doc
+        pushd /paddle/build_doc
+        cmake .. \
+            -DWITH_DOC=ON \
+            -DWITH_GPU=OFF \
+            -DWITH_AVX=${WITH_AVX:-ON} \
+            -DWITH_SWIG_PY=ON \
+            -DWITH_STYLE_CHECK=OFF
+        make -j `nproc` gen_proto_py
+        make -j `nproc` paddle_docs paddle_docs_cn
+        popd
+    fi
+
+
+    if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
+        cat <<EOF
+    ========================================
+    Converting C++ source code into HTML ...
+    ========================================
+EOF
+        export WOBOQ_OUT=/paddle/build/woboq_out
+        mkdir -p $WOBOQ_OUT
+        cp -rv /woboq/data $WOBOQ_OUT/../data
+        /woboq/generator/codebrowser_generator \
+            -b /paddle/build \
+            -a \
+            -o $WOBOQ_OUT \
+            -p paddle:/paddle
+        /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
+    fi
+}
+

-cat > /paddle/build/Dockerfile <<EOF
-FROM ${BASE_IMAGE}
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-ENV HOME /root
+function gen_dockerfile() {
+
+    cat <<EOF
+    ========================================
+    Generate /paddle/build/Dockerfile ...
+    ========================================
 EOF

-if [[ -n ${APT_MIRROR} ]]; then
-cat >> /paddle/build/Dockerfile <<EOF
-RUN sed -i '${APT_MIRROR}' /etc/apt/sources.list
+    cat > /paddle/build/Dockerfile <<EOF
+    FROM ${BASE_IMAGE}
+    MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+    ENV HOME /root
 EOF
-fi
-
-if [[ ${WITH_GPU} == "ON"  ]]; then
-  NCCL_DEPS="apt-get install -y libnccl-dev &&"
-else
-  NCCL_DEPS="" 
-fi
-
-cat >> /paddle/build/Dockerfile <<EOF
-ADD python/dist/*.whl /
-# run paddle version to install python packages first
-RUN apt-get update &&\
-    ${NCCL_DEPS}\
-    apt-get install -y wget python-pip && pip install -U pip && \
-    pip install /*.whl; apt-get install -f -y && \
-    apt-get clean -y && \
-    rm -f /*.whl && \
-    paddle version && \
-    ldconfig
-${DOCKERFILE_CUDNN_DSO}
-${DOCKERFILE_GPU_ENV}
-ADD go/cmd/pserver/pserver /usr/bin/
-ADD go/cmd/master/master /usr/bin/
-ADD paddle/pybind/print_operators_doc /usr/bin/
-# default command shows the paddle version and exit
-CMD ["paddle", "version"]
+
+    if [[ ${WITH_GPU} == "ON"  ]]; then
+        NCCL_DEPS="apt-get install -y libnccl-dev &&"
+    else
+        NCCL_DEPS="" 
+    fi
+
+    cat >> /paddle/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
+    # run paddle version to install python packages first
+    RUN apt-get update &&\
+        ${NCCL_DEPS}\
+        apt-get install -y wget python-pip && pip install -U pip && \
+        pip install /*.whl; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f /*.whl && \
+        paddle version && \
+        ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_GPU_ENV}
+    ADD go/cmd/pserver/pserver /usr/bin/
+    ADD go/cmd/master/master /usr/bin/
+    ADD paddle/pybind/print_operators_doc /usr/bin/
+    # default command shows the paddle version and exit
+    CMD ["paddle", "version"]
 EOF
+}

 set +xe
+
+cmake_gen
+run_build
+run_test
+gen_docs
+gen_dockerfile
+
 printf "If you need to install PaddlePaddle in develop docker image,"
 printf "please make install or pip install build/python/dist/*.whl.\n"
--- a/paddle/utils/Excepts.h
+++ b/paddle/utils/Excepts.h
@@ -17,8 +17,7 @@ limitations under the License. */

 #include <fenv.h>

-#if (defined(__APPLE__) || defined(__OSX__)) && !defined(__arm__) && \
-    !defined(__aarch64__)
+#if defined(__APPLE__) || defined(__OSX__)

 int fegetexcept(void);
 int feenableexcept(unsigned int excepts);

--- a/paddle/utils/arch/osx/Excepts.cpp
+++ b/paddle/utils/arch/osx/Excepts.cpp
@@ -14,9 +14,13 @@ limitations under the License. */

 #include "paddle/utils/Excepts.h"

-#if (defined(__APPLE__) || defined(__OSX__)) && !defined(__arm__) && \
-    !defined(__aarch64__)
-
+#if defined(__APPLE__) || defined(__OSX__)
+#if defined(__arm__) || defined(__arm64__)
+// TODO(liuyiqun): implement the arm version
+int fegetexcept(void) { return -1; }
+int feenableexcept(unsigned int excepts) { return -1; }
+int fedisableexcept(unsigned int excepts) { return -1; }
+#else
 int fegetexcept(void) {
  static fenv_t fenv;
  return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
@@ -49,5 +53,5 @@ int fedisableexcept(unsigned int excepts) {

  return (fesetenv(&fenv) ? -1 : old_excepts);
 }
-
+#endif
 #endif
--- a/paddle/utils/tests/test_StringUtils.cpp
+++ b/paddle/utils/tests/test_StringUtils.cpp
@@ -18,6 +18,6 @@ limitations under the License. */

 TEST(StringUtil, to) {
  ASSERT_NEAR(paddle::str::to<double>("12.45"), 12.45, 1e-5);
-  ASSERT_DEATH(paddle::str::to<double>("12.45x23"), ".*");
-  ASSERT_DEATH(paddle::str::to<int>(""), ".*");
+  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<double>("12.45x23"), ".*");
+  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<int>(""), ".*");
 }
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -44,6 +44,7 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/pad


 add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+    COMMAND touch stub.cc
    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
    COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -143,6 +143,7 @@ __all__ = [
    'scale_shift_layer',
    'img_conv3d_layer',
    'resize_layer',
+    'sub_seq_layer',
 ]


@@ -252,6 +253,7 @@ class LayerType(object):
    SCALE_SHIFT_LAYER = 'scale_shift'

    RESIZE = 'resize'
+    SUB_SEQ_LAYER = 'subseq'

    @staticmethod
    def is_layer_type(type_name):
@@ -6980,3 +6982,58 @@ def resize_layer(input, size, name=None):
    """
    Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
    return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size)
+
+
+@wrap_act_default(act=LinearActivation())
+@wrap_name_default('sub_seq')
+def sub_seq_layer(input, offsets, sizes, act=None, bias_attr=None, name=None):
+    """
+    sub_seq_layer will return sub-sequences from the input sequences. For each
+    sequence in the input sequence layer, sub_seq_layer will slice it by given
+    offset and size. Please notice that, number of offset value and size value
+    both are equal to the number of sequence in the input layer.
+
+    .. code-block:: python
+
+        sub_seq = sub_seq_layer(input=input_seq, offsets=offsets, sizes=sizes)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer, which should be sequence.
+    :type input: LayerOutput
+    :param offsets: offset indices to slice the input sequence, which should be
+                    sequence type.
+    :type offsets: LayerOutput
+    :param sizes: sizes of the sub-sequences, which should be sequence type.
+    :type sizes: LayerOutput
+    :param act: Layer activation, default is LinearActivation
+    :type act: BaseActivation.
+    :param bias_attr: The Bias Attribute. If the parameter is set to
+                      False or something not type of ParameterAttribute,
+                      no bias is defined. If the parameter is set to
+                      True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of sub_seq_layer layer must be a PaddlePaddle layer.')
+    assert isinstance(offsets, LayerOutput), (
+        'The offset indices for sub_seq_layer, '
+        'must be a PaddlePaddle layer.')
+    assert isinstance(sizes, LayerOutput), (
+        'The sizes of sub-sequences, must be a PaddlePaddle layer.')
+
+    Layer(
+        name=name,
+        type=LayerType.SUB_SEQ_LAYER,
+        inputs=[input.name, offsets.name, sizes.name],
+        active_type=act.name,
+        bias=ParamAttr.to_bias(bias_attr))
+
+    return LayerOutput(
+        name,
+        LayerType.SUB_SEQ_LAYER,
+        parents=[input, offsets, sizes],
+        size=input.size)
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -116,7 +116,7 @@ class AdamOptimizer(BaseSGDOptimizer):

        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}

    :param beta1: the :math:`\\beta_1` in equation.
    :type beta1: float

--- a/python/paddle/utils/merge_model.py
+++ b/python/paddle/utils/merge_model.py
@@ -23,32 +23,32 @@ from paddle.v2.topology import Topology


 def merge_v2_model(net, param_file, output_file):
-    '''Integrate the model config and model parameters into one file.
-    
+    '''Merge the model config and parameters into one file.
+
    The model configuration file describes the model structure which
    ends with .py. The parameters file stores the parameters of the model
    which ends with .tar.gz.
-    
-    @param  net            The output layer of the network.
-    @param  param_file     Path of the model parameters(.tar.gz) which is stored by v2 api.
+
+    @param  net            The output layer of the network for inference.
+    @param  param_file     Path of the parameters (.tar.gz) which is stored by v2 api.
    @param  output_file    Path of the merged file which will be generated.
-    
+
    Usage:

-        from paddle.util.merge_model import merge_v2_model
+        from paddle.utils.merge_model import merge_v2_model
        # import your network configuration
-        from mobilenet import mobile_net
-        
-        net = mobile_net(3*224*224, 102)
+        from example_net import net_conf
+
+        net = net_conf(is_predict=True)
        param_file = './param_pass_00000.tar.gz'
        output_file = './output.paddle'
-        
+
        merge_v2_model(net, param_file, output_file)

    '''

    assert isinstance(net, LayerOutput), \
-            "The net should be the output of the network"
+            "The net should be the output of the network for inference"
    assert os.path.exists(param_file), \
            "The model parameters file %s does not exists " % (param_file)


--- a/python/paddle/v2/framework/executor.py
+++ b/python/paddle/v2/framework/executor.py
 import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import Block, Program
+from paddle.v2.framework.framework import Block, Program, g_main_program

 g_scope = core.Scope()

@@ -18,7 +18,7 @@ class Executor(object):
        self.executor = core.Executor(act_places)

    def run(self,
-            program,
+            program=None,
            feed=None,
            fetch_list=None,
            feed_var_name='feed',
@@ -29,6 +29,9 @@ class Executor(object):
        if fetch_list is None:
            fetch_list = []

+        if program is None:
+            program = g_main_program
+
        if not isinstance(program, Program):
            raise TypeError()


--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/framework/framework.py
@@ -12,6 +12,14 @@ def unique_name(prefix):
    return "_".join([prefix, str(uid)])


+def _debug_string_(proto):
+    error_fields = list()
+    if not proto.IsInitialized(error_fields):
+        raise ValueError("{0} are not initialized\nThe message is {1}".format(
+            error_fields, proto))
+    return proto.__str__()
+
+
 class Variable(object):
    def __init__(self,
                 block,
@@ -95,7 +103,7 @@ class Variable(object):
    def __str__(self):
        protostr = self.desc.serialize_to_string()
        proto = framework_pb2.VarDesc.FromString(str(protostr))
-        return proto.__str__()
+        return _debug_string_(proto)

    __repr__ = __str__

@@ -286,7 +294,7 @@ class Operator(object):
    def __str__(self):
        protostr = self.desc.serialize_to_string()
        proto = framework_pb2.OpDesc.FromString(str(protostr))
-        return proto.__str__()
+        return _debug_string_(proto)

    __repr__ = __str__

@@ -343,7 +351,7 @@ class Block(object):
    def __str__(self):
        protostr = self.desc.serialize_to_string()
        proto = framework_pb2.BlockDesc.FromString(str(protostr))
-        return proto.__str__()
+        return _debug_string_(proto)

    __repr__ = __str__

@@ -448,7 +456,7 @@ class Program(object):
    def __str__(self):
        protostr = self.desc.serialize_to_string()
        proto = framework_pb2.ProgramDesc.FromString(str(protostr))
-        return proto.__str__()
+        return _debug_string_(proto)

    def clone(self):
        p = Program()

--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/framework/layers.py
 import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, Operator
-from paddle.v2.framework.initializer import ConstantInitializer, NormalInitializer
+from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, \
+    Operator
+from paddle.v2.framework.initializer import ConstantInitializer, \
+    NormalInitializer
 from paddle.v2.framework.layer_helper import LayerHelper, unique_name
 import re

@@ -372,11 +374,13 @@ def sequence_pool(input, pool_type, **kwargs):
    helper = LayerHelper('sequence_pool', input=input, **kwargs)
    dtype = helper.input_dtype()
    pool_out = helper.create_tmp_variable(dtype)
+    max_index = helper.create_tmp_variable(dtype)

    helper.append_op(
        type="sequence_pool",
-        inputs={"X": [input]},
-        outputs={"Out": [pool_out]},
+        inputs={"X": input},
+        outputs={"Out": pool_out,
+                 "MaxIndex": max_index},
        attrs={"pooltype": pool_type.upper()})

    return pool_out
@@ -577,25 +581,45 @@ class StaticRNN(object):
        if self.status != StaticRNN.IN_RNN_BLOCK:
            raise ValueError("You must invoke {0} in rnn block".format(method))

-    def memory(self, init=None, shape=None, dtype=None, init_value=0):
+    def memory(self,
+               init=None,
+               shape=None,
+               batch_ref=None,
+               init_value=0.0,
+               init_batch_dim_idx=0,
+               ref_batch_dim_idx=1):
+        '''
+        :param init: boot memory, if not set, a shape, batch_ref must be provided
+        :param shape: shape of the boot memory
+        :param batch_ref: batch size reference variable
+        :param init_value: the init value of boot memory
+        :param init_batch_dim_idx: the index of batch size in init's dimension
+        :param ref_batch_dim_idx: the index of batch size in batch_ref's dimension
+        :return: boot memory
+        '''
        self._assert_in_rnn_block_('memory')
        if init is None:
-            if shape is None or dtype is None:
+            if shape is None or batch_ref is None:
                raise ValueError(
-                    "if init is None, memory at least need shape and dtype")
+                    "if init is None, memory at least need shape and batch_ref")
            parent_block = self.parent_block()
            var_name = unique_name("@".join([self.helper.name, "memory_boot"]))
            boot_var = parent_block.create_var(
-                name=var_name, shape=shape, dtype=dtype, persistable=False)
+                name=var_name,
+                shape=shape,
+                dtype=batch_ref.data_type,
+                persistable=False)

            parent_block.append_op(
-                type="fill_constant",
-                inputs={},
+                type="fill_constant_batch_size_like",
+                inputs={'Input': [batch_ref]},
                outputs={'Out': [boot_var]},
                attrs={
                    'value': init_value,
-                    'shape': [40] + list(boot_var.shape[1:]),
-                    'data_type': boot_var.data_type
+                    'shape': boot_var.shape,
+                    'data_type': boot_var.data_type,
+                    'input_dim_idx': ref_batch_dim_idx,
+                    'output_dim_idx': init_batch_dim_idx
                })

            return self.memory(init=boot_var)
@@ -749,3 +773,68 @@ def lod_rank_table(x, level=0, main_program=None):
        outputs={'Out': table},
        attrs={'level': level})
    return table
+
+
+def fill_constant(shape, dtype, value, main_program=None):
+    helper = LayerHelper("ones", **locals())
+    out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='fill_constant',
+        inputs={},
+        outputs={'Out': [out]},
+        attrs={
+            'shape': shape,
+            'data_type': out.data_type,
+            'value': float(value)
+        })
+    out.stop_gradient = True
+    return out
+
+
+def ones(shape, dtype, main_program=None):
+    return fill_constant(value=1.0, **locals())
+
+
+def zeros(shape, dtype, main_program=None):
+    return fill_constant(value=0.0, **locals())
+
+
+def increment(x, value=1.0, main_program=None):
+    helper = LayerHelper("increment", **locals())
+    tmp = helper.create_tmp_variable(dtype=x.data_type)
+    helper.append_op(
+        type='increment',
+        inputs={'X': [x]},
+        outputs={'Out': [tmp]},
+        attrs={'step': value})
+    return tmp
+
+
+def array_write(x, i, array=None, main_program=None):
+    helper = LayerHelper('array_write', **locals())
+    if array is None:
+        array = helper.create_variable(
+            name="{0}.out".format(helper.name),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=x.data_type)
+    helper.append_op(
+        type='write_to_array',
+        inputs={'X': [x],
+                'I': [i]},
+        outputs={'Out': [array]})
+    return array
+
+
+def array_read(array, i, main_program=None):
+    helper = LayerHelper('array_read', **locals())
+    if not isinstance(
+            array,
+            Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
+        raise TypeError("array should be tensor array vairable")
+    out = helper.create_tmp_variable(dtype=array.data_type)
+    helper.append_op(
+        type='read_from_array',
+        inputs={'X': [array],
+                'I': [i]},
+        outputs={'Out': [out]})
+    return out
--- a/python/paddle/v2/framework/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/framework/tests/test_array_read_write_op.py
+import unittest
+import paddle.v2.framework.core as core
+import paddle.v2.framework.layers as layers
+from paddle.v2.framework.executor import Executor
+from paddle.v2.framework.backward import append_backward_ops
+from paddle.v2.framework.framework import g_main_program
+import numpy
+
+
+class TestArrayReadWrite(unittest.TestCase):
+    def test_read_write(self):
+        x = [
+            layers.data(
+                name='x0', shape=[100]), layers.data(
+                    name='x1', shape=[100]), layers.data(
+                        name='x2', shape=[100])
+        ]
+
+        for each_x in x:
+            each_x.stop_gradient = False
+
+        i = layers.zeros(shape=[1], dtype='int64')
+        arr = layers.array_write(x=x[0], i=i)
+        i = layers.increment(x=i)
+        i.stop_gradient = True
+        arr = layers.array_write(x=x[1], i=i, array=arr)
+        i = layers.increment(x=i)
+        i.stop_gradient = True
+        arr = layers.array_write(x=x[2], i=i, array=arr)
+
+        i = layers.zeros(shape=[1], dtype='int64')
+        a0 = layers.array_read(array=arr, i=i)
+        i = layers.increment(x=i)
+        i.stop_gradient = True  # index should not calculate gradient
+        a1 = layers.array_read(array=arr, i=i)
+        i = layers.increment(x=i)
+        i.stop_gradient = True
+        a2 = layers.array_read(array=arr, i=i)
+
+        mean_a0 = layers.mean(x=a0)
+        mean_a1 = layers.mean(x=a1)
+        mean_a2 = layers.mean(x=a2)
+
+        a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2])
+
+        mean_x0 = layers.mean(x=x[0])
+        mean_x1 = layers.mean(x=x[1])
+        mean_x2 = layers.mean(x=x[2])
+
+        x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2])
+
+        scope = core.Scope()
+        cpu = core.CPUPlace()
+
+        exe = Executor(cpu)
+
+        tensor = core.LoDTensor()
+        tensor.set(numpy.random.random(size=(100, 100)).astype('float32'), cpu)
+
+        outs = map(numpy.array,
+                   exe.run(feed={'x0': tensor,
+                                 'x1': tensor,
+                                 'x2': tensor},
+                           fetch_list=[a_sum, x_sum],
+                           scope=scope))
+        self.assertEqual(outs[0], outs[1])
+
+        total_sum = layers.sums(input=[a_sum, x_sum])
+        total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)
+
+        append_backward_ops(total_sum_scaled)
+
+        g_vars = map(g_main_program.global_block().var,
+                     [each_x.name + "@GRAD" for each_x in x])
+        g_out = [
+            item.sum()
+            for item in map(
+                numpy.array,
+                exe.run(feed={'x0': tensor,
+                              'x1': tensor,
+                              'x2': tensor},
+                        fetch_list=g_vars))
+        ]
+        g_out_sum = numpy.array(g_out).sum()
+
+        # since our final gradient is 1 and the neural network are all linear
+        # with mean_op.
+        # the input gradient should also be 1
+        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_conv2d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_op.py
@@ -61,25 +61,23 @@ class TestConv2dOp(OpTest):

    def test_check_grad(self):
        self.check_grad(
-            set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.02)

    def test_check_grad_no_filter(self):
        self.check_grad(
            ['Input'],
            'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.02,
            no_grad_set=set(['Filter']))

    def test_check_grad_no_input(self):
        self.check_grad(
            ['Filter'],
            'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.02,
            no_grad_set=set(['Input']))

    def init_test_case(self):
-        # self.groups = 1
-        # self.op_type = "conv2d"
        self.pad = [0, 0]
        self.stride = [1, 1]
        self.dilations = [1, 1]
@@ -103,6 +101,9 @@ class TestWithGroup(TestConv2dOp):
        self.op_type = "conv2d"


+#----------------Conv2dCudnn----------------
+
+
 class TestCudnn(TestConv2dOp):
    def init_group(self):
        self.groups = 1

--- a/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
@@ -58,36 +58,37 @@ class TestConv2dTransposeOp(OpTest):
        print 'check output here for', self.op_type
        self.check_output()

-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.dilations = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3]
-
-    def init_op_type(self):
-        self.op_type = "conv2d_transpose"
-
    def test_check_grad_no_input(self):
        self.check_grad(
            ['Filter'],
            'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.02,
            no_grad_set=set(['Input']))

    def test_check_grad_no_filter(self):
        self.check_grad(
            ['Input'],
            'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.02,
            no_grad_set=set(['Filter']))

    def test_check_grad(self):
        self.check_grad(
-            set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
+
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv2d_transpose"


+# ------------ test_cudnn ------------
 class TestCudnn(TestConv2dTransposeOp):
    def init_op_type(self):
        self.op_type = "conv2d_transpose_cudnn"

--- a/python/paddle/v2/framework/tests/test_conv3d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv3d_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def conv3d_forward_naive(input, filter, group, conv_param):
+    in_n, in_c, in_d, in_h, in_w = input.shape
+    out_c, f_c, f_d, f_h, f_w = filter.shape
+    assert f_c * group == in_c
+    assert np.mod(out_c, group) == 0
+    sub_out_c = out_c / group
+
+    stride, pad = conv_param['stride'], conv_param['pad']
+    out_d = 1 + (in_d + 2 * pad[0] - f_h) / stride[0]
+    out_h = 1 + (in_h + 2 * pad[1] - f_h) / stride[1]
+    out_w = 1 + (in_w + 2 * pad[2] - f_w) / stride[2]
+    out = np.zeros((in_n, out_c, out_d, out_h, out_w))
+
+    input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], ),
+                               (pad[2], )),
+                       mode='constant',
+                       constant_values=0)
+    for d in range(out_d):
+        for i in range(out_h):
+            for j in range(out_w):
+                for g in range(group):
+                    input_pad_masked = \
+                        input_pad[:, g * f_c:(g + 1) * f_c,
+                        d * stride[0]:d * stride[0] + f_d,
+                        i * stride[1]:i * stride[1] + f_h,
+                        j * stride[2]:j * stride[2] + f_w]
+                    f_sub = filter[g * sub_out_c:(g + 1) *
+                                   sub_out_c, :, :, :, :]
+                    for k in range(sub_out_c):
+                        out[:, g * sub_out_c + k, d, i, j] = \
+                            np.sum(input_pad_masked * f_sub[k, :, :, :, :],
+                                   axis=(1, 2, 3, 4))
+
+    return out
+
+
+class TestConv3dOp(OpTest):
+    def setUp(self):
+        self.init_group()
+        self.init_op_type()
+        self.init_test_case()
+
+        conv3d_param = {'stride': self.stride, 'pad': self.pad}
+        input = np.random.random(self.input_size).astype("float32")
+        filter = np.random.random(self.filter_size).astype("float32")
+        output = conv3d_forward_naive(input, filter, self.groups,
+                                      conv3d_param).astype("float32")
+
+        self.inputs = {'Input': input, 'Filter': filter}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'groups': self.groups
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
+
+    def test_check_grad_no_filter(self):
+        self.check_grad(
+            ['Input'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Filter']))
+
+    def test_check_grad_no_input(self):
+        self.check_grad(
+            ['Filter'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Input']))
+
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 4, 4, 4]  # NCDHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3, 3]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_op_type(self):
+        self.op_type = "conv3d"
+
+
+class TestCase1(TestConv3dOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 4, 4, 4]  # NCDHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3, 3]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_op_type(self):
+        self.op_type = "conv3d"
+
+
+class TestWithGroup1(TestConv3dOp):
+    def init_group(self):
+        self.groups = 3
+
+    def init_op_type(self):
+        self.op_type = "conv3d"
+
+
+class TestWithGroup2(TestCase1):
+    def init_group(self):
+        self.groups = 3
+
+    def init_op_type(self):
+        self.op_type = "conv3d"
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_conv3d_transpose_op.py
+++ b/python/paddle/v2/framework/tests/test_conv3d_transpose_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
+    # [2, 3, 5, 5, 5]
+    in_n, in_c, in_d, in_h, in_w = input_.shape
+    # [3, 6, 3, 3, 3]
+    f_c, out_c, f_d, f_h, f_w = filter_.shape
+    assert in_c == f_c
+
+    stride, pad = conv3dtranspose_param['stride'], conv3dtranspose_param['pad']
+    out_d = (in_d - 1) * stride[0] + f_d
+    out_h = (in_h - 1) * stride[1] + f_h
+    out_w = (in_w - 1) * stride[2] + f_w
+
+    out = np.zeros((in_n, out_c, out_d, out_h, out_w))
+
+    for n in range(in_n):
+        for d in range(in_d):
+            for i in range(in_h):
+                for j in range(in_w):
+                    input_masked = input_[n, :, d, i, j]  # (c)
+                    input_masked = np.reshape(input_masked, (in_c, 1, 1, 1))
+                    input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
+
+                    for k in range(out_c):
+                        tmp_out = np.sum(input_masked * filter_[:, k, :, :, :],
+                                         axis=0)
+                        d1, d2 = d * stride[0], d * stride[0] + f_d
+                        i1, i2 = i * stride[1], i * stride[1] + f_h
+                        j1, j2 = j * stride[2], j * stride[2] + f_w
+                        out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out
+
+    return out
+
+
+class TestConv3dTransposeOp(OpTest):
+    def setUp(self):
+        # init as conv transpose
+        self.init_op_type()
+
+        # [2, 3, 5, 5, 5] -> kernel [3, 6, 3, 3, 3] -> output [2, 6, 7, 7, 7]
+        self.init_test_case()
+
+        conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad}
+        input_ = np.random.random(self.input_size).astype("float32")
+        filter_ = np.random.random(self.filter_size).astype("float32")
+        output = conv3dtranspose_forward_naive(
+            input_, filter_, conv3dtranspose_param).astype("float32")
+        # print 'deconv output py', output, output.shape
+
+        self.inputs = {'Input': input_, 'Filter': filter_}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            # 'dilations': self.dilations
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        print 'check output here'
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
+
+    def test_check_grad_no_filter(self):
+        self.check_grad(
+            ['Input'],
+            'Output',
+            max_relative_error=0.02,
+            no_grad_set=set(['Filter']))
+
+    def test_check_grad_no_input(self):
+        self.check_grad(
+            ['Filter'],
+            'Output',
+            max_relative_error=0.02,
+            no_grad_set=set(['Input']))
+
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv3d_transpose"
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py
@@ -21,9 +21,14 @@ class TestFillConstantBatchSizeLikeWhenSecondDimIsBatchSize(OpTest):
    def setUp(self):
        self.op_type = "fill_constant_batch_size_like"
        self.inputs = {'Input': np.random.random((219, 232)).astype("float32")}
-        self.attrs = {'value': 3.5, 'shape': [132, -1, 7], 'dim_idx': 1}
-
-        out = np.random.random((132, 232, 7)).astype("float32")
+        self.attrs = {
+            'value': 3.5,
+            'shape': [132, -1, 7],
+            'input_dim_idx': 0,
+            'output_dim_idx': 1
+        }
+
+        out = np.random.random((132, 219, 7)).astype("float32")
        out.fill(3.5)
        self.outputs = {'Out': out}


--- a/python/paddle/v2/framework/tests/test_framework_debug_str.py
+++ b/python/paddle/v2/framework/tests/test_framework_debug_str.py
+import unittest
+from paddle.v2.framework.framework import Program
+
+
+class TestDebugStringFramework(unittest.TestCase):
+    def test_debug_str(self):
+        p = Program()
+        p.current_block().create_var(name='t', shape=[0, 1])
+        self.assertRaises(ValueError, callableObj=p.__str__)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_gru_op.py
+++ b/python/paddle/v2/framework/tests/test_gru_op.py
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+from test_lstm_op import identity, sigmoid, tanh, relu
+
+
+class TestGRUOp(OpTest):
+    batch_size = 9
+    frame_size = 5
+    activate = {
+        'identity': identity,
+        'sigmoid': sigmoid,
+        'tanh': tanh,
+        'relu': relu
+    }
+
+    @staticmethod
+    def seq_to_batch(lod, is_reverse):
+        idx_in_seq_list = []
+        seq_starts = lod[0]
+        seq_lens = []
+        for i in range(len(seq_starts) - 1):
+            seq_lens.append(seq_starts[i + 1] - seq_starts[i])
+        sorted_seqs = sorted(
+            range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x])
+        num_batch = seq_lens[sorted_seqs[0]]
+        for batch_idx in range(num_batch):
+            idx_in_seq = []
+            for i in range(len(seq_lens)):
+                if seq_lens[sorted_seqs[i]] <= batch_idx:
+                    break
+                idx = (seq_starts[sorted_seqs[i] + 1] - 1 - batch_idx
+                       ) if is_reverse else (
+                           seq_starts[sorted_seqs[i]] + batch_idx)
+                idx_in_seq.append(idx)
+            idx_in_seq_list.append(idx_in_seq)
+        return idx_in_seq_list
+
+    def gru_step(self, x, h_p, w, b):
+        batch_size = x.shape[0]
+        frame_size = w.shape[0]
+        g = x + np.tile(b, (batch_size, 1))
+        w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
+            (frame_size, frame_size * 2))
+        u_r = self.activate[self.attrs['gate_activation']](np.dot(
+            h_p, w_u_r) + g[:, :frame_size * 2])
+        u = u_r[:, :frame_size]
+        r = u_r[:, frame_size:frame_size * 2]
+        r_h_p = r * h_p
+        w_c = w.flatten()[frame_size * frame_size * 2:].reshape(
+            (frame_size, frame_size))
+        c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
+                                                    g[:, frame_size * 2:])
+        g = np.hstack((u_r, c))
+        h = u * c + (1 - u) * h_p
+        return g, r_h_p, h
+
+    def gru(self):
+        input, lod = self.inputs['Input']
+        w = self.inputs['Weight']
+        b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros(
+            (1, self.frame_size * 3))
+        batch_gate = self.outputs['BatchGate']
+        batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev']
+        batch_hidden = self.outputs['BatchHidden']
+        hidden = self.outputs['Hidden']
+        idx_in_seq_list = self.idx_in_seq_list
+        h_p = self.inputs['H0'] if self.inputs.has_key('H0') else np.zeros(
+            (len(idx_in_seq_list[0]), self.frame_size))
+        num_batch = len(idx_in_seq_list)
+        end_idx = 0
+        for batch_idx in range(num_batch):
+            x = input[idx_in_seq_list[batch_idx]]
+            g, r_h_p, h = self.gru_step(x, h_p, w, b)
+            if batch_idx < (num_batch - 1):
+                h_p = h[:len(idx_in_seq_list[batch_idx + 1])]
+            start_idx = end_idx
+            end_idx = start_idx + len(idx_in_seq_list[batch_idx])
+            batch_gate[start_idx:end_idx] = g
+            batch_reset_hidden_prev[start_idx:end_idx] = r_h_p
+            batch_hidden[start_idx:end_idx] = h
+            hidden[idx_in_seq_list[batch_idx]] = h
+        return batch_gate, batch_reset_hidden_prev, hidden
+
+    def set_data(self):
+        lod = [[0, 2, 6, self.batch_size]]
+        self.idx_in_seq_list = self.seq_to_batch(lod, self.is_reverse)
+        batch_size = self.batch_size
+        frame_size = self.frame_size
+        input = np.random.rand(batch_size, frame_size * 3).astype('float64')
+        h0 = np.random.rand(len(self.idx_in_seq_list[0]),
+                            frame_size).astype('float64')
+        weight = np.random.rand(frame_size, frame_size * 3).astype('float64')
+        bias = np.random.rand(1, frame_size * 3).astype('float64')
+
+        self.inputs = {
+            'Input': (input, lod),
+            'H0': h0,
+            'Weight': weight,
+            'Bias': bias
+        }
+
+        self.outputs = {
+            'BatchGate': np.zeros(
+                (batch_size, frame_size * 3), dtype='float64'),
+            'BatchResetHiddenPrev': np.zeros(
+                (batch_size, frame_size), dtype='float64'),
+            'BatchHidden': np.zeros(
+                (batch_size, frame_size), dtype='float64'),
+            'Hidden': np.zeros(
+                (batch_size, frame_size), dtype='float64')
+        }
+
+    def set_confs(self):
+        self.is_reverse = False
+        self.attrs = {
+            'activation': 'tanh',
+            'gate_activation': 'sigmoid',
+            'is_reverse': self.is_reverse
+        }
+
+    def setUp(self):
+        self.op_type = "gru"
+        self.set_confs()
+        self.set_data()
+        self.gru()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
+
+
+class TestGRUOpNoInitial(TestGRUOp):
+    def set_data(self):
+        super(TestGRUOpNoInitial, self).set_data()
+        self.inputs.pop('H0')
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden'])
+
+
+class TestGRUOpReverse(TestGRUOp):
+    def set_confs(self):
+        self.is_reverse = True
+        self.attrs = {
+            'activation': 'identity',
+            'gate_activation': 'sigmoid',
+            'is_reverse': self.is_reverse
+        }
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_huber_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_huber_loss_op.py
@@ -21,7 +21,8 @@ class TestHuberLossOp(OpTest):
            'Y': np.random.uniform(0, 1., (samples_num, 1)).astype('float32'),
        }
        residual = self.inputs['Y'] - self.inputs['X']
-        loss = np.vectorize(huber_loss_forward)(residual, delta)
+        loss = np.vectorize(huber_loss_forward)(residual,
+                                                delta).astype('float32')
        self.attrs = {'delta': delta}
        self.outputs = {
            'Residual': residual,
@@ -43,6 +44,5 @@ class TestHuberLossOp(OpTest):
            ['X'], 'Out', max_relative_error=0.008, no_grad_set=set('residual'))


-# TODO(typhoonzero): should add this back till we fix it
-#if __name__ == '__main__':
-#    unittest.main()
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_positive_negative_pair_op.py
+++ b/python/paddle/v2/framework/tests/test_positive_negative_pair_op.py
+import unittest
+import itertools
+import numpy as np
+from op_test import OpTest
+
+
+def py_pnpair_op(score, label, query, column=-1, weight=None):
+    # group by query id
+    predictions = {}
+    batch_size = label.shape[0]
+    if weight is None:
+        weight = np.ones(shape=(batch_size, 1)).astype('float32')
+    for s, l, q, w in zip(score, label, query, weight):
+        s, l, q, w = s[column], l[0], q[0], w[0]
+        if q not in predictions:
+            predictions[q] = []
+        predictions[q].append((s, l, w))
+
+    # accumulate statistics
+    pos, neg, neu = 0, 0, 0
+    for _, ranks in predictions.items():
+        for e1, e2 in itertools.combinations(ranks, 2):
+            s1, s2, l1, l2, w1, w2 = e1[0], e2[0], e1[1], e2[1], e1[2], e2[2]
+            w = (w1 + w2) * 0.5
+            if l1 == l2:
+                continue
+            if s1 == s2:
+                neu += w
+            elif (s1 - s2) * (l1 - l2) > 0:
+                pos += w
+            else:
+                neg += w
+
+    return np.array(pos).astype('float32'), np.array(neg).astype(
+        'float32'), np.array(neu).astype('float32')
+
+
+class TestPositiveNegativePairOp(OpTest):
+    def setUp(self):
+        self.op_type = 'positive_negative_pair'
+        batch_size = 20
+        max_query_id = 5
+        score = np.random.normal(size=(batch_size, 1)).astype('float32')
+        label = np.random.normal(size=(batch_size, 1)).astype('float32')
+        query = np.array(
+            [np.random.randint(max_query_id) for i in range(batch_size)])
+        query = np.reshape(query, newshape=(batch_size, 1)).astype('int64')
+
+        pos, neg, neu = py_pnpair_op(score, label, query)
+        self.inputs = {'Score': score, 'Label': label, 'QueryID': query}
+        self.attrs = {'column': -1}
+        self.outputs = {
+            'PositivePair': pos,
+            'NegativePair': neg,
+            'NeutralPair': neu
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestPositiveNegativePairOpAccumulateWeight(OpTest):
+    def setUp(self):
+        self.op_type = 'positive_negative_pair'
+        batch_size = 20
+        max_query_id = 5
+        max_random_num = 2 << 15
+        score_dim = 2
+        score = np.random.normal(size=(batch_size, 2)).astype('float32')
+        label = np.random.normal(size=(batch_size, 1)).astype('float32')
+        weight = np.random.normal(size=(batch_size, 1)).astype('float32')
+        query = np.array(
+            [np.random.randint(max_query_id) for i in range(batch_size)])
+        query = np.reshape(query, newshape=(batch_size, 1)).astype('int64')
+        acc_pos = np.reshape(
+            np.random.randint(max_random_num), newshape=(1)).astype('float32')
+        acc_neg = np.reshape(
+            np.random.randint(max_random_num), newshape=(1)).astype('float32')
+        acc_neu = np.reshape(
+            np.random.randint(max_random_num), newshape=(1)).astype('float32')
+        column = np.random.randint(score_dim)
+
+        pos, neg, neu = py_pnpair_op(
+            score, label, query, column=column, weight=weight)
+        self.inputs = {
+            'Score': score,
+            'Label': label,
+            'QueryID': query,
+            'AccumulatePositivePair': acc_pos,
+            'AccumulateNegativePair': acc_neg,
+            'AccumulateNeutralPair': acc_neu,
+            'Weight': weight
+        }
+        self.attrs = {'column': column}
+        self.outputs = {
+            'PositivePair': pos + acc_pos,
+            'NegativePair': neg + acc_neg,
+            'NeutralPair': neu + acc_neu
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
 import unittest

-import logging
-
-from op_test import get_numeric_gradient
-from paddle.v2.framework.layers import *
+import paddle.v2.framework.layers as layers
 from paddle.v2.framework.framework import Program
 from paddle.v2.framework.executor import Executor
 from paddle.v2.framework.backward import append_backward_ops
@@ -16,8 +13,8 @@ class PyRNNBase(object):
        self.x = np.ones(shape=input_shape).astype("float32")
        self.y = np.zeros(shape=output_shape).astype("float32")

-    def step(self):
-        pass
+    def step(self, step_id, x):
+        raise NotImplementedError

    def forward(self):
        for step_id in range(self.x.shape[0]):
@@ -116,30 +113,30 @@ class RecurrentOpTest1(unittest.TestCase):
        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
        self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape)

-        self.output = mean(x=self.create_rnn_op(), **self.p_info)
+        self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)

    def create_rnn_op(self):
-        x = data(
+        x = layers.data(
            shape=[self.sent_len, self.batch_size, self.input_dim],
            data_type='float32',
            name='x',
            append_batch_size=False,
            **self.p_info)
        x.stop_gradient = False
-        h_boot = data(
+        h_boot = layers.data(
            shape=[self.input_dim],
            data_type='float32',
            name='h_boot',
            **self.p_info)
        h_boot.stop_gradient = False

-        rnn = StaticRNN(main_program=self.main_program)
+        rnn = layers.StaticRNN(main_program=self.main_program)
        with rnn.step():
            h_pre = rnn.memory(init=h_boot)
            x_t = rnn.step_input(x)

-            h = scale(
-                x=elementwise_add(
+            h = layers.scale(
+                x=layers.elementwise_add(
                    x=h_pre, y=x_t, **self.p_info),
                scale=self.py_rnn.scale,
                **self.p_info)
@@ -249,41 +246,41 @@ class RecurrentOpTest2(RecurrentOpTest1):
        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
        self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape)

-        self.output = mean(x=self.create_rnn_op(), **self.p_info)
+        self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)

    def create_rnn_op(self):
-        x = data(
+        x = layers.data(
            shape=[self.sent_len, self.batch_size, self.input_dim],
            data_type='float32',
            name='x',
            append_batch_size=False,
            **self.p_info)
        x.stop_gradient = False
-        h_boot = data(
+        h_boot = layers.data(
            shape=[self.input_dim],
            data_type='float32',
            name='h_boot',
            **self.p_info)
        h_boot.stop_gradient = False

-        rnn = StaticRNN(main_program=self.main_program)
+        rnn = layers.StaticRNN(main_program=self.main_program)
        with rnn.step():
            h_pre = rnn.memory(init=h_boot)
            x_t = rnn.step_input(x)

-            temp_l = fc(input=x_t,
-                        size=self.input_dim,
-                        param_attr={'name': 'W'},
-                        bias_attr=False,
-                        **self.p_info)
-            temp_r = fc(input=h_pre,
-                        size=self.input_dim,
-                        param_attr={'name': 'U'},
-                        bias_attr=False,
-                        **self.p_info)
-
-            h = sigmoid(
-                x=elementwise_add(
+            temp_l = layers.fc(input=x_t,
+                               size=self.input_dim,
+                               param_attr={'name': 'W'},
+                               bias_attr=False,
+                               **self.p_info)
+            temp_r = layers.fc(input=h_pre,
+                               size=self.input_dim,
+                               param_attr={'name': 'U'},
+                               bias_attr=False,
+                               **self.p_info)
+
+            h = layers.sigmoid(
+                x=layers.elementwise_add(
                    x=temp_l, y=temp_r, **self.p_info),
                **self.p_info)

@@ -293,7 +290,7 @@ class RecurrentOpTest2(RecurrentOpTest1):
        return rnn()


-class RecurrentOpTest3(RecurrentOpTest1):
+class RecurrentOpMultipleMemoryTest(RecurrentOpTest1):
    '''
    Test RNNOp with two memories
    equation:
@@ -310,8 +307,8 @@ class RecurrentOpTest3(RecurrentOpTest1):

    class PySimpleRNN3(PyRNNBase):
        def __init__(self, input_shape, output_shape):
-            super(RecurrentOpTest3.PySimpleRNN3, self).__init__(input_shape,
-                                                                output_shape)
+            super(RecurrentOpMultipleMemoryTest.PySimpleRNN3, self).__init__(
+                input_shape, output_shape)

            seq_len, batch_size, input_dim = input_shape
            self.h_boot1 = np.random.normal(size=(batch_size,
@@ -345,27 +342,27 @@ class RecurrentOpTest3(RecurrentOpTest1):

        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = RecurrentOpTest3.PySimpleRNN3(self.input_shape,
-                                                    self.output_shape)
+        self.py_rnn = RecurrentOpMultipleMemoryTest.PySimpleRNN3(
+            self.input_shape, self.output_shape)

-        self.output = mean(x=self.create_rnn_op(), **self.p_info)
+        self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)

    def create_rnn_op(self):
-        x = data(
+        x = layers.data(
            shape=[self.sent_len, self.batch_size, self.input_dim],
            data_type='float32',
            name='x',
            append_batch_size=False,
            **self.p_info)
        x.stop_gradient = False
-        h_boot1 = data(
+        h_boot1 = layers.data(
            shape=[self.batch_size, self.input_dim],
            data_type='float32',
            name='h_boot1',
            append_batch_size=False,
            **self.p_info)
        h_boot1.stop_gradient = False
-        h_boot2 = data(
+        h_boot2 = layers.data(
            shape=[self.batch_size, self.input_dim],
            data_type='float32',
            name='h_boot2',
@@ -373,15 +370,15 @@ class RecurrentOpTest3(RecurrentOpTest1):
            **self.p_info)
        h_boot2.stop_gradient = False

-        rnn = StaticRNN(main_program=self.main_program)
+        rnn = layers.StaticRNN(main_program=self.main_program)
        with rnn.step():
            h_pre1 = rnn.memory(init=h_boot1)
            h_pre2 = rnn.memory(init=h_boot2)
            x_t = rnn.step_input(x)

-            mem1 = scale(x=h_pre1, scale=1.0, **self.p_info)
-            mem2 = scale(x=h_pre2, scale=1.0, **self.p_info)
-            out = sums(input=[mem1, x_t, mem2], **self.p_info)
+            mem1 = layers.scale(x=h_pre1, scale=1.0, **self.p_info)
+            mem2 = layers.scale(x=h_pre2, scale=1.0, **self.p_info)
+            out = layers.sums(input=[mem1, x_t, mem2], **self.p_info)

            rnn.update_memory(h_pre1, mem1)
            rnn.update_memory(h_pre2, mem2)
@@ -390,5 +387,70 @@ class RecurrentOpTest3(RecurrentOpTest1):
        return rnn()


+class RecurrentOpNoMemBootTest(RecurrentOpTest1):
+    '''
+    Test RNNOp with two memories
+    equation:
+        mem = x + mem_pre
+        y = mem
+    vars:
+        - x
+    memories:
+        - mem
+    outputs:
+       - y
+    '''
+
+    class PySimpleRNN4(PyRNNBase):
+        def __init__(self, input_shape, output_shape):
+            super(RecurrentOpNoMemBootTest.PySimpleRNN4, self).__init__(
+                input_shape, output_shape)
+            men_dim = input_shape
+            self.mems = np.zeros(shape=men_dim).astype("float32")
+
+        def step(self, step_id, x):
+            if step_id == 0:
+                pre_mem = np.zeros_like(x)
+            else:
+                pre_mem = self.mems[step_id - 1]
+            self.mems[step_id] = pre_mem + x
+            self.y[step_id] = self.mems[step_id]
+
+    input_dim = 1
+    batch_size = 1
+    sent_len = 2
+
+    def setUp(self):
+        self.setup_program()
+
+        self.data_field = {"x"}
+
+        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape,
+                                                            self.output_shape)
+        self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
+        print self.main_program
+
+    def create_rnn_op(self):
+        x = layers.data(
+            shape=[self.sent_len, self.batch_size, self.input_dim],
+            data_type='float32',
+            name='x',
+            append_batch_size=False,
+            **self.p_info)
+        x.stop_gradient = False
+
+        rnn = layers.StaticRNN(main_program=self.main_program)
+        with rnn.step():
+            mem_pre = rnn.memory(shape=[-1, self.input_dim], batch_ref=x)
+            x_t = rnn.step_input(x)
+            mem = layers.elementwise_add(x=mem_pre, y=x_t, **self.p_info)
+            rnn.update_memory(mem_pre, mem)
+            rnn.output(mem)
+
+        return rnn()
+
+
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
@@ -29,6 +29,9 @@ class TestSeqAvgPool(OpTest):
        self.check_output()

    def test_check_grad(self):
+        # Remove MaxIndex after check_grad is refined.
+        self.outputs['MaxIndex'] = \
+            np.zeros(self.outputs['Out'].shape).astype('int32')
        self.check_grad(["X"], "Out")


@@ -85,31 +88,53 @@ class TestSeqSqrtPool2D(TestSeqAvgPool2D):
            out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(len), (3, 17))

    def test_check_grad(self):
+        # Remove MaxIndex after check_grad is refined.
+        self.outputs['MaxIndex'] = \
+            np.zeros(self.outputs['Out'].shape).astype('int32')
        self.check_grad(["X"], "Out", max_relative_error=0.06)


 class TestSeqMaxPool(TestSeqAvgPool):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
+        lod = [[0, 4, 5, 8, 13]]
+        for i in range(4):
+            l = lod[0][i + 1] - lod[0][i]
+            x[lod[0][i] + np.random.randint(l), :] += 2.0
+
+        self.inputs = {'X': (x, lod)}
+
+        out = np.zeros((4, 23)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+
    def compute(self, x, lod, out):
        self.attrs = {'pooltype': "MAX"}
        for i in range(4):
            sub_x = x[lod[0][i]:lod[0][i + 1], :]
            out[i] = np.amax(sub_x, axis=0)

-    def test_check_grad(self):
-        # Remove MaxPool2D from gradient check to confirm the success of CI.
-        return
-

 class TestSeqMaxPool2D(TestSeqAvgPool2D):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
+        lod = [[0, 4, 5, 8, 13]]
+        self.inputs = {'X': (x, lod)}
+        for i in range(4):
+            l = lod[0][i + 1] - lod[0][i]
+            x[lod[0][i] + np.random.randint(l), :] += 1.0
+
+        out = np.zeros((4, 3, 11)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+
    def compute(self, x, lod, out):
        self.attrs = {'pooltype': "MAX"}
        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
-            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 17))
-
-    def test_check_grad(self):
-        # Remove MaxPool2D from gradient check to confirm the success of CI.
-        return
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 11))
+            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))


 class TestSeqLastPool(TestSeqAvgPool):

--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
@@ -12,30 +12,30 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):

    def setUp(self):
        self.op_type = "softmax_with_cross_entropy"
-        batch_size = 3
+        batch_size = 2
        class_num = 37

        logits = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float32")
+                                   [batch_size, class_num]).astype("float64")
        softmax = np.apply_along_axis(stable_softmax, 1, logits)
-        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int32")
+        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")

        cross_entropy = np.asmatrix(
            [[-np.log(softmax[i][labels[i][0]])]
             for i in range(softmax.shape[0])],
-            dtype="float32")
+            dtype="float64")

        self.inputs = {"Logits": logits, "Label": labels}
        self.outputs = {
-            "Softmax": softmax.astype('float32'),
-            "Loss": cross_entropy.astype('float32')
+            "Softmax": softmax.astype("float64"),
+            "Loss": cross_entropy.astype("float64")
        }

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
+        self.check_grad(["Logits"], "Loss")


 class TestSoftmaxWithCrossEntropyOp2(OpTest):
@@ -49,19 +49,19 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
        class_num = 37

        logits = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float32")
+                                   [batch_size, class_num]).astype("float64")
        softmax = np.apply_along_axis(stable_softmax, 1, logits)
        labels = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float32")
+                                   [batch_size, class_num]).astype("float64")
        labels /= np.sum(labels, axis=1, keepdims=True)

        cross_entropy = (-labels * np.log(softmax)).sum(
-            axis=1, keepdims=True).astype("float32")
+            axis=1, keepdims=True).astype("float64")

        self.inputs = {"Logits": logits, "Label": labels}
        self.outputs = {
-            "Softmax": softmax.astype('float32'),
-            "Loss": cross_entropy.astype('float32')
+            "Softmax": softmax.astype("float64"),
+            "Loss": cross_entropy.astype("float64")
        }
        self.attrs = {"soft_label": True}

@@ -69,9 +69,8 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
        self.check_output()

    def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
+        self.check_grad(["Logits"], "Loss")


 if __name__ == "__main__":
-    exit(0)  # FIXME: xe has bug
    unittest.main()
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -11,11 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-Optimizers(update equation) for SGD method.
-
-TODO(yuyang18): Complete comments.
-"""

 import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
 import paddle.trainer_config_helpers.optimizers as v1_optimizers
@@ -101,32 +96,37 @@ class Optimizer(object):

 class Momentum(Optimizer):
    """
-    SGD Optimizer.
-
-    SGD is an optimization method, trying to find a neural network that
-    minimize the "cost/error" of it by iteration. In paddle's implementation
-    SGD Optimizer is synchronized, which means all gradients will be wait to
-    calculate and reduced into one gradient, then do optimize operation.
+    Momentum Optimizer.

-    The neural network consider the learning problem of minimizing an objective
-    function, that has the form of a sum
+    When sparse=False, the momentum update formula is as follows:

    ..  math::

-        Q(w) = \\sum_{i}^{n} Q_i(w)
+        v_{t} &= k * v_{t-1} - \\gamma_t / (g_{t} + \\lambda w_{t-1}) \\\\
+        w_{t} &= w_{t-1} + v_{t} \\\\

-    The value of function Q sometimes is the cost of neural network (Mean
-    Square Error between prediction and label for example). The function Q is
-    parametrised by w, the weight/bias of neural network. And weights is what to
-    be learned. The i is the i-th observation in (trainning) data.
+    where, :math:`k` is momentum, :math:`\\lambda` is decay rate,
+    :math:`\\gamma_t` is learning rate at the t'th iteration.
+    :math:`w_{t}` is the weight as the t'th iteration.
+    And the :math:`v_{t}` is the history momentum variable.

-    So, the SGD method will optimize the weight by
+    When sparse=True, the update scheme:

    ..  math::

-        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
-
-    where :math:`\\eta` is learning rate. And :math:`n` is batch size.
+        \\alpha_t &= \\alpha_{t-1} / k \\\\
+        \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\
+        u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\
+        v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\
+        \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t
+    
+    where :math:`k` is momentum, :math:`\\lambda` is decay rate, 
+    :math:`\\gamma_t` is learning rate at the t'th iteration.
+
+    :param momentum: the momentum factor.
+    :type momentum: float
+    :param sparse: with sparse support or not, False by default.
+    :type sparse: bool
    """

    def __init__(self, momentum=None, sparse=False, **kwargs):
@@ -146,7 +146,7 @@ class Adam(Optimizer):

        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}

    :param beta1: the :math:`\\beta_1` in equation.
    :type beta1: float

--- a/python/setup.py.in
+++ b/python/setup.py.in
-from setuptools import setup, Distribution
+from setuptools import setup, Distribution, Extension
 class BinaryDistribution(Distribution):
    def has_ext_modules(foo):
        return True
@@ -41,6 +41,7 @@ setup(name='paddlepaddle',
      description='Parallel Distributed Deep Learning',
      install_requires=setup_requires,
      packages=packages,
+      ext_modules=[Extension('_foo', ['stub.cc'])],
      package_data={
        'paddle.v2.master': ['libpaddle_master.so'],
        'paddle.v2.framework': ['core.so'],
@@ -54,6 +55,5 @@ setup(name='paddlepaddle',
          'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
      },
      scripts=paddle_bins,
-      distclass=BinaryDistribution,
      data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
 )