diff --git a/CMakeLists.txt b/CMakeLists.txt
index 264420ad830ed39b38f1918951d8d66c84fd5ee9..fd3582a1bca199d62d19550ffdd1efe9db520fa7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,7 +126,7 @@ include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
-include(external/pybind11)    # download pybind11
+include(external/pybind11)  # download pybind11
 include(external/nccl)
 
 include(cudnn)              # set cudnn libraries, must before configure
diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake
index 0b38943952f7fb9052368fe95eb31dd7592d8a47..310450f7d009dc0cdae9c0079a96445af8ec8f95 100644
--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
@@ -79,9 +79,8 @@ if(NOT DEFINED IOS_ARCH)
     # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
     set(IOS_ARCH "arm64")
   elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
-    set(IOS_ARCH "i386;x86_64")
-  elseif(IOS_PLATFORM STREQUAL "WATCHOS")
-    set(IOS_ARCH armv7k)
+    # FIXME(liuyiqun): support "i386;x86_64" future
+    set(IOS_ARCH "x86_64")
   endif()
 endif()
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string  "Build architecture for iOS")
diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake
index 57d2c0a352507afd01d1cbf2c7b23c00ff7ad81b..fc43766efafc3d3e16f2906ce7f9a3d692c8e4ff 100644
--- a/cmake/external/nccl.cmake
+++ b/cmake/external/nccl.cmake
@@ -1,3 +1,21 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT WITH_GPU)
+  return()
+endif()
+
 include(ExternalProject)
 
 set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 143b57a954e4e6b2bf273535ebdf0fa8e3dab768..3f86e456cfbe55fe47e5b18e755e34829ebe9930 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
index 9391c285c7544669a5b1a078b7473d7a656c1bb4..4e87dc49d8956d1fa6dec777efc5a63c6b0f79a5 100644
--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
@@ -1,8 +1,26 @@
-INCLUDE(ExternalProject)
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
-SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+if(NOT WITH_PYTHON)
+    return()
+endif()
+
+include(ExternalProject)
 
-INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
+set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+
+include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
 
 ExternalProject_Add(
         extern_pybind
@@ -17,14 +35,12 @@ ExternalProject_Add(
         TEST_COMMAND      ""
 )
 
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
     set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+    file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
     add_library(pybind STATIC ${dummyfile})
 else()
     add_library(pybind INTERFACE)
 endif()
 
 add_dependencies(pybind extern_pybind)
-
-LIST(APPEND external_project_dependencies pybind)
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
index ce088ae7eaa3355f2f9761e8c421da0d7ef89fa7..9db457c7b2d61228e5d5af6827c4cda11a20a463 100644
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index e2c9fe56f335ae5b627b4d8d4bb17e4a2a466677..a98e069b7cd1654ddd5868560d0905eab6d9c692 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 46035a908b588861607a25d3a21cf34b7b6fd4b8..53c2de332ea74b06d1bd6e5bb119cad6af27ed01 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -1,27 +1,28 @@
 # This file is use to check all support level of AVX on your machine
 # so that PaddlePaddle can unleash the vectorization power of muticore.
 
-INCLUDE(CheckCXXSourceRuns)
-INCLUDE(CheckCXXSourceCompiles)
+include(CheckCXXSourceRuns)
+include(CheckCXXSourceCompiles)
 
-IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     set(MMX_FLAG "-mmmx")
     set(SSE2_FLAG "-msse2")
     set(SSE3_FLAG "-msse3")
-    SET(AVX_FLAG "-mavx")
-    SET(AVX2_FLAG "-mavx2")
-ELSEIF(MSVC)
+    set(AVX_FLAG "-mavx")
+    set(AVX2_FLAG "-mavx2")
+elseif(MSVC)
     set(MMX_FLAG "/arch:MMX")
     set(SSE2_FLAG "/arch:SSE2")
     set(SSE3_FLAG "/arch:SSE3")
     SET(AVX_FLAG "/arch:AVX")
     SET(AVX2_FLAG "/arch:AVX2")
-ENDIF()
+endif()
 
 set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
 
 # Check  MMX
 set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
+set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <mmintrin.h>
 int main()
@@ -32,6 +33,7 @@ int main()
 
 # Check SSE2
 set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
+set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <emmintrin.h>
 int main()
@@ -42,6 +44,7 @@ int main()
 
 # Check SSE3
 set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
+set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <pmmintrin.h>
 int main()
@@ -55,6 +58,7 @@ int main()
 
 # Check AVX
 set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
@@ -67,6 +71,7 @@ int main()
 
 # Check AVX 2
 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 30b144d849bec367cd0197b6082889e011193a9a..0d34dec8e908c5e61001500725187a2233797f46 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -145,7 +145,7 @@ PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以
 
 Jupyter Notebook是一个开源的web程序，大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
 
-PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Nodebook。
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
 如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
 
 我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index e767856d5012fd205f6b57f9721d0cbca8dc46ed..d267b14657be2a773d1dacfd9ac3767cddc47415 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -29,32 +29,32 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
 add_dependencies(paddle_capi paddle_proto)
 
 # TODO: paddle_capi_whole will be removed.
+set(PADDLE_CAPI_LAYERS_LIBS
+    paddle_function
+    paddle_gserver)
 if(MOBILE_INFERENCE)
-    set(PADDLE_CAPI_INFER_LIBS
-        paddle_utils
-        paddle_parameter
-        paddle_math
-        paddle_cuda
-        paddle_function
-        paddle_gserver
-        paddle_proto)
+  set(PADDLE_CAPI_ENGINE_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_proto)
 else()
-    set(PADDLE_CAPI_INFER_LIBS
-        paddle_utils
-        paddle_parameter
-        paddle_math
-        paddle_cuda
-        paddle_function
-        paddle_gserver
-        paddle_proto
-        paddle_pserver
-        paddle_network)
+  set(PADDLE_CAPI_ENGINE_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_proto
+      paddle_pserver
+      paddle_network)
 endif()
+set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
 cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
 
 # Link the static library for inference
-cc_library(paddle_capi_engine DEPS paddle_capi paddle_utils paddle_parameter paddle_math paddle_cuda paddle_proto)
-cc_library(paddle_capi_layers DEPS paddle_function paddle_gserver)
+cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
+cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})
 
 # Link the shared library for inference
 if(NOT IOS)
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index c1a009f131c803f7881faf2e322eecf3e6e8ea05..2fcf41d69f0011b0d9a3d89c97fcebacb0703e97 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
 
@@ -73,6 +74,8 @@ static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
     var->GetMutable<std::vector<framework::Scope>>();
   } else if (var_type == VarDesc::LOD_RANK_TABLE) {
     var->GetMutable<LoDRankTable>();
+  } else if (var_type == VarDesc::LOD_TENSOR_ARRAY) {
+    var->GetMutable<LoDTensorArray>();
   } else {
     PADDLE_THROW(
         "Variable type %d is not in "
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 54ce461ce81955c9c3b550998e4a1df32c201ab3..f1fc4529e15502927560eefd74110f6ca7eab4a9 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -109,6 +109,11 @@ message LoDTensorDesc {
   optional int32 lod_level = 2 [ default = 0 ];
 }
 
+message LoDTensorArrayDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2 [ default = 0 ];
+}
+
 message VarDesc {
   enum VarType {
     LOD_TENSOR = 1;
@@ -117,11 +122,13 @@ message VarDesc {
     FETCH_LIST = 4;
     STEP_SCOPES = 5;
     LOD_RANK_TABLE = 6;
+    LOD_TENSOR_ARRAY = 7;
   }
   required string name = 1;
   required VarType type = 2;
   optional LoDTensorDesc lod_tensor = 3;
   optional TensorDesc selected_rows = 4;
+  optional LoDTensorArrayDesc tensor_array = 6;
   optional bool persistable = 5 [ default = false ];
 }
 
diff --git a/paddle/framework/lod_rank_table.cc b/paddle/framework/lod_rank_table.cc
index f9abf902a13f3ebe5ea75c7456b117462cec8da5..68a83def7e5a12fe3261be9e27cbb9bb54e1f8ad 100644
--- a/paddle/framework/lod_rank_table.cc
+++ b/paddle/framework/lod_rank_table.cc
@@ -33,10 +33,15 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
     item.length = vec[i + 1] - vec[i];
     items_.emplace_back(item);
   }
-  std::sort(items_.begin(), items_.end(),
-            [](const TableItem& a, const TableItem& b) {
-              return a.length > b.length;
-            });
+  // NOTE(yuyang18):
+  //
+  // The time complexity of stable_sort is O(N*log(N)) if additional memory is
+  // available. It is easy to debug and unit test when using `stable_sort`
+  // instead of `sort`. Also, the items of a rank table will not be too large.
+  std::stable_sort(items_.begin(), items_.end(),
+                   [](const TableItem& a, const TableItem& b) {
+                     return a.length > b.length;
+                   });
 }
 
 }  // namespace framework
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 584308a5388da0d02d29f71a28097b02b6ea825f..2bcfffb134f46416301b28043e1875e822fbc3e4 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -135,5 +135,43 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
   PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty.");
   ShareDataWith(Slice(begin, end));
 }
+
+void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
+                             std::vector<std::vector<size_t>>* lod_length,
+                             size_t* start_offset) {
+  lod_length->clear();
+  PADDLE_ENFORCE(start_idx < lod.size() - 1,
+                 "start_idx should be >= 0 and < lod.size() - 1.");
+  PADDLE_ENFORCE(end_idx < lod.size(),
+                 "end_idx should be >= 0 and < lod.size().");
+  PADDLE_ENFORCE_LE(start_idx, end_idx,
+                    "start_idx should be less than end_idx.");
+  for (size_t level_idx = 0; level_idx < lod.size(); ++level_idx) {
+    std::vector<size_t> level_lens;
+    for (size_t i = start_idx; i < end_idx; ++i) {
+      level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
+    }
+    lod_length->emplace_back(level_lens);
+    start_idx = lod[level_idx][start_idx];
+    end_idx = lod[level_idx][end_idx];
+  }
+  *start_offset = start_idx;
+}
+
+void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length) {
+  PADDLE_ENFORCE_EQ(
+      lod->size(), lod_length.size(),
+      "The lod_length should has the same size with the appended lod.");
+  for (size_t i = 0; i < lod->size(); ++i) {
+    auto& level = (*lod)[i];
+    if (level.empty()) {
+      level.push_back(0);
+    }
+    for (size_t len : lod_length[i]) {
+      level.push_back(level.back() + len);
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index f4fe4cdac6019a1899fd3db8e1b6ca588be0d436..1437da399a28288429527f9672ace0482825159f 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -181,5 +181,11 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
   return tensor;
 }
 
+void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
+                             std::vector<std::vector<size_t>>* lod_length,
+                             size_t* start_offset);
+
+void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor_array.h b/paddle/framework/lod_tensor_array.h
new file mode 100644
index 0000000000000000000000000000000000000000..13f0608d24be97d8bba149b74f1a4deb57deeb48
--- /dev/null
+++ b/paddle/framework/lod_tensor_array.h
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "paddle/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+using LoDTensorArray = std::vector<LoDTensor>;
+}
+}  // namespace paddle
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index aa2f6c993d41ae98e0769d470dccad3b410da53e..bf61c9ee7aa99b06e78b9b27dff72d216c74a86e 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -144,5 +144,47 @@ TEST(LodExpand, test) {
   }
 }
 
+TEST(LoD, GetFineGrainedLoDLength) {
+  LoD lod;
+  lod.push_back(std::vector<size_t>{0, 2, 4, 5});
+  lod.push_back(std::vector<size_t>{0, 1, 6, 8, 10, 11});
+  lod.push_back(
+      std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29});
+
+  std::vector<std::vector<size_t>> lod_length;
+  size_t start_offset;
+  paddle::framework::GetFineGrainedLoDLength(lod, 1, 2, &lod_length,
+                                             &start_offset);
+
+  std::vector<std::vector<size_t>> expected;
+  expected.push_back(std::vector<size_t>{2});
+  expected.push_back(std::vector<size_t>{2, 2});
+  expected.push_back(std::vector<size_t>{2, 3, 4, 2});
+  EXPECT_EQ(lod_length, expected);
+  EXPECT_EQ(start_offset, 15UL);
+}
+
+TEST(LoD, AppendLoD) {
+  std::vector<std::vector<size_t>> lod_lens;
+  lod_lens.push_back(std::vector<size_t>{2});
+  lod_lens.push_back(std::vector<size_t>{2, 2});
+  lod_lens.push_back(std::vector<size_t>{2, 3, 4, 2});
+
+  LoD origin;
+  origin.push_back(std::vector<size_t>{0, 2});
+  origin.push_back(std::vector<size_t>{0, 1, 6});
+  origin.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15});
+
+  paddle::framework::AppendLoD(&origin, lod_lens);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>{0, 2, 4});
+  expected.push_back(std::vector<size_t>{0, 1, 6, 8, 10});
+  expected.push_back(
+      std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26});
+
+  EXPECT_EQ(origin, expected);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 8e92c81d1137472737230be79d71824593d3256f..16aca192d41a64003de85ce45f7697bf45c556ed 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -37,13 +37,27 @@ std::vector<int64_t> VarDescBind::Shape() const {
 DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); }
 
 void VarDescBind::SetLoDLevel(int32_t lod_level) {
-  PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
-  desc_.mutable_lod_tensor()->set_lod_level(lod_level);
+  switch (desc_.type()) {
+    case VarDesc::LOD_TENSOR:
+      desc_.mutable_lod_tensor()->set_lod_level(lod_level);
+      break;
+    case VarDesc::LOD_TENSOR_ARRAY:
+      desc_.mutable_tensor_array()->set_lod_level(lod_level);
+      break;
+    default:
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+  }
 }
 
 int32_t VarDescBind::GetLodLevel() const {
-  PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
-  return desc_.lod_tensor().lod_level();
+  switch (desc_.type()) {
+    case VarDesc::LOD_TENSOR:
+      return desc_.lod_tensor().lod_level();
+    case VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.tensor_array().lod_level();
+    default:
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+  }
 }
 
 const TensorDesc &VarDescBind::tensor_desc() const {
@@ -53,6 +67,8 @@ const TensorDesc &VarDescBind::tensor_desc() const {
       return desc_.selected_rows();
     case VarDesc::LOD_TENSOR:
       return desc_.lod_tensor().tensor();
+    case VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.tensor_array().tensor();
     default:
       PADDLE_THROW("Unexpected branch.");
   }
@@ -66,6 +82,8 @@ TensorDesc *VarDescBind::mutable_tensor_desc() {
       return desc_.mutable_selected_rows();
     case VarDesc::LOD_TENSOR:
       return desc_.mutable_lod_tensor()->mutable_tensor();
+    case VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.mutable_tensor_array()->mutable_tensor();
     default:
       PADDLE_THROW("Unexpected branch.");
   }
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8eb700723f2cf7dda969739bb5e3d48358d278a0
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -0,0 +1,154 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNAddtoLayer.h"
+
+using namespace mkldnn;  // NOLINT
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_addto, MKLDNNAddtoLayer);
+
+bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
+                            const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  layerSize_ = getSize();
+  for (size_t i = 0; i < inputLayers_.size(); i++) {
+    CHECK_EQ(layerSize_, inputLayers_[i]->getSize()) << "input size must equal";
+  }
+  if (biasParameter_.get() != NULL) {
+    biases_ =
+        std::unique_ptr<Weight>(new Weight(1, layerSize_, biasParameter_, 0));
+  }
+  return true;
+}
+
+void MKLDNNAddtoLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+  CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
+  reshapeInput(bs, ih, iw);
+  ic = inputLayers_[0]->getSize() / ih / iw;
+  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
+  CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
+  for (size_t i = 0; i < inputLayers_.size(); i++) {
+    CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
+    CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
+  }
+
+  oc = ic;
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+  printSizeInfo();
+}
+
+void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
+                                MKLDNNMatrixPtr& in,
+                                MKLDNNMatrixPtr& wgt,
+                                MKLDNNMatrixPtr& bias,
+                                MKLDNNMatrixPtr& out) {
+  if (biases_) {
+    LOG(FATAL) << "not implemented yet";
+  }
+  resetFwdBuffers(inVals_, out);
+  in = inVals_[0];
+
+  std::shared_ptr<sum::primitive_desc> fwdPD;
+  resetFwdPD(fwdPD, inVals_, out);
+
+  resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+}
+
+void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
+                                MKLDNNMatrixPtr& in,
+                                MKLDNNMatrixPtr& wgt,
+                                MKLDNNMatrixPtr& bias,
+                                MKLDNNMatrixPtr& out) {
+  resetBwdBuffers(inGrads_, out);
+  in = inGrads_[0];
+
+  // backward only need share output grad to input grad
+  for (size_t i = 0; i < inGrads_.size(); i++) {
+    if (inGrads_[i] != nullptr) {
+      inGrads_[i] = out;
+      inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
+    }
+  }
+}
+
+void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                       MKLDNNMatrixPtr& out) {
+  inputs.resize(inputLayers_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    resetInValue(inputs[i], nullptr, i);
+    CHECK(inputs[i]);
+    inputs[i]->downSpatial();
+  }
+  for (size_t i = 1; i < inputs.size(); i++) {
+    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inputs[0]->getPrimitiveDesc());
+  }
+
+  resetOutValue(out, inputs[0]->getPrimitiveDesc());
+}
+
+void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
+                                  std::vector<MKLDNNMatrixPtr>& inputs,
+                                  MKLDNNMatrixPtr out) {
+  std::vector<double> scales(inputs.size(), 1.0);
+  std::vector<memory::primitive_desc> srcPDs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
+  }
+  CHECK(out);
+  pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
+  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+}
+
+void MKLDNNAddtoLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<sum::primitive_desc>& pd,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& out) {
+  std::vector<primitive::at> srcs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcs.push_back(*(inputs[i]));
+  }
+  fwd_.reset(new sum(*pd, srcs, *out));
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                       MKLDNNMatrixPtr& out) {
+  CHECK(outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  CHECK(out);
+
+  inputs.resize(inputLayers_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
+    CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/gserver/layers/MKLDNNAddtoLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..15f74ec5bdf3d1e4ae5e09051be6be418590a67a
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -0,0 +1,110 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of MKLDNNLayer Addto layer.
+ *
+ * The config file api is mkldnn_addto
+ */
+class MKLDNNAddtoLayer : public MKLDNNLayer {
+protected:
+  std::vector<MKLDNNMatrixPtr> inVals_;
+  std::vector<MKLDNNMatrixPtr> inGrads_;
+
+  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
+  size_t layerSize_;
+
+  // TODO(TJ): this part has not been optimized by MKL-DNN
+  std::unique_ptr<Weight> biases_;
+
+public:
+  explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNAddtoLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateWeights(const UpdateCallback& callback) override;
+
+  void printValueFormat() override {
+    for (size_t i = 0; i < inVals_.size(); ++i) {
+      VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
+    }
+    if (outVal_) {
+      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
+    }
+    if (extOutVal_) {
+      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
+    }
+  }
+
+  void printGradFormat() override {
+    if (extOutGrad_) {
+      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
+    }
+    if (outGrad_) {
+      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
+    }
+    for (size_t i = 0; i < inGrads_.size(); ++i) {
+      VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
+    }
+  }
+
+protected:
+  /**
+   * Forward functions: reset buffers(inputs, output, bias),
+   *                    reset primitive descriptor,
+   *                    reset pipeline.
+   */
+  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                  std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+
+  /**
+   * Backward functions: reset buffers(inputs, output, bias)
+   */
+  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
index 663a10509857ec9fb487c1cda1621bdfac1250ac..5fd62f4f73b18df683ccf74143e45054c3631c22 100644
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -77,7 +77,7 @@ void MKLDNNLayer::forward(PassType passType) {
       needResetBwd_ = true;
     }
 
-    if (inputLayers_[0]->getType() == "data") {
+    if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
       // Update input value data when input layer is "data" type,
       // since the input value data address might be changed.
       CHECK(extInVal_);
@@ -171,14 +171,16 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
 }
 
 void MKLDNNLayer::resetInValue(
-    MKLDNNMatrixPtr& in, const std::shared_ptr<memory::primitive_desc>& intPD) {
+    MKLDNNMatrixPtr& in,
+    const std::shared_ptr<memory::primitive_desc>& intPD,
+    size_t inputIdx) {
   cvtInVal_ = nullptr;
   extInVal_ = nullptr;
   in = nullptr;
   CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
   auto extPD = MKLDNNMatrix::createPrimitiveDesc(
       {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+  const MatrixPtr& inMat = inputLayers_[inputIdx]->getOutputValue();
   in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
   CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
   if (in == nullptr || in->getFormat() == format::nc) {
@@ -216,11 +218,12 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
 }
 
 void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
-                              memory::primitive_desc intPD) {
+                              memory::primitive_desc intPD,
+                              size_t inputIdx) {
   cvtInGrad_ = nullptr;
   extInGrad_ = nullptr;
   in = nullptr;
-  LayerPtr& input = inputLayers_[0];
+  LayerPtr& input = inputLayers_[inputIdx];
   if (input->getOutputGrad() == nullptr) {
     // no need input grad
     return;
@@ -245,7 +248,6 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
     return;
   }
   // need create reorder
-  // TODO(TJ): add macro definition to simplify it
   CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
       << "should have external input value and the format must be nchw(nc)";
   extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 2c21a5b2aaecb17a52a5de9a98664068f2255d83..7479c34c92b5231b2521493bc631474d4efd4224 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -199,7 +199,8 @@ protected:
    */
   void resetInValue(
       MKLDNNMatrixPtr& in,
-      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr);
+      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
+      size_t inputIdx = 0);
 
   /**
    * reset output value from internal primitive desc.
@@ -212,7 +213,9 @@ protected:
    * reset input grad from internal primitive desc.
    * reset both internal and external buffer and create reorder if necessary.
    */
-  void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD);
+  void resetInGrad(MKLDNNMatrixPtr& in,
+                   mkldnn::memory::primitive_desc intPD,
+                   size_t inputIdx = 0);
 
   /**
    * reset output grad from internal primitive desc.
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
index 19b7ad1869af98e6313fe85a40203fd1e84f31d6..00d8ce017aa0121217688a1afc1fe31b4c3619ec 100644
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
@@ -98,8 +98,19 @@ void SubSequenceLayer::forward(PassType passType) {
   CHECK_EQ(numSequences2, numSequences3);
 
   MatrixPtr inputValue = input.value;
-  IVectorPtr offsetValue = offsetSeq.ids;
-  IVectorPtr sizeValue = sizeSeq.ids;
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }
 
   CHECK_EQ(offsetValue->getSize(), numSequences1);
   CHECK_EQ(sizeValue->getSize(), numSequences1);
@@ -176,8 +187,21 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) {
   size_t numSequences1 = startPositions1->getSize() - 1;
   const int* starts1 = startPositions1->getData();
 
-  IVectorPtr offsetValue = getInput(1).ids;
-  IVectorPtr sizeValue = getInput(2).ids;
+  const Argument& offsetSeq = getInput(1);
+  const Argument& sizeSeq = getInput(2);
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }
 
   int* offsets = offsetValue->getData();
   int* sizes = sizeValue->getData();
diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
index 7670cb88fb67dec0ab1d170458d102da166dc7b6..afe1608eab8eaf1217a7a0c8a2774e37c5ea83f4 100644
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -132,7 +132,7 @@ void MKLDNNTester::checkForward() {
   VLOG(MKLDNN_TESTS) << "Check Forward";
   printTopDatas();
   double delta =
-      compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
+      compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue());
   EXPECT_LE(fabs(delta), eps_);
 }
 
@@ -147,7 +147,7 @@ void MKLDNNTester::checkBackwardData() {
     VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
     printMatrix(refDiff);
 
-    double delta = compareMatrix(dnnDiff, refDiff);
+    double delta = compareMatrix(refDiff, dnnDiff);
     EXPECT_LE(fabs(delta), eps_);
     if (isBN) {
       // the other two inputs in batch norm are for moving mean and var
@@ -177,7 +177,7 @@ void MKLDNNTester::checkBackwardWgts() {
                      << parameters_[REF][i]->getName();
     printVector(ref);
 
-    double delta = compareVector(dnn, ref);
+    double delta = compareVector(ref, dnn);
     EXPECT_LE(fabs(delta), eps_);
   }
 
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index d60b0f04a1613acc3711e711cfe18ced5f0f924d..2e8d9f3333b36005c9b3b28449c76a4a44c74cc6 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -271,20 +271,53 @@ TEST(MKLDNNLayer, BatchNormLayer) {
   testBatchNormLayer({16, 32, 16, 16});
 }
 
-struct testActDesc {
+struct testImageDesc {
   int bs, ic, ih, iw;
 };
 
-static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
+static void getAddtoConfig(TestConfig& cfg,
+                           const testImageDesc& pm,
+                           const size_t nInputs = 1) {
   cfg.biasSize = 0;
   cfg.layerConfig.set_type("addto");
   size_t layerSize = pm.ic * pm.ih * pm.iw;
   cfg.layerConfig.set_size(layerSize);
-  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
-  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < nInputs; ++i) {
+    std::stringstream ss;
+    ss << "layer_" << i;
+    cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(pm.ic);
+    img_conf->set_img_size_y(pm.ih);
+    img_conf->set_img_size(pm.iw);
+  }
+}
+
+void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
+  CHECK_GE(nInputs, 1);
+  TestConfig dnnConfig;
+  getAddtoConfig(dnnConfig, pm, nInputs);
+  dnnConfig.layerConfig.set_type("mkldnn_addto");
+  // TODO(TJ): test with bias
+  for (auto withBias : {false}) {
+    if (withBias) {
+      dnnConfig.biasSize = pm.ic * pm.ih * pm.iw;
+    } else {
+      dnnConfig.biasSize = 0;
+    }
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
+  }
+}
+
+TEST(MKLDNNLayer, AddtoLayer) {
+  testAddtoLayer({16, 5, 14, 14}, 1);
+  testAddtoLayer({8, 10, 8, 8}, 2);
+  testAddtoLayer({4, 12, 1, 1}, 3);
 }
 
-void testActivation(std::string actType, const testActDesc& pm) {
+void testActivation(std::string actType, const testImageDesc& pm) {
   // TODO(TJ): remove me when paddle support elu activation
   if (actType == "mkldnn_elu") {
     return;
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index f2736983c68b8695a37f8d17d260b572b8e399e7..c5ecccf9023d7fba7d0edec4c687c9515da034e6 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -150,8 +150,10 @@ set(DEPS_OPS
     conv_transpose_op
     nccl_op
     sequence_conv_op
+    sequence_pool_op
     lod_rank_table_op
-    lstm_op)
+    lstm_op
+    gru_op)
 
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
@@ -164,8 +166,10 @@ if(WITH_GPU)
 op_library(nccl_op DEPS nccl_common)
 endif()
 op_library(sequence_conv_op DEPS context_project)
+op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
 op_library(conv_transpose_op DEPS vol2col)
+op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
         DEPS net_op tensor_array)
 op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index 483f9888973edc9db6317723c136778d40cc7878..83d35a450d0e8ebf5311cdfd948b066642ccec8c 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -44,7 +44,7 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Sigmoid operator");
     AddOutput("Y", "Output of Sigmoid operator");
     AddComment(R"DOC(
-Sigmoid activation operator.
+Sigmoid Activation Operator.
 
 $y = 1 / (1 + e^{-x})$
 
@@ -60,7 +60,7 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of LogSigmoid operator");
     AddOutput("Y", "Output of LogSigmoid operator");
     AddComment(R"DOC(
-Logsigmoid activation operator.
+Logsigmoid Activation Operator.
 
 $y = \log(1 / (1 + e^{-x}))$
 
@@ -75,7 +75,7 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Exp operator");
     AddOutput("Y", "Output of Exp operator");
     AddComment(R"DOC(
-Exp activation operator.
+Exp Activation Operator.
 
 $y = e^x$
 
@@ -90,7 +90,7 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Relu operator");
     AddOutput("Y", "Output of Relu operator");
     AddComment(R"DOC(
-Relu activation operator.
+Relu Activation Operator.
 
 $y = \max(x, 0)$
 
@@ -109,7 +109,7 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("alpha", "The small negative slope")
         .SetDefault(static_cast<AttrType>(0.02f));
     AddComment(R"DOC(
-LeakyRelu activation operator.
+LeakyRelu Activation Operator.
 
 $y = \max(x, \alpha * x)$
 
@@ -128,7 +128,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("lambda", "non-negative offset")
         .SetDefault(static_cast<AttrType>(0.5f));
     AddComment(R"DOC(
-Softshrink activation operator.
+Softshrink Activation Operator.
 
 $$
 y = \begin{cases} 
@@ -149,7 +149,7 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Tanh operator");
     AddOutput("Y", "Output of Tanh operator");
     AddComment(R"DOC(
-Tanh activation operator.
+Tanh Activation Operator.
 
 $$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
@@ -165,7 +165,7 @@ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of TanhShrink operator");
     AddOutput("Y", "Output of TanhShrink operator");
     AddComment(R"DOC(
-TanhShrink activation operator.
+TanhShrink Activation Operator.
 
 $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
@@ -184,7 +184,7 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
         .SetDefault(static_cast<AttrType>(0.5));
     AddComment(R"DOC(
-HardShrink activation operator.
+HardShrink Activation Operator.
 
 $$
 y = \begin{cases} 
@@ -205,7 +205,7 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Sqrt operator");
     AddOutput("Y", "Output of Sqrt operator");
     AddComment(R"DOC(
-Sqrt activation operator.
+Sqrt Activation Operator.
 
 $y = \sqrt{x}$
 
@@ -220,7 +220,7 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Abs operator");
     AddOutput("Y", "Output of Abs operator");
     AddComment(R"DOC(
-Abs activation operator.
+Abs Activation Operator.
 
 $y = |x|$
 
@@ -236,7 +236,7 @@ class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Reciprocal operator");
     AddOutput("Y", "Output of Reciprocal operator");
     AddComment(R"DOC(
-Reciprocal activation operator.
+Reciprocal Activation Operator.
 
 $$y = \frac{1}{x}$$
 
@@ -251,7 +251,7 @@ class LogOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Log operator");
     AddOutput("Y", "Output of Log operator");
     AddComment(R"DOC(
-Log activation operator.
+Log Activation Operator.
 
 $y = \ln(x)$
 
@@ -268,7 +268,7 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Square operator");
     AddOutput("Y", "Output of Square operator");
     AddComment(R"DOC(
-Square activation operator.
+Square Activation Operator.
 
 $y = x^2$
 
@@ -284,7 +284,7 @@ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Softplus operator");
     AddOutput("Y", "Output of Softplus operator");
     AddComment(R"DOC(
-Softplus activation operator.
+Softplus Activation Operator.
 
 $y = \ln(1 + e^{x})$
 
@@ -300,7 +300,7 @@ class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Softsign operator");
     AddOutput("Y", "Output of Softsign operator");
     AddComment(R"DOC(
-Softsign activation operator.
+Softsign Activation Operator.
 
 $$y = \frac{x}{1 + |x|}$$
 
@@ -320,7 +320,7 @@ class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("t_max", "The max marginal value of BRelu")
         .SetDefault(static_cast<AttrType>(24));
     AddComment(R"DOC(
-BRelu activation operator.
+BRelu Activation Operator.
 
 $y = \max(\min(x, t_{min}), t_{max})$
 
@@ -339,7 +339,7 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("threshold", "The threshold value of SoftRelu")
         .SetDefault(static_cast<AttrType>(40));
     AddComment(R"DOC(
-SoftRelu activation operator.
+SoftRelu Activation Operator.
 
 $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$
 
@@ -357,7 +357,7 @@ class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("alpha", "The alpha value of ELU")
         .SetDefault(static_cast<AttrType>(1.0f));
     AddComment(R"DOC(
-ELU activation operator.
+ELU Activation Operator.
 
 Applies the following element-wise computation on the input according to
 https://arxiv.org/abs/1511.07289.
@@ -378,7 +378,7 @@ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("threshold", "The threshold value of Relu6")
         .SetDefault(static_cast<AttrType>(6));
     AddComment(R"DOC(
-Relu6 activation operator.
+Relu6 Activation Operator.
 
 $y = \min(\max(0, x), 6)$
 
@@ -396,7 +396,7 @@ class PowOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("factor", "The exponential factor of Pow")
         .SetDefault(static_cast<AttrType>(1));
     AddComment(R"DOC(
-Pow activation operator.
+Pow Activation Operator.
 
 $y = x^{factor}$
 
@@ -416,7 +416,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("scale_b", "The scale parameter of b for the input")
         .SetDefault(static_cast<AttrType>(1.7159));
     AddComment(R"DOC(
-STanh activation operator.
+STanh Activation Operator.
 
 $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
 
@@ -435,7 +435,7 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("threshold", "The threshold location of activation")
         .SetDefault(static_cast<AttrType>(1.0));
     AddComment(R"DOC(
-ThresholdedRelu activation operator.
+ThresholdedRelu Activation Operator.
 
 $$
 y = \begin{cases} 
@@ -461,7 +461,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
         .SetDefault(static_cast<AttrType>(0.5));
     AddComment(R"DOC(
-HardSigmoid activation operator.
+HardSigmoid Activation Operator.
 
 Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), 
 which is much faster than sigmoid.
diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc
index 24e419b532d97bc16ab96dad418d6e73c03f30a0..b717e1647e4b89285b841420650dc69e8a1e0c58 100644
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
@@ -64,16 +64,15 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("AvgSquaredGrad",
-             "(Tensor) Input expectation of squared gradient");
+    AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
     AddInput("AvgSquaredUpdate",
-             "(Tensor) Input expectation of squared parameter updates");
+             "(Tensor) Input average of squared parameter updates");
 
     AddOutput("ParamOut", "(Tensor) Output parameter");
     AddOutput("AvgSquaredGradOut",
-              "(Tensor) Output expectation of squared gradient");
+              "(Tensor) Output average of squared gradient");
     AddOutput("AvgSquaredUpdateOut",
-              "(Tensor) Output expectation of squared parameter updates");
+              "(Tensor) Output average of squared parameter updates");
 
     AddAttr<float>("rho",
                    "(float, default 0.95) Exponential decay rate "
@@ -84,22 +83,21 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
                    "numerical stability")
         .SetDefault(1.0e-6f);
     AddComment(R"DOC(
-Adadelta Updates Operator.
+Adadelta Optimizer.
 
-This implements the Adadelta optimizer[1]. Adadelta is a per-dimension
-adaptive learning rate method for gradient descent.
+Adadelta optimizer is implemented as explained in:
+https://arxiv.org/abs/1212.5701
+Adadelta is a per-dimension adaptive learning rate method used
+for gradient descent.
 
-Adadelta updates:
+Adadelta updates are as follows:
 
-avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * grad * grad
-param_update =  - sqrt((avg_squared_update + epsilon) /
-                       (avg_squared_grad_out + epsilon)) * grad
-avg_squared_update_out = rho * avg_squared_update + (1 - rho) * param_update**2
-param_out = param + param_update
-
-References:
-  [1] ADADELTA: An Adaptive Learning Rate Method
-      https://arxiv.org/abs/1212.5701
+$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break
+paramUpdate =  - $\sqrt{((avgSquaredUpdate + \epsilon) /
+                       (avgSquaredGrad_out + \epsilon))}$ * grad \break
+avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) *
+                                  {(paramUpdate)}^2 \break
+paramOut = param + paramUpdate$$
 
 )DOC");
   }
diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc
index bc081f87dcab0dcd8ef329dcb1f66b627c82b4a2..8d1a2b7938d2c6607cbeb3cecb72d1d5b83dd8b9 100644
--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
@@ -73,12 +73,16 @@ class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
 
 Adaptive Gradient Algorithm (Adagrad).
 
-moment_out = moment + grad * grad
-param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
+The update is done as follows:
+
+$$momentOut = moment + grad * grad \break
+paramOut = param - learningRate * grad / ($\sqrt{momentOut}$ + \epsilon) \break
+$$
 
 The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-does not have the epsilon attribute. It is added here for numerical stability 
-by avoiding division by zero.
+does not have the epsilon attribute. It is added here in our implementation
+as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
+for numerical stability to avoid the division by zero error.
 
 )DOC");
   }
diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc
index 3572de06bd60f7979e3bfbf39856b04942ce81c0..97a091ae766abfba5412bbd32c34a6f80701fbf7 100644
--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
@@ -51,8 +51,8 @@ class AdamOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
                       "Beta1 power accumulator should have 1 dimension");
     auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
-                      "Beta1 power accumulator should have 1 dimension");
+    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
+                      "Beta2 power accumulator should have 1 dimension");
 
     auto param_dims = ctx->GetInputDim("Param");
     PADDLE_ENFORCE_EQ(
@@ -60,10 +60,10 @@ class AdamOp : public framework::OperatorWithKernel {
         "Param and Grad input of AdamOp should have same dimension");
     PADDLE_ENFORCE_EQ(
         param_dims, ctx->GetInputDim("Moment1"),
-        "Param and Moment input of AdamOp should have same dimension");
+        "Param and Moment1 input of AdamOp should have same dimension");
     PADDLE_ENFORCE_EQ(
         param_dims, ctx->GetInputDim("Moment2"),
-        "Param and InfNorm input of AdamOp should have same dimension");
+        "Param and Moment2 input of AdamOp should have same dimension");
 
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("Moment1Out", param_dims);
@@ -103,23 +103,20 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(1.0e-8f);
 
     AddComment(R"DOC(
-Adam Updates Operator.
+Adam Optimizer.
 
 This implements the Adam optimizer from Section 2 of the Adam
-paper[1]. Adam is a first-order gradient-based optimization
-method based on adaptive estimates of lower-order moments.
+paper : https://arxiv.org/abs/1412.6980.
+Adam is a first-order gradient-based optimization method based on
+adaptive estimates of lower-order moments.
 
 Adam updates:
 
-moment1_out = beta1 * moment1 + (1 − beta1) * grad
-moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad
-learning_rate_t = learning_rate_t *
-                  sqrt(1 - beta2_pow) / (1 - beta1_pow)
-param_out = param - learning_rate_t * moment1/ (sqrt(moment2) + epsilon)
-
-References:
-  [1] Adam: A Method for Stochastic Optimization
-      (https://arxiv.org/abs/1412.6980)
+$$moment_1_{out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break
+moment_2_{out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break
+learningRate = learningRate *
+                  $\sqrt{(1 - \beta_2_{pow})}$ / (1 - \beta_1_{pow}) \break
+paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$
 
 )DOC");
   }
diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc
index ff2565774115571166712b03c8990e5bf8de12a5..14cf3841b33a8153549e4c99ed2b75286e9c64db 100644
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -99,26 +99,22 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
                    "Constant for numerical stability")
         .SetDefault(1.0e-8f);
     AddComment(R"DOC(
-Adamax Updates Operator.
+Adamax Optimizer.
 
-This implements the Adamax optimizer from Section 7 of the Adam
-paper[1]. Adamax is a variant of the
+We implement the Adamax optimizer from Section 7 of the Adam
+paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
 Adam algorithm based on the infinity norm.
 
 Adamax updates:
 
-moment_out = beta1 * moment + (1 - beta1) * grad
-inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
-learning_rate_t = learning_rate/(1 - beta1_pow)
-param_out = param - learning_rate_t * moment_out/inf_norm_out
+$$momentOut = \beta_1 * moment + (1 - \beta_1) * grad \break
+infNormOut = max(\beta_2 * infNorm + \epsilon, |grad|) \break
+learningRate = learningRate /(1 - \beta_1_{pow}) \break
+paramOut = param - learningRate * momentPut / infNormOut$$
 
 The original paper does not have an epsilon attribute.
-However, it is added here for numerical stability
-by preventing divide by 0.
-
-References:
-  [1] Adam: A Method for Stochastic Optimization
-      (https://arxiv.org/abs/1412.6980)
+However, it is added here for numerical stability to prevent the
+division by 0 error.
 
 )DOC");
   }
diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc
index f5784922af1cafcf4e6873c0e906bbd66ce8ae69..ccb969ab23a8e1df713278513a66b10f21690108 100644
--- a/paddle/operators/auc_op.cc
+++ b/paddle/operators/auc_op.cc
@@ -23,11 +23,11 @@ class AucOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Indices"),
-                   "Input of Indices must be initialized.");
+                   "Input of Indices should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Label"),
-                   "Input of Label must be initialized.");
+                   "Input of Label should not be null.");
     auto inference_height = ctx->GetInputDim("Out")[0];
     auto label_height = ctx->GetInputDim("Label")[0];
 
@@ -52,20 +52,20 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Out",
              "A floating point 2D tensor, values are in the range [0, 1]."
-             "Each row is descend sorted. This input should be the"
+             "Each row is sorted in descending order. This input should be the"
              "output of topk."
              "Typically, this tensor indicates the probability of each label");
     AddInput("Indices",
              "An int 2D tensor, indicating the indices of original"
-             "tensor before sort. Typically, this tensor indicates which label"
-             "the probability stands for.");
+             "tensor before sorting. Typically, this tensor indicates which "
+             "label the probability stands for.");
     AddInput("Label",
              "A 2D int tensor indicating the label of the training data."
              "The height is batch size and width is always 1.");
     // TODO(typhoonzero): support weight input
     AddOutput("AUC",
               "A scalar representing the "
-              "current area-under-curve.");
+              "current area-under-the-curve.");
 
     AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
         .SetDefault("ROC");
@@ -74,19 +74,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
                  " roc curve.")
         .SetDefault(200);
 
-    AddComment(
-        R"DOC(Computes the AUC according forward output and label.
-Best to use for binary classification evaluations.
+    AddComment(R"DOC(
+Area Under The Curve (AUC) Operator.
 
+This implementation computes the AUC according to forward output and label.
+It is used very widely in binary classification evaluation. As a note:
 If input label contains values other than 0 and 1, it will be cast
-to bool.
-
-You can find the definations here: 
+to bool. You can find the relevant definitions here:
 https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
 
-Possible curves are:
-- ROC: Receiver operating characteristic
-- PR: Precision Recall
+There are two types of possible curves:
+1. ROC: Receiver operating characteristic
+2. PR: Precision Recall
 )DOC");
   }
 };
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index 9c4bfd24c132cfe3f9170b1178b1b92788a4ac64..7d73dfde786208fe217dac97325d432fb80052ad 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -70,7 +70,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
                                              : x_dims[x_dims.size() - 1]);
 
     PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "Input x must have 3 to 5 dimensions.");
+                   "Input X must have 3 to 5 dimensions.");
 
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
@@ -97,16 +97,16 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "The input tensor");
     AddInput("Scale",
              "Scale is a 1-dimensional tensor of size C "
-             "to be applied to the output");
+             "that is applied to the output");
     AddInput("Bias",
              "Bias is a 1-dimensional tensor of size C "
-             "to be applied to the output");
+             "that is applied to the output");
     AddInput("Mean",
-             "The global mean (for training) or the "
+             "The global mean (for training) or "
              "estimated mean (for testing)");
     AddInput("Variance",
              "The global variance (for training) "
-             "or the estimated Variance (for testing)");
+             "or estimated Variance (for testing)");
     AddOutput("Y", "result after normalization");
     AddOutput("MeanOut",
               "Share memory with Mean. "
@@ -123,10 +123,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
               "will apply to output when training")
         .AsIntermediate();
     AddComment(R"DOC(
-https://arxiv.org/pdf/1502.03167.pdf
+Batch Normalization.
 
-NHWC `[batch, in_height, in_width, in_channels]`
-NCHW `[batch, in_channels, in_height, in_width]`
+Batch Norm has been implemented as discussed in the paper:
+https://arxiv.org/pdf/1502.03167.pdf
+Can be used as a normalizer function for conv2d and fully_connected operations.
+The required data format for this layer is one of the following:
+1. NHWC `[batch, in_height, in_width, in_channels]`
+2. NCHW `[batch, in_channels, in_height, in_width]`
 
 )DOC");
   }
diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc
index 19187894c3f4803ef241d5e0c159852c0d9687da..70ee7861bab3a982eae60dd85b10c2e41f5827d0 100644
--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
@@ -23,13 +23,17 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
   CastOpProtoMaker(framework::OpProto *proto,
                    framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "the input tensor of cast op");
-    AddOutput("Out", "the output tensor of cast op");
-    AddComment(R"DOC(Cast operator.
-cast the input tensor to other data type.
-)DOC");
+    AddInput("X", "The input tensor of cast op");
+    AddOutput("Out", "The output tensor of cast op");
     AddAttr<int>("out_data_type", "output data type");
     AddAttr<int>("in_data_type", "input data type");
+    AddComment(R"DOC(
+Cast Operator.
+
+This Operator casts the input tensor to another data type and
+returns tha Output Tensor.
+
+)DOC");
   }
 };
 
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc
index f80204c6833d6436f2cf21610beea45b36787eea..3e9066ceb2a4a4dc19fdf5ef02bb7fadaab4bfff 100644
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -49,8 +49,11 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>(
         "max", "(float)Maximum value, above which element is replaced by max");
     AddComment(R"DOC(
-Clip operator limits the given input within an interval. The interval is
+Clip Operator.
+
+The clip operator limits the value of given input within an interval. The interval is
 specified with arguments 'min' and 'max'.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 24df1fcadac75315890635f4d3aaa7146c1cc27b..9d41879b27a24f83090f5abf1325eca5f9488d00 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -114,21 +114,17 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
              "where N is the batch size and D is the number of classes. "
              "This input is a probability computed by the previous operator, "
              "which is almost always the result of a softmax operator.");
-    AddInput(
-        "Label",
-        "(Tensor, default Tensor<int>), the ground truth which is "
-        "a 2-D tensor. "
-        "When soft_label is set to false, Label is a Tensor<int> with shape "
-        "[N x 1]. "
-        "When soft_label is set to true, Label is a Tensor<float/double> "
-        "with shape [N x K].");
+    AddInput("Label",
+             "(Tensor), the ground truth which is a 2-D tensor. When "
+             "soft_label is set to false, Label is a Tensor<int64> with shape "
+             "[N x 1]. When soft_label is set to true, Label is a "
+             "Tensor<float/double> with shape [N x K].");
     AddOutput("Y",
-              "(Tensor, default Tensor<float>), a 2-D tensor "
-              "with shape [N x 1]. The cross entropy loss.");
-    AddAttr<bool>(
-        "soft_label",
-        "(bool, default false), a flag to indicate whether to interpretate "
-        "the given labels as soft labels.")
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
+              "[N x 1]. The cross entropy loss.");
+    AddAttr<bool>("soft_label",
+                  "(bool, default false), a flag indicating whether to "
+                  "interpretate the given labels as soft labels.")
         .SetDefault(false);
     AddComment(R"DOC(
 CrossEntropy Operator.
diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5aa03f8916a67222fb0ca5781533766063e52683
--- /dev/null
+++ b/paddle/operators/gru_op.cc
@@ -0,0 +1,220 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/gru_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class GRUOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
+                   "Output(%s) of GRUOp should not be null.", "BatchGate");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
+                   "Output(%s) of GRUOp should not be null.",
+                   "BatchResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
+                   "Output(%s) of GRUOp should not be null.", "BatchHidden");
+    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
+                   "Output(%s) of GRUOp should not be null.", "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int input_size = input_dims[1];
+    int frame_size = weight_dims[0];
+    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
+                      "The input_size must be 3 times of frame_size in GRUOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_dims[1], frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("H0")) {
+      auto h0_dims = ctx->GetInputDim("H0");
+      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
+                        "The width of H0 must be equal to frame_size.");
+    }
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+    }
+    ctx->SetOutputDim("BatchGate", input_dims);
+    ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size});
+    ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size});
+    ctx->SetOutputDim("Hidden", {input_dims[0], frame_size});
+    ctx->ShareLoD("Input", "Hidden");
+  }
+};
+
+class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GRUOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LoDTensor) The first input is a LodTensor, which supports "
+             "variable-time length input sequence. The underlying tensor in "
+             "this LoDTenosr is a matrix with shape (T X 3D), where, T is the "
+             "total time steps in this mini-batch, D is the hidden size.");
+    AddInput("H0",
+             "(Tensor, optional) The initial hidden state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size, D is the hidden size.")
+        .AsDispensable();
+    AddInput(
+        "Weight",
+        "(Tensor) The learnable hidden-hidden weight matrix with shape "
+        "(D x 3D), where D is the hidden size. The elements continuous in "
+        "memory can be divided into two parts. The first part are weights of "
+        "the update gate and reset gate with shape (D x 2D), and the second "
+        "part are weights of output candidate with shape (D x D).");
+    AddInput("Bias",
+             "(Tensor, optional) Bias vector with shape (1 x 3D) concating "
+             "bias of the update gate, reset gate and output candidate.")
+        .AsDispensable();
+    AddOutput("BatchGate",
+              "(LoDTensor) To compute with batches, sequence data will be "
+              "reorganized into several successive batches each containing "
+              "data from the same time step. The LoDTensor BatchGate contains "
+              "the update gate, reset gate and output candidate values "
+              "organized in batches. The LoD size is 2. The first LoD contains "
+              "the batch offsets and the second LoD contains the indexes in "
+              "the raw sequence data.")
+        .AsIntermediate();
+    AddOutput(
+        "BatchResetHiddenPrev",
+        "(LoDTensor) The reseted hidden state LoDTensor organized in batches. "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.")
+        .AsIntermediate();
+    AddOutput(
+        "BatchHidden",
+        "(LoDTensor) The hidden state LoDTensor organized in batches.  "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.")
+        .AsIntermediate();
+    AddOutput(
+        "Hidden",
+        "(LoDTensor) the hidden state LoDTensor organized in sequences. "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.");
+    AddAttr<std::string>("activation",
+                         "(string, default tanh) "
+                         "The activation type used for output candidate {h}_t.")
+        .SetDefault("tanh");
+    AddAttr<std::string>(
+        "gate_activation",
+        "(string, default sigmoid) "
+        "The activation type used in update gate and reset gate.")
+        .SetDefault("sigmoid");
+    AddAttr<bool>("is_reverse",
+                  "(bool, defalut: False) "
+                  "whether to compute reversed GRU.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+GRU Operator implements part calculations of the complete GRU as following:
+
+\f[
+update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
+\f]
+
+@note To implement the complete GRU, fully-connected operator must be used  
+before to feed xu, xr and xc as the Input of GRU operator.
+)DOC");
+  }
+};
+
+class GRUGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUGradOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUGradOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
+                   "Input(%s) of GRUGradOp should not be null.", "BatchGate");
+    PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"),
+                   "Input(%s) of GRUGradOp should not be null.",
+                   "BatchResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasInput("BatchHidden"),
+                   "Input(%s) of GRUOp should not be null.", "BatchHidden");
+    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
+                   "Input(%s) of GRUGradOp should not be null.", "Hidden");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
+                   "Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int input_size = input_dims[1];
+    int frame_size = weight_dims[0];
+    int weight_height = weight_dims[0];
+    int weight_width = weight_dims[1];
+    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
+                      "The input_size must be 3 times of frame_size in GRUOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_height, frame_size,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    PADDLE_ENFORCE_EQ(
+        weight_width, frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("H0")) {
+      auto h0_dims = ctx->GetInputDim("H0");
+      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
+                        "The width of H0 must be equal to frame_size.");
+      auto h0_grad_name = framework::GradVarName("H0");
+      if (ctx->HasOutput(h0_grad_name))
+        ctx->SetOutputDim(h0_grad_name, h0_dims);
+    }
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name))
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+    auto input_grad_name = framework::GradVarName("Input");
+    if (ctx->HasOutput(input_grad_name))
+      ctx->SetOutputDim(input_grad_name, input_dims);
+    auto weight_grad_name = framework::GradVarName("Weight");
+    if (ctx->HasOutput(weight_grad_name))
+      ctx->SetOutputDim(weight_grad_name, weight_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
+REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel<paddle::platform::CPUPlace, float>,
+                       ops::GRUKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(gru_grad,
+                       ops::GRUGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::GRUGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/gru_op.cu b/paddle/operators/gru_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..35538c74b4bf678f8068999bfadb2589a1671be0
--- /dev/null
+++ b/paddle/operators/gru_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/gru_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel<paddle::platform::GPUPlace, float>,
+                       ops::GRUKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(gru_grad,
+                       ops::GRUGradKernel<paddle::platform::GPUPlace, float>,
+                       ops::GRUGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba90ec9816c40a6a49065ac6efcee6b93dffce90
--- /dev/null
+++ b/paddle/operators/gru_op.h
@@ -0,0 +1,231 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence2batch.h"
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class GRUKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    auto* input = context.Input<LoDTensor>("Input");
+    auto* h0 = context.Input<Tensor>("H0");
+    const T* h0_data = h0 ? h0->data<T>() : nullptr;
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* bias = context.Input<Tensor>("Bias");
+    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(context.GetPlace());
+    auto* batch_reset_hidden_prev =
+        context.Output<LoDTensor>("BatchResetHiddenPrev");
+    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
+    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
+    batch_hidden->mutable_data<T>(context.GetPlace());
+    auto* hidden = context.Output<LoDTensor>("Hidden");
+    hidden->mutable_data<T>(context.GetPlace());
+
+    context.ShareLoD("Input", "Hidden");
+
+    auto hidden_dims = hidden->dims();
+
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    math::LoDTensor2BatchFunctor<Place, T> to_batch;
+    to_batch(context.device_context(), *input, *batch_gate, true, is_reverse);
+
+    int frame_size = hidden_dims[1];
+    int batch_size = hidden_dims[0];
+    auto g = EigenMatrix<T>::From(*batch_gate);
+    auto place = context.GetEigenDevice<Place>();
+    if (bias) {
+      auto b = EigenMatrix<T>::From(*bias);
+      g.device(place) = g +
+                        b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
+                            .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
+    }
+
+    math::hl_gru_value<T> gru_value;
+    gru_value.gateWeight = const_cast<T*>(weight_data);
+    gru_value.stateWeight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+    gru_value.prevOutValue = const_cast<T*>(h0_data);
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    for (size_t n = 0; n < num_batch; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+      int cur_batch_size = bend - bstart;
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
+      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+      gru_value.outputValue = hidden_t.data<T>();
+      gru_value.gateValue = gate_t.data<T>();
+      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
+      math::GRUUnitFunctor<Place, T>::compute(
+          context.device_context(), gru_value, frame_size, cur_batch_size,
+          math::ActiveType(context.Attr<std::string>("activation")),
+          math::ActiveType(context.Attr<std::string>("gate_activation")));
+      gru_value.prevOutValue = gru_value.outputValue;
+    }
+
+    math::Batch2LoDTensorFunctor<Place, T> to_seq;
+    batch_hidden->set_lod(batch_gate->lod());
+    to_seq(context.device_context(), *batch_hidden, *hidden);
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
+
+template <typename Place, typename T>
+class GRUGradKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    auto* h0 = context.Input<Tensor>("H0");
+    const T* h0_data = h0 ? h0->data<T>() : nullptr;
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* batch_gate = context.Input<LoDTensor>("BatchGate");
+    auto* batch_reset_hidden_prev =
+        context.Input<LoDTensor>("BatchResetHiddenPrev");
+    auto* batch_hidden = context.Input<LoDTensor>("BatchHidden");
+    auto* hidden = context.Input<LoDTensor>("Hidden");
+    auto* hidden_grad =
+        context.Input<LoDTensor>(framework::GradVarName("Hidden"));
+    auto* input_grad =
+        context.Output<LoDTensor>(framework::GradVarName("Input"));
+    auto* h0_grad = context.Output<Tensor>(framework::GradVarName("H0"));
+    auto* weight_grad =
+        context.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto gate_dims = batch_gate->dims();
+    auto hidden_dims = hidden->dims();
+    int frame_size = hidden_dims[1];
+
+    math::LoDTensor2BatchFunctor<Place, T> to_batch;
+    LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad;
+    batch_hidden_grad.mutable_data<T>(hidden_dims, context.GetPlace());
+    batch_gate_grad.mutable_data<T>(gate_dims, context.GetPlace());
+    batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims,
+                                                 context.GetPlace());
+    math::SetConstant<Place, T> zero;
+    zero(context.device_context(), &batch_hidden_grad, static_cast<T>(0.0));
+    zero(context.device_context(), &batch_gate_grad, static_cast<T>(0.0));
+    zero(context.device_context(), &batch_reset_hidden_prev_grad,
+         static_cast<T>(0.0));
+
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    batch_hidden_grad.set_lod(batch_hidden->lod());
+    to_batch(context.device_context(), *hidden_grad, batch_hidden_grad, false,
+             is_reverse);
+
+    math::hl_gru_value<T> gru_value;
+    gru_value.gateWeight = const_cast<T*>(weight_data);
+    gru_value.stateWeight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+
+    math::hl_gru_grad<T> gru_grad;
+    if (weight_grad) {
+      gru_grad.gateWeightGrad =
+          weight_grad->mutable_data<T>(context.GetPlace());
+      zero(context.device_context(), weight_grad, static_cast<T>(0.0));
+      gru_grad.stateWeightGrad =
+          weight_grad->data<T>() + 2 * frame_size * frame_size;
+    } else {
+      gru_grad.gateWeightGrad = nullptr;
+      gru_grad.stateWeightGrad = nullptr;
+    }
+
+    auto batch_starts = batch_hidden_grad.lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+      int cur_batch_size = bend - bstart;
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      gru_value.gateValue = gate_t.data<T>();
+      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
+      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
+
+      Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
+      gru_grad.outputGrad = hidden_grad_t.data<T>();
+      Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
+      gru_grad.gateGrad = gate_grad_t.data<T>();
+      Tensor reset_hidden_prev_grad_t =
+          batch_reset_hidden_prev_grad.Slice(bstart, bend);
+      gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
+      if (n == 0) {
+        gru_value.prevOutValue = const_cast<T*>(h0_data);
+        if (h0_grad) {
+          T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace());
+          zero(context.device_context(), h0_grad, static_cast<T>(0.0));
+          gru_grad.prevOutGrad = h0_grad_data;
+        } else {
+          gru_grad.prevOutGrad = nullptr;
+        }
+      } else {
+        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
+        Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
+        gru_value.prevOutValue = hidden_prev_t.data<T>();
+        Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
+        gru_grad.prevOutGrad = hidden_prev_grad_t.data<T>();
+      }
+
+      math::GRUUnitGradFunctor<Place, T>::compute(
+          context.device_context(), gru_value, gru_grad, frame_size,
+          cur_batch_size,
+          math::ActiveType(context.Attr<std::string>("activation")),
+          math::ActiveType(context.Attr<std::string>("gate_activation")));
+    }
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      math::Batch2LoDTensorFunctor<Place, T> to_seq;
+      batch_gate_grad.set_lod(batch_gate->lod());
+      to_seq(context.device_context(), batch_gate_grad, *input_grad);
+    }
+    if (bias_grad) {
+      bias_grad->mutable_data<T>(context.GetPlace());
+      auto d_b = EigenMatrix<T>::From(*bias_grad);
+      auto d_g = EigenMatrix<T>::From(batch_gate_grad);
+      auto place = context.GetEigenDevice<Place>();
+      d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
index 6864e3b0b7ace69f7e4b5f7e129b476d28591a34..bcb48e13bd948b4e91ce8cbd7231a9619fac8d18 100644
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -23,21 +23,21 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
                         framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Emission",
-             "(LoDTensor, default: LoDTensor<float>). "
-             "A 2-D LoDTensor with shape [N x D] where N is the size of the "
+             "(LoDTensor, default LoDTensor<float>) "
+             "A 2-D LoDTensor with shape [N x D], where N is the size of the "
              "mini-batch and D is the total tag number. The unscaled emission "
              "weight matrix for the linear chain CRF. ");
     AddInput("Transition",
-             "(Tensor, default: Tensor<float>). A 2-D Tensor with shape "
+             "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
              "[(D + 2) x D]. The learnable parameter for the linear_chain_crf "
              "operator. See more details in the operator's comments.");
     AddInput("Label",
-             "(LoDTensor, default: LoDTensor<int>). A LoDTensor with shape "
+             "(LoDTensor, default LoDTensor<int>) A LoDTensor with shape "
              "[N x 1], where N is the total element number in a mini-batch. "
              "The ground truth.");
     AddOutput(
         "Alpha",
-        "(Tensor, default: Tensor<float>). A 2-D Tensor with shape [N x D]. "
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
         "The forward vectors for the entire batch. Denote it as \f$\alpha\f$. "
         "\f$\alpha$\f is a memo table used to calculate the normalization "
         "factor in CRF. \f$\alpha[k, v]$\f stores the unnormalized "
@@ -49,26 +49,28 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsIntermediate();
     AddOutput(
         "EmissionExps",
-        "(Tensor, default: Tensor<float>). A 2-D Tensor with shape [N x D]. "
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
         "The exponentials of Input(Emission). This is an intermediate "
         "computational result in forward computation, and will be reused in "
         "backward computation.")
         .AsIntermediate();
     AddOutput(
         "TransitionExps",
-        "(Tensor, default: Tensor<float>). A 2-D Tensor with shape "
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
         "[(D + 2) x D]. The exponentials of Input(Transition). This is an "
         "intermediate computational result in forward computation, and "
         "will be reused in backward computation.")
         .AsIntermediate();
     AddOutput(
         "LogLikelihood",
-        "(Tensor, default: Tensor<float>). The logarithm of the conditional "
+        "(Tensor, default Tensor<float>) The logarithm of the conditional "
         "likelihood of each training sample in a mini-batch. This is a 2-D "
         "tensor with shape [S x 1], where S is the sequence number in a "
         "mini-batch. Note: S is equal to the sequence number in a mini-batch. "
         "The output is no longer a LoDTensor.");
     AddComment(R"DOC(
+LinearChainCRF Operator.
+
 Conditional Random Field defines an undirected probabilistic graph with nodes
 denoting random variables and edges denoting dependencies between these
 variables. CRF learns the conditional probability \f$P(Y|X)\f$, where
@@ -82,29 +84,28 @@ and output must be linear sequences. Thus, the graph of such a CRF is a simple
 chain or a line, which results in the linear chain CRF.
 
 This operator implements the Forward-Backward algorithm for the linear chain
-CRF. Please see http://www.cs.columbia.edu/~mcollins/fb.pdf and
-http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for reference.
+CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
+http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
 
 Equation:
-
-- Denote Input(Emission) to this operator as \f$x\f$ here.
-- The first D values of Input(Transition) to this operator are for starting
+1. Denote Input(Emission) to this operator as \f$x\f$ here.
+2. The first D values of Input(Transition) to this operator are for starting
 weights, denoted as \f$a\f$ here.
-- The next D values of Input(Transition) of this operator are for ending
+3. The next D values of Input(Transition) of this operator are for ending
 weights, denoted as \f$b\f$ here.
-- The remaning values of Input(Transition) are for transition weights,
+4. The remaning values of Input(Transition) are for transition weights,
 denoted as \f$w\f$ here.
-- Denote Input(Label) as \f$s\f$ here.
+5. Denote Input(Label) as \f$s\f$ here.
 
 The probability of a sequence \f$s\f$ of length \f$L\f$ is defined as:
-\f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L}
+\f$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
                  + \sum_{l=1}^L x_{s_l}
                  + \sum_{l=2}^L w_{s_{l-1},s_l})\f$
 where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over
 all possible sequences is \f$1\f$, and \f$x\f$ is the emission feature weight
 to the linear chain CRF.
 
-Finaly, the linear chain CRF operator outputs the logarithm of the conditional
+Finally, the linear chain CRF operator outputs the logarithm of the conditional
 likelihood of each training sample in a mini-batch.
 
 NOTE:
diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc
index 638a99addc2119e8f44648cc54b97bd8a892d2bc..d7e8a0ea7632650203106b01531d724cf0b8e085 100644
--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/operators/margin_rank_loss_op.cc
@@ -55,8 +55,6 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
              "(2-D tensor with shape [batch_size x 1]) "
              "The label indicating X1 ranked higher than X2 or not, "
              "can only be +1 or -1.");
-    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
-        .SetDefault(static_cast<T>(0));
     AddOutput("Activated",
               "(2-D tensor with shape [batch_size x 1]) Intermediate tensor "
               "to indicate whether each element of Output(Out) is activated.")
@@ -64,23 +62,26 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out",
               "(2-D tensor with shape [batch_size x 1]) "
               "The output loss of MarginRankLoss operator.");
+    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
+        .SetDefault(static_cast<T>(0));
     AddComment(R"DOC(
+MarginRankLoss Operator.
 
-MarginRankLoss operator measures the loss given a pair of training sample
+This operator measures the loss given a pair of training sample
 {`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` 
-indicating X1 is ranked higher than `X2`, otherwise `Label = -1`. The loss 
-turns out
+indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss 
+is calculated as:
 
-loss(X1, X2, Label) = max(0, -Label * (X1 - X2) + margin).
+$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$
 
-The attribute `margin` involved here helps make the predictions more robust.
+The attribute `margin` here helps make the predictions more robust.
 Denote the item ranked higher as the positive sample, otherwise the negative 
 sample. If the score of the two samples satisfies 
 
-positive sample - negative sample < margin,
+$positive sample - negative sample < margin$
 
-the pair of samples will contribute to the final loss, which will backpropogate 
-and train the ranking model to enlarge the difference of the two score.
+the pair of samples will contribute to the final loss, which will backpropagate 
+and train the ranking model to enlarge the difference between the two scores.
 
 For batch input with size `batch_size`, `X1`, `X2` and `Label`
 all have the same shape [batch_size x 1].
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index 40cc177d0f19c2359626ef972e787a0b1c5580f8..90bc9f4f922e7aa09523bad8ffb3ef477dd89857 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -8,20 +8,24 @@ if(WITH_GPU)
     nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
     nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
     nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
+    nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
     nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
     nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context)
     nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
+    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
     cc_library(softmax SRCS softmax.cc DEPS operator)
     cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
     cc_library(pooling SRCS pooling.cc DEPS device_context)
+    cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
     cc_library(vol2col SRCS vol2col.cc DEPS device_context)
     cc_library(context_project SRCS context_project.cc DEPS device_context)
     cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
     cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
+    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
 endif()
 
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/operators/math/detail/gru_cpu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..51af140cf4d5e6581765bea00033fa53d383230d
--- /dev/null
+++ b/paddle/operators/math/detail/gru_cpu_kernel.h
@@ -0,0 +1,424 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/gru_compute.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+#ifndef __NVCC__
+
+template <class OpResetOutput, typename T>
+void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
+                                       T *gateValue, T *resetOutputValue,
+                                       T *prevOutputValue, int frameSize,
+                                       activation_mode_t active_gate) {
+  T rValueUpdateGate;
+  T rValueResetGate;
+  T rValueResetOutput;
+  T rPrevOut = 0;
+  T *updateGate = gateValue;
+  T *resetGate = gateValue + frameSize;
+
+  for (int i = 0; i < frameSize; i++) {
+    rValueUpdateGate = updateGate[i];
+    rValueResetGate = resetGate[i];
+    if (prevOutputValue) {
+      rPrevOut = prevOutputValue[i];
+    }
+
+    opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
+                  rValueResetOutput, active_gate);
+
+    updateGate[i] = rValueUpdateGate;
+    resetGate[i] = rValueResetGate;
+    resetOutputValue[i] = rValueResetOutput;
+  }
+}
+
+template <class OpFinalOutput, typename T>
+void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput,
+                                       T *gateValue, T *prevOutputValue,
+                                       T *outputValue, int frameSize,
+                                       activation_mode_t active_node) {
+  T rValueUpdateGate;
+  T rValueFrameState;
+  T rPrevOut = 0;
+  T rOutput;
+  T *updateGate = gateValue;
+  T *frameState = gateValue + frameSize * 2;
+
+  for (int i = 0; i < frameSize; i++) {
+    rValueUpdateGate = updateGate[i];
+    rValueFrameState = frameState[i];
+    if (prevOutputValue) {
+      rPrevOut = prevOutputValue[i];
+    }
+
+    opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
+                  active_node);
+
+    frameState[i] = rValueFrameState;
+    outputValue[i] = rOutput;
+  }
+}
+
+template <class OpResetOutput, typename T>
+void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue,
+                                     T *resetOutputValue, T *prevOutputValue,
+                                     int frameSize,
+                                     activation_mode_t active_gate) {
+#ifdef __AVX__
+  __m256 rValueUpdateGate;
+  __m256 rValueResetGate;
+  __m256 rValueResetOutput;
+  __m256 rPrevOut = _mm256_set1_ps(0.0f);
+  __m256 *updateGate = (__m256 *)gateValue;
+  __m256 *resetGate = (__m256 *)(gateValue + frameSize);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rValueUpdateGate = updateGate[i];
+    rValueResetGate = resetGate[i];
+    if (prevOutputValue) {
+      rPrevOut = ((__m256 *)prevOutputValue)[i];
+    }
+
+    opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
+                  rValueResetOutput, active_gate);
+
+    updateGate[i] = rValueUpdateGate;
+    resetGate[i] = rValueResetGate;
+    ((__m256 *)resetOutputValue)[i] = rValueResetOutput;
+  }
+#endif
+}
+
+template <class OpFinalOutput, typename T>
+void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue,
+                                     T *prevOutputValue, T *outputValue,
+                                     int frameSize,
+                                     activation_mode_t active_node) {
+#ifdef __AVX__
+  __m256 rValueUpdateGate;
+  __m256 rValueFrameState;
+  __m256 rPrevOut = _mm256_set1_ps(0.0f);
+  __m256 rOutput;
+  __m256 *updateGate = (__m256 *)gateValue;
+  __m256 *frameState = (__m256 *)(gateValue + frameSize * 2);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rValueUpdateGate = updateGate[i];
+    rValueFrameState = frameState[i];
+    if (prevOutputValue) {
+      rPrevOut = ((__m256 *)prevOutputValue)[i];
+    }
+
+    opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
+                  active_node);
+
+    frameState[i] = rValueFrameState;
+    ((__m256 *)outputValue)[i] = rOutput;
+  }
+#endif
+}
+
+template <class OpResetOutput, typename T>
+inline void forward_reset_output(OpResetOutput opResetOutput,
+                                 hl_gru_value<T> value, int frameSize,
+                                 int batchSize, activation_mode_t active_gate) {
+  for (int b = 0; b < batchSize; b++) {
+    if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_forward_reset_output(
+          opResetOutput, value.gateValue, value.resetOutputValue,
+          value.prevOutValue, frameSize, active_gate);
+    } else {
+      hl_naive_gru_forward_reset_output(
+          opResetOutput, value.gateValue, value.resetOutputValue,
+          value.prevOutValue, frameSize, active_gate);
+    }
+
+    value.gateValue += frameSize * 3;
+    value.resetOutputValue += frameSize;
+    if (value.prevOutValue) {
+      value.prevOutValue += frameSize;
+    }
+  }
+}
+
+template <class OpFinalOutput, typename T>
+inline void forward_final_output(OpFinalOutput opFinalOutput,
+                                 hl_gru_value<T> value, int frameSize,
+                                 int batchSize, activation_mode_t active_node) {
+  for (int b = 0; b < batchSize; b++) {
+    if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue,
+                                      value.prevOutValue, value.outputValue,
+                                      frameSize, active_node);
+    } else {
+      hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue,
+                                        value.prevOutValue, value.outputValue,
+                                        frameSize, active_node);
+    }
+
+    value.gateValue += frameSize * 3;
+    value.outputValue += frameSize;
+    if (value.prevOutValue) {
+      value.prevOutValue += frameSize;
+    }
+  }
+}
+
+template <class OpStateGrad, typename T>
+void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
+                                      T *gateGrad, T *prevOutValue,
+                                      T *prevOutGrad, T *outputGrad,
+                                      int frameSize,
+                                      activation_mode_t active_node) {
+  T rUpdateGateValue;
+  T rUpdateGateGrad;
+  T rFrameStateValue;
+  T rFrameStateGrad;
+  T rOutGrad;
+  T rPrevOutValue = 0;
+  T rPrevOutGrad = 0;
+  T *updateGateValue = gateValue;
+  T *updateGateGrad = gateGrad;
+  T *frameStateValue = gateValue + frameSize * 2;
+  T *frameStateGrad = gateGrad + frameSize * 2;
+
+  for (int i = 0; i < frameSize; i++) {
+    rUpdateGateValue = updateGateValue[i];
+    rFrameStateValue = frameStateValue[i];
+    rOutGrad = outputGrad[i];
+    if (prevOutValue) {
+      rPrevOutValue = prevOutValue[i];
+    }
+    if (prevOutGrad) {
+      rPrevOutGrad = prevOutGrad[i];
+    }
+
+    opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
+                rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
+                active_node);
+
+    updateGateGrad[i] = rUpdateGateGrad;
+    frameStateGrad[i] = rFrameStateGrad;
+    if (prevOutGrad) {
+      prevOutGrad[i] = rPrevOutGrad;
+    }
+  }
+}
+
+template <class OpResetGrad, typename T>
+void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
+                                      T *gateGrad, T *prevOutValue,
+                                      T *prevOutGrad, T *resetOutputGrad,
+                                      int frameSize,
+                                      activation_mode_t active_gate) {
+  T rUpdateGateValue;
+  T rUpdateGateGrad;
+  T rResetGateValue;
+  T rResetGateGrad;
+  T rResetOutputGrad = 0;
+  T rPrevOutValue = 0;
+  T rPrevOutGrad = 0;
+  T *updateGateValue = gateValue;
+  T *updateGateGrad = gateGrad;
+  T *resetGateValue = gateValue + frameSize;
+  T *resetGateGrad = gateGrad + frameSize;
+
+  for (int i = 0; i < frameSize; i++) {
+    rUpdateGateValue = updateGateValue[i];
+    rUpdateGateGrad = updateGateGrad[i];
+    rResetGateValue = resetGateValue[i];
+
+    if (prevOutValue && prevOutGrad) {
+      rResetOutputGrad = resetOutputGrad[i];
+    }
+    if (prevOutValue) {
+      rPrevOutValue = prevOutValue[i];
+    }
+    if (prevOutGrad) {
+      rPrevOutGrad = prevOutGrad[i];
+    }
+
+    opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
+                rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
+                active_gate);
+
+    updateGateGrad[i] = rUpdateGateGrad;
+    resetGateGrad[i] = rResetGateGrad;
+    if (prevOutGrad) {
+      prevOutGrad[i] = rPrevOutGrad;
+    }
+  }
+}
+
+template <class OpStateGrad, typename T>
+void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
+                                    T *gateGrad, T *prevOutValue,
+                                    T *prevOutGrad, T *outputGrad,
+                                    int frameSize,
+                                    activation_mode_t active_node) {
+#ifdef __AVX__
+  __m256 rUpdateGateValue;
+  __m256 rUpdateGateGrad;
+  __m256 rFrameStateValue;
+  __m256 rFrameStateGrad;
+  __m256 rOutGrad;
+  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
+  __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
+  __m256 *updateGateValue = (__m256 *)gateValue;
+  __m256 *updateGateGrad = (__m256 *)gateGrad;
+  __m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2);
+  __m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rUpdateGateValue = updateGateValue[i];
+    rFrameStateValue = frameStateValue[i];
+    rOutGrad = ((__m256 *)outputGrad)[i];
+    if (prevOutValue) {
+      rPrevOutValue = ((__m256 *)prevOutValue)[i];
+    }
+    if (prevOutGrad) {
+      rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+    }
+
+    opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
+                rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
+                active_node);
+
+    updateGateGrad[i] = rUpdateGateGrad;
+    frameStateGrad[i] = rFrameStateGrad;
+    if (prevOutGrad) {
+      ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+    }
+  }
+#endif
+}
+
+template <class OpResetGrad, typename T>
+void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
+                                    T *gateGrad, T *prevOutValue,
+                                    T *prevOutGrad, T *resetOutputGrad,
+                                    int frameSize,
+                                    activation_mode_t active_gate) {
+#ifdef __AVX__
+  __m256 rUpdateGateValue;
+  __m256 rUpdateGateGrad;
+  __m256 rResetGateValue;
+  __m256 rResetGateGrad;
+  __m256 rResetOutputGrad = _mm256_set1_ps(0.0f);
+  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
+  __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
+  __m256 *updateGateValue = (__m256 *)gateValue;
+  __m256 *updateGateGrad = (__m256 *)gateGrad;
+  __m256 *resetGateValue = (__m256 *)(gateValue + frameSize);
+  __m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rUpdateGateValue = updateGateValue[i];
+    rUpdateGateGrad = updateGateGrad[i];
+    rResetGateValue = resetGateValue[i];
+
+    if (prevOutValue && prevOutGrad) {
+      rResetOutputGrad = ((__m256 *)resetOutputGrad)[i];
+    }
+    if (prevOutValue) {
+      rPrevOutValue = ((__m256 *)prevOutValue)[i];
+    }
+    if (prevOutGrad) {
+      rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+    }
+
+    opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
+                rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
+                active_gate);
+
+    updateGateGrad[i] = rUpdateGateGrad;
+    resetGateGrad[i] = rResetGateGrad;
+    if (prevOutGrad) {
+      ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+    }
+  }
+#endif
+}
+
+template <class OpStateGrad, typename T>
+inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value<T> value,
+                                hl_gru_grad<T> grad, int frameSize,
+                                int batchSize, activation_mode_t active_node) {
+  for (int b = 0; b < batchSize; b++) {
+    if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_backward_state_grad(
+          opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+    } else {
+      hl_naive_gru_backward_state_grad(
+          opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+    }
+
+    value.gateValue += frameSize * 3;
+    if (value.prevOutValue) {
+      value.prevOutValue += frameSize;
+    }
+
+    grad.gateGrad += frameSize * 3;
+    grad.outputGrad += frameSize;
+    if (grad.prevOutGrad) {
+      grad.prevOutGrad += frameSize;
+    }
+  }
+}
+
+template <class OpResetGrad, typename T>
+inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value<T> value,
+                                hl_gru_grad<T> grad, int frameSize,
+                                int batchSize, activation_mode_t active_gate) {
+  for (int b = 0; b < batchSize; b++) {
+    if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_backward_reset_grad(
+          opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+    } else {
+      hl_naive_gru_backward_reset_grad(
+          opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+    }
+
+    value.gateValue += frameSize * 3;
+    if (value.prevOutValue) {
+      value.prevOutValue += frameSize;
+    }
+
+    grad.gateGrad += frameSize * 3;
+    grad.resetOutputGrad += frameSize;
+    if (grad.prevOutGrad) {
+      grad.prevOutGrad += frameSize;
+    }
+  }
+}
+
+#endif
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..6441c648b048422c110872a85aa8cb719f11a8d7
--- /dev/null
+++ b/paddle/operators/math/detail/gru_gpu_kernel.h
@@ -0,0 +1,203 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/platform/cuda_helper.h"
+#include "paddle/platform/device_context.h"
+
+#include <glog/logging.h>
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template <class OpResetOutput, bool isBatch, typename T>
+__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
+                                        T *gateValue, T *resetOutputValue,
+                                        T *prevOutputValue, int frameSize,
+                                        int batchSize,
+                                        activation_mode_t active_gate) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    resetOutputValue += batchIdx * frameSize;
+  }
+
+  T rPrevOut = 0;
+  T rValueResetOutput;
+  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
+  T rValueResetGate = gateValue[frameIdx + frameSize * 1];
+
+  if (prevOutputValue) {
+    if (isBatch) prevOutputValue += batchIdx * frameSize;
+    rPrevOut = prevOutputValue[frameIdx];
+  }
+
+  opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput,
+                active_gate);
+
+  gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
+  gateValue[frameIdx + frameSize * 1] = rValueResetGate;
+  resetOutputValue[frameIdx] = rValueResetOutput;
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template <class OpFinalOutput, bool isBatch, typename T>
+__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
+                                        T *gateValue, T *prevOutputValue,
+                                        T *outputValue, int frameSize,
+                                        int batchSize,
+                                        activation_mode_t active_node) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    outputValue += batchIdx * frameSize;
+  }
+
+  T rOutput;
+  T rPrevOut = 0;
+  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
+  T rValueFrameState = gateValue[frameIdx + frameSize * 2];
+
+  if (prevOutputValue) {
+    if (isBatch) prevOutputValue += batchIdx * frameSize;
+    rPrevOut = prevOutputValue[frameIdx];
+  }
+
+  opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
+                active_node);
+
+  gateValue[frameIdx + frameSize * 2] = rValueFrameState;
+  outputValue[frameIdx] = rOutput;
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template <class OpStateGrad, bool isBatch, typename T>
+__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue,
+                                       T *gateGrad, T *prevOutValue,
+                                       T *prevOutGrad, T *outputGrad,
+                                       int frameSize, int batchSize,
+                                       activation_mode_t active_node) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    gateGrad += batchIdx * 3 * frameSize;
+    outputGrad += batchIdx * frameSize;
+  }
+
+  T rUpdateGateGrad;
+  T rFrameStateGrad;
+  T rPrevOutValue = 0;
+  T rPrevOutGrad = 0;
+  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
+  T rFrameStateValue = gateValue[frameIdx + frameSize * 2];
+  T rOutGrad = outputGrad[frameIdx];
+
+  if (prevOutValue && prevOutGrad) {
+    if (isBatch) prevOutValue += batchIdx * frameSize;
+    rPrevOutValue = prevOutValue[frameIdx];
+
+    if (isBatch) prevOutGrad += batchIdx * frameSize;
+    rPrevOutGrad = prevOutGrad[frameIdx];
+  }
+
+  opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
+              rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
+              active_node);
+
+  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
+  gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
+  if (prevOutGrad) {
+    prevOutGrad[frameIdx] = rPrevOutGrad;
+  }
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template <class OpResetGrad, bool isBatch, typename T>
+__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue,
+                                       T *gateGrad, T *prevOutValue,
+                                       T *prevOutGrad, T *resetOutputGrad,
+                                       int frameSize, int batchSize,
+                                       activation_mode_t active_gate) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    gateGrad += batchIdx * 3 * frameSize;
+    resetOutputGrad += batchIdx * frameSize;
+  }
+
+  T rResetGateGrad;
+  T rPrevOutValue = 0;
+  T rPrevOutGrad = 0;
+  T rResetOutputGrad = 0;
+  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
+  T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0];
+  T rResetGateValue = gateValue[frameIdx + frameSize * 1];
+
+  if (prevOutValue && prevOutGrad) {
+    if (isBatch) prevOutValue += batchIdx * frameSize;
+    if (isBatch) prevOutGrad += batchIdx * frameSize;
+    rPrevOutValue = prevOutValue[frameIdx];
+    rPrevOutGrad = prevOutGrad[frameIdx];
+    rResetOutputGrad = resetOutputGrad[frameIdx];
+  }
+
+  opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
+              rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
+              active_gate);
+
+  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
+  gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
+  if (prevOutGrad) {
+    prevOutGrad[frameIdx] = rPrevOutGrad;
+  }
+}
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/operators/math/detail/gru_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a681d8d8bced72e1296f863489f6ccbc7913167
--- /dev/null
+++ b/paddle/operators/math/detail/gru_kernel.h
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/platform/hostdevice.h"
+
+#include <type_traits>
+
+// TODO(guosheng): refine code style in gru_kernel
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+namespace forward {
+
+template <typename T>
+class gru_resetOutput {
+ public:
+  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut,
+                             T &valueResetOutput, activation_mode_t actGate) {
+    valueUpdateGate = activation(valueUpdateGate, actGate);
+    valueResetGate = activation(valueResetGate, actGate);
+    valueResetOutput = prevOut * valueResetGate;
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate,
+                             __m256 &prevOut, __m256 &valueResetOutput,
+                             activation_mode_t actGate) {
+    valueUpdateGate = activation(valueUpdateGate, actGate);
+    valueResetGate = activation(valueResetGate, actGate);
+    valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate);
+  }
+#endif
+#endif
+};
+
+template <typename T>
+class gru_finalOutput {
+ public:
+  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut,
+                             T &valueOutput, activation_mode_t actInput) {
+    valueFrameState = activation(valueFrameState, actInput);
+    valueOutput = prevOut - (valueUpdateGate * prevOut) +
+                  (valueUpdateGate * valueFrameState);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState,
+                             __m256 &prevOut, __m256 &valueOutput,
+                             activation_mode_t actInput) {
+    valueFrameState = activation(valueFrameState, actInput);
+    valueOutput = _mm256_add_ps(
+        _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)),
+        _mm256_mul_ps(valueUpdateGate, valueFrameState));
+  }
+#endif
+#endif
+};
+}  // namespace forward
+
+namespace backward {
+
+template <typename T>
+class gru_stateGrad {
+ public:
+  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
+                             T &valueFrameState, T &gradFrameState,
+                             T &valuePrevOut, T &gradPrevOut, T &gradOutput,
+                             activation_mode_t actInput) {
+    gradUpdateGate = (gradOutput * valueFrameState);
+    gradUpdateGate -= (gradOutput * valuePrevOut);
+    gradPrevOut -= (gradOutput * valueUpdateGate);
+    gradPrevOut += gradOutput;
+    gradFrameState =
+        activation(gradOutput * valueUpdateGate, valueFrameState, actInput);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
+                             __m256 &valueFrameState, __m256 &gradFrameState,
+                             __m256 &valuePrevOut, __m256 &gradPrevOut,
+                             __m256 &gradOutput, activation_mode_t actInput) {
+    gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState);
+    gradUpdateGate =
+        _mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut));
+    gradPrevOut = _mm256_add_ps(
+        _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)),
+        gradOutput);
+    gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate),
+                                valueFrameState, actInput);
+  }
+#endif
+#endif
+};
+
+template <typename T>
+class gru_resetGrad {
+ public:
+  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
+                             T &valueResetGate, T &gradResetGate,
+                             T &valuePrevOut, T &gradPrevOut,
+                             T &gradResetOutput, activation_mode_t actGate) {
+    gradResetGate = (gradResetOutput * valuePrevOut);
+    gradPrevOut += (gradResetOutput * valueResetGate);
+    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
+    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
+                             __m256 &valueResetGate, __m256 &gradResetGate,
+                             __m256 &valuePrevOut, __m256 &gradPrevOut,
+                             __m256 &gradResetOutput,
+                             activation_mode_t actGate) {
+    gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut);
+    gradPrevOut = _mm256_add_ps(gradPrevOut,
+                                _mm256_mul_ps(gradResetOutput, valueResetGate));
+    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
+    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+  }
+#endif
+#endif
+};
+
+}  // namespace backward
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..125af449d3f700e24be5e4b7615c3b0e03fd4e5b
--- /dev/null
+++ b/paddle/operators/math/gru_compute.cc
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/detail/gru_cpu_kernel.h"
+#include "paddle/operators/math/detail/gru_kernel.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct GRUUnitFunctor<platform::CPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+#ifndef __NVCC__
+    if (value.prevOutValue) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, false, batchSize, frameSize * 2, frameSize, 1,
+          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
+          value.gateValue, frameSize * 3);
+    }
+
+    detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
+                                 frameSize, batchSize, active_gate);
+
+    if (value.prevOutValue) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, false, batchSize, frameSize, frameSize, 1,
+          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
+          value.gateValue + frameSize * 2, frameSize * 3);
+    }
+
+    detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
+                                 frameSize, batchSize, active_node);
+#endif
+  }
+};
+
+template <typename T>
+struct GRUUnitGradFunctor<platform::CPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
+                      int batchSize, activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+#ifndef __NVCC__
+    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
+                                grad, frameSize, batchSize, active_node);
+
+    if (value.prevOutValue && grad.prevOutGrad) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, true, batchSize, frameSize, frameSize, 1,
+          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
+          frameSize, 0, grad.resetOutputGrad, frameSize);
+
+      if (grad.stateWeightGrad) {
+        math::gemm<platform::CPUPlace, T>(
+            context, true, false, frameSize, frameSize, batchSize, 1,
+            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
+            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
+      }
+    }
+
+    detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
+                                grad, frameSize, batchSize, active_gate);
+
+    if (grad.prevOutGrad && value.prevOutValue) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, true, batchSize, frameSize, frameSize * 2, 1,
+          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
+          grad.prevOutGrad, frameSize);
+
+      if (grad.gateWeightGrad) {
+        math::gemm<platform::CPUPlace, T>(
+            context, true, false, frameSize, frameSize * 2, batchSize, 1,
+            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
+            grad.gateWeightGrad, frameSize * 2);
+      }
+    }
+#endif
+  }
+};
+
+template struct GRUUnitFunctor<platform::CPUPlace, float>;
+template struct GRUUnitFunctor<platform::CPUPlace, double>;
+template struct GRUUnitGradFunctor<platform::CPUPlace, float>;
+template struct GRUUnitGradFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7b9e54ac029f6aa00553338435684097d6d02b25
--- /dev/null
+++ b/paddle/operators/math/gru_compute.cu
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/gru_gpu_kernel.h"
+#include "paddle/operators/math/detail/gru_kernel.h"
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct GRUUnitFunctor<platform::GPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
+    dim3 threads;
+    dim3 grid;
+    if (batchSize == 1) {
+      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+      int frameBlocks = (frameSize + 1024 - 1) / 1024;
+      threads = dim3(framePerBlock, 1);
+      grid = dim3(frameBlocks, 1);
+    } else {
+      threads = dim3(32, 32);
+      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+    }
+
+    if (value.prevOutValue) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, false, batchSize, frameSize * 2, frameSize, 1,
+          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
+          value.gateValue, frameSize * 3);
+    }
+
+    if (batchSize == 1) {
+      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
+                                      /* isBatch= */ false,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_resetOutput<T>(), value.gateValue,
+          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
+          active_gate);
+    } else {
+      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
+                                      /* isBatch= */ true,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_resetOutput<T>(), value.gateValue,
+          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
+          active_gate);
+    }
+
+    if (value.prevOutValue) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, false, batchSize, frameSize, frameSize, 1,
+          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
+          value.gateValue + frameSize * 2, frameSize * 3);
+    }
+
+    if (batchSize == 1) {
+      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
+                                      /* isBatch= */ false,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_finalOutput<T>(), value.gateValue,
+          value.prevOutValue, value.outputValue, frameSize, batchSize,
+          active_node);
+    } else {
+      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
+                                      /* isBatch= */ true,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_finalOutput<T>(), value.gateValue,
+          value.prevOutValue, value.outputValue, frameSize, batchSize,
+          active_node);
+    }
+  }
+};
+
+template <typename T>
+struct GRUUnitGradFunctor<platform::GPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
+                      int batchSize, activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
+    dim3 threads;
+    dim3 grid;
+    if (batchSize == 1) {
+      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+      int frameBlocks = (frameSize + 1024 - 1) / 1024;
+      threads = dim3(framePerBlock, 1);
+      grid = dim3(frameBlocks, 1);
+    } else {
+      threads = dim3(32, 32);
+      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+    }
+
+    if (batchSize == 1) {
+      detail::KeGruBackwardStateGrad<
+          detail::backward::gru_stateGrad<T>,
+          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
+          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
+          batchSize, active_node);
+    } else {
+      detail::KeGruBackwardStateGrad<
+          detail::backward::gru_stateGrad<T>,
+          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
+          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
+          batchSize, active_node);
+    }
+
+    if (value.prevOutValue && grad.prevOutGrad) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, true, batchSize, frameSize, frameSize, 1,
+          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
+          frameSize, 0, grad.resetOutputGrad, frameSize);
+
+      if (grad.stateWeightGrad) {
+        math::gemm<platform::GPUPlace, T>(
+            context, true, false, frameSize, frameSize, batchSize, 1,
+            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
+            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
+      }
+    }
+
+    if (batchSize == 1) {
+      detail::KeGruBackwardResetGrad<
+          detail::backward::gru_resetGrad<T>,
+          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
+          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
+          batchSize, active_gate);
+    } else {
+      detail::KeGruBackwardResetGrad<
+          detail::backward::gru_resetGrad<T>,
+          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
+          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
+          batchSize, active_gate);
+    }
+
+    if (grad.prevOutGrad && value.prevOutValue) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, true, batchSize, frameSize, frameSize * 2, 1,
+          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
+          grad.prevOutGrad, frameSize);
+
+      if (grad.gateWeightGrad) {
+        math::gemm<platform::GPUPlace, T>(
+            context, true, false, frameSize, frameSize * 2, batchSize, 1,
+            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
+            grad.gateWeightGrad, frameSize * 2);
+      }
+    }
+  }
+};
+
+template struct GRUUnitFunctor<platform::GPUPlace, float>;
+template struct GRUUnitFunctor<platform::GPUPlace, double>;
+template struct GRUUnitGradFunctor<platform::GPUPlace, float>;
+template struct GRUUnitGradFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..1475fb38104f353857dfd968e46af98a6d52c52a
--- /dev/null
+++ b/paddle/operators/math/gru_compute.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// TODO(guosheng): refine code style in gru_compute
+template <typename T>
+struct hl_gru_value {
+  T *gateWeight;
+  T *stateWeight;
+  T *gateValue;
+  T *resetOutputValue;
+  T *outputValue;
+  T *prevOutValue;
+};
+
+template <typename T>
+struct hl_gru_grad {
+  T *gateWeightGrad;
+  T *stateWeightGrad;
+  T *gateGrad;
+  T *resetOutputGrad;
+  T *outputGrad;
+  T *prevOutGrad;
+};
+
+template <typename Place, typename T>
+struct GRUUnitFunctor {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate);
+};
+
+template <typename Place, typename T>
+struct GRUUnitGradFunctor {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
+                      int batchSize, activation_mode_t active_node,
+                      activation_mode_t active_gate);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_pooling.cc b/paddle/operators/math/sequence_pooling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5913c99fdb01100d0de44ab317124550fa626528
--- /dev/null
+++ b/paddle/operators/math/sequence_pooling.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence_pooling.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+class MaxSeqPoolFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    auto idx_dims = index->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), 1);
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
+
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+    int* max_index = index->data<int>();
+
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      for (int64_t k = 0; k < dim; ++k) {
+        out_data[i * dim + k] = in_data[starts[i] * dim + k];
+        max_index[i * dim + k] = starts[i];
+      }
+      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
+        for (int64_t k = 0; k < dim; ++k) {
+          if (in_data[j * dim + k] > out_data[i * dim + k]) {
+            out_data[i * dim + k] = in_data[j * dim + k];
+            max_index[i * dim + k] = j;
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class MaxSeqPoolGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad) {
+    auto og_dims = out_grad.dims();
+    auto ig_dims = in_grad->dims();
+    auto idx_dims = index.dims();
+    PADDLE_ENFORCE_GT(og_dims.size(), 1);
+    PADDLE_ENFORCE_GT(ig_dims.size(), 1);
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, og_dims);
+
+    const T* og_data = out_grad.data<T>();
+    const int* max_index = index.data<int>();
+    T* ig_data = in_grad->data<T>();
+
+    SetConstant<platform::CPUPlace, T> set_zero;
+    set_zero(context, in_grad, static_cast<T>(0.0));
+    int64_t num_seq = og_dims[0];
+    int64_t dim = out_grad.numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      for (int64_t j = 0; j < dim; ++j) {
+        int step_id = max_index[i * dim + j];
+        ig_data[step_id * dim + j] = og_data[i * dim + j];
+      }
+    }
+  }
+};
+
+template class MaxSeqPoolFunctor<platform::CPUPlace, float>;
+template class MaxSeqPoolFunctor<platform::CPUPlace, double>;
+template class MaxSeqPoolGradFunctor<platform::CPUPlace, float>;
+template class MaxSeqPoolGradFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5ed951402fecba66a8960f4d024bf3785dac51c7
--- /dev/null
+++ b/paddle/operators/math/sequence_pooling.cu
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence_pooling.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#define FLT_MAX __FLT_MAX__
+
+template <typename T>
+__global__ void KeMaxSequencePool(const T* input, const size_t* starts,
+                                  T* output, int* index, int64_t num_seq,
+                                  int64_t dim) {
+  int dim_idx = threadIdx.x;
+  int seq_id = blockIdx.x;
+  if (seq_id >= num_seq) return;
+  size_t start = starts[seq_id];
+  size_t end = starts[seq_id + 1];
+
+  for (int64_t i = dim_idx; i < dim; i += blockDim.x) {
+    T max_val = static_cast<T>(-FLT_MAX);
+    int max_id = -1;
+    for (size_t step_id = start; step_id < end; step_id++) {
+      if (max_val < input[step_id * dim + i]) {
+        max_val = input[step_id * dim + i];
+        max_id = step_id;
+      }
+    }
+    output[seq_id * dim + i] = max_val;
+    index[seq_id * dim + i] = max_id;
+  }
+}
+
+template <typename T>
+class MaxSeqPoolFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    auto idx_dims = index->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), static_cast<int64_t>(1));
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
+
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+    int* max_index = index->data<int>();
+
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+
+    dim3 threads(256, 1);
+    dim3 grid(num_seq, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
+        in_data, starts.data(), out_data, max_index, num_seq, dim);
+  }
+};
+
+template <typename T>
+__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index,
+                                      T* in_grad, int64_t num_seq,
+                                      int64_t dim) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int col_idx = idx % dim;
+  if (idx < num_seq * dim) {
+    int step_id = max_index[idx];
+    in_grad[step_id * dim + col_idx] = out_grad[idx];
+  }
+}
+
+template <typename T>
+class MaxSeqPoolGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad) {
+    auto og_dims = out_grad.dims();
+    auto idx_dims = index.dims();
+    auto ig_dims = in_grad->dims();
+    PADDLE_ENFORCE_GT(og_dims.size(), static_cast<int64_t>(1));
+    PADDLE_ENFORCE_GT(ig_dims.size(), static_cast<int64_t>(1));
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, og_dims);
+
+    const T* og_data = out_grad.data<T>();
+    const int* max_index = index.data<int>();
+    T* ig_data = in_grad->data<T>();
+
+    SetConstant<platform::GPUPlace, T> set_zero;
+    set_zero(context, in_grad, static_cast<T>(0.0));
+    int64_t num_seq = og_dims[0];
+    int64_t dim = out_grad.numel() / num_seq;
+
+    unsigned int blocks = (num_seq * dim + 128 - 1) / 128;
+    dim3 threads(128, 1);
+    dim3 grid(blocks, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    KeMaxSequencePoolGrad<T><<<grid, threads, 0, stream>>>(
+        og_data, max_index, ig_data, num_seq, dim);
+  }
+};
+
+template class MaxSeqPoolFunctor<platform::GPUPlace, float>;
+template class MaxSeqPoolFunctor<platform::GPUPlace, double>;
+template class MaxSeqPoolGradFunctor<platform::GPUPlace, float>;
+template class MaxSeqPoolGradFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_pooling.h b/paddle/operators/math/sequence_pooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..35dfe26de1a87a064410401244914d4e2a94176e
--- /dev/null
+++ b/paddle/operators/math/sequence_pooling.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#define FLT_MAX __FLT_MAX__
+
+template <typename Place, typename T>
+class MaxSeqPoolFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index);
+};
+
+template <typename Place, class T>
+class MaxSeqPoolGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc
index 5ecbee3b413617e3a5523d9a32e72bc08bd316c5..5a1a6154203d40186f1e41491194b19612931b1f 100644
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
@@ -144,7 +144,10 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
         )DOC")
         .SetDefault(false);
     AddComment(R"DOC(
-The MatMul operator is used to perform (batched) matrix multiplication
+MatMul Operator.
+
+
+This operator is used to perform (batched) matrix multiplication
 over the last two dimensions of the input tensors `X` and `Y`.
 
 If a transpose flag is specified, the last two dimensions of the
@@ -166,7 +169,8 @@ The differences are:
 - We add `transpose_X` and `transpose_Y` flags.
 
 Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+or not. But the output only shares the LoD information with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index 7caa1c9d0cf4dba33a206c85bcbed1fb1cb4e010..78b4bbca84d4670aba73222f1d679604d7516b02 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -36,7 +36,11 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
     AddOutput("Out", "The output of mean op");
-    AddComment(R"DOC( Mean Operator
+    AddComment(R"DOC(
+Mean Operator.
+
+Out is a scalar which is the mean of all elements in X. 
+
 )DOC");
   }
 };
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index f7943e99acc5975d077f2319b6f678cfc693c1f3..4684c20208501a3239fd57b35428946bb52af4a0 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -52,14 +52,16 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Y", "The right tensor of minus operator.");
     AddOutput("Out", "The output tensor of minus operator.");
 
-    AddComment(R"DOC(Minus Operator
+    AddComment(R"DOC(
+Minus Operator.
 
 Equation:
 
-    Out = X - Y
+    $Out = X - Y$
 
 Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+or not. But the output only shares the LoD information with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc
index 7b9e9528952d552a69ffe6a628672901c5c1a7fd..28528848af1f467bf38be53f9d05fee6ca3f93cc 100644
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -43,27 +43,35 @@ class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
                            framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "The input tensor of modified huber loss op."
+             "The input tensor of modified huber loss op. "
              "X is 2-D tensor with shape [batch_size, 1].");
     AddInput("Y",
-             "The target labels of modified huber loss op."
-             "The shape of Y is same as X. Values of Y must be 0 or 1.");
+             "The target labels of modified huber loss op. "
+             "The shape of Y is the same as X. Values of Y must be 0 or 1.");
     AddOutput("IntermediateVal",
               "Variable to save intermediate result which will be reused in "
               "backward processing.")
         .AsIntermediate();
     AddOutput("Out", "Classification loss for X.");
     AddComment(R"DOC(
-Modified huber loss is used in binary classification problem. The shape of
-input X and target Y are both [N, 1] and so is the shape of output loss.
-Since target Y is not differentiable, cacluating gradient for Y is illegal.
-The formulation of modified huber loss is:
-
-L(y, f(x)) = max(0, 1 - yf(x))^2  for yf(x) >= -1,
-             -4yf(x)              otherwise.
-
-Make sure the values of target label Y are in {0, 1} here. The operator will
+Modified Huber Loss Operator.
+
+This operator is used in binary classification problem. The shape of
+input X and target Y are both [N, 1] and so is the shape of the output loss.
+Since target Y is not differentiable, calculating gradient for Y is illegal.
+The formula of modified huber loss is:
+
+$$
+L(y, f(x)) = 
+\begin{cases}
+(\max(0, 1 - yf(x)))^2,  \text{if} \  yf(x) >= -1    \\
+             -4yf(x),    \quad \text{otherwise}
+\end{cases}
+$$
+
+Make sure the values of target label Y are in {0, 1} here. This operator will
 scale values of Y to {-1, +1} when computing losses and gradients.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc
index 2d4d6f13720f0e6888edbddcb3243116506227ba..e8ce16f4cfcf83fd13e4d3a5318a4ae0c8c8449c 100644
--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
@@ -75,17 +75,23 @@ class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("VelocityOut", "(Tensor) Output updated velocity");
 
     AddAttr<float>("mu", "(float) Momentum coefficient");
-    AddAttr<bool>("useNesterov", "(bool) Use Nesterov Momentum")
+    AddAttr<bool>("useNesterov",
+                  "(bool, default false) "
+                  "Use Nesterov Momentum")
         .SetDefault(false);
     AddComment(R"DOC(
-
-Momentum Algorithm with a flag for Nestrov Moemntum (momentum).
-
-velocity = mu * velocity + gradient
-if (use_nesterov):
-  param = param - gradient * learning_rate + mu * velocity * learning_rate
-else:
-  param = param - learning_rate * velocity
+Momentum Optimizer.
+
+This optimizer has a flag for Nestrov Momentum.
+The update equations are as follows:
+
+$$
+velocity = mu * velocity + gradient \\
+if (use\_nesterov):   \\
+  param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\
+else:   \\
+  param = param - learning\_rate * velocity. \\
+$$
 
 )DOC");
   }
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 90acf034d905e6ab3ba7bf8c3d29e1ef1161ed0c..3c39ae10dc50084cff284c307167c33c9208a3ce 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -78,6 +78,7 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "The output of mul op");
     AddAttr<int>(
         "x_num_col_dims",
+        "(int, default 1) "
         R"DOC(mul_op can take tensors with more than two dimensions as input `X`,
             in that case, tensors will be reshaped to a matrix. The matrix's first
             dimension(column length) will be the product of tensor's last
@@ -88,20 +89,24 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
         .EqualGreaterThan(1);
     AddAttr<int>(
         "y_num_col_dims",
+        "(int, default 1) "
         R"DOC(mul_op can take tensors with more than two dimensions as input `Y`,
              in that case, tensors will be reshaped to a matrix. Just like input `X`.
         )DOC")
         .SetDefault(1)
         .EqualGreaterThan(1);
     AddComment(R"DOC(
-Mul operator is used to perform matrix multiplication for input X and Y.
+Mul Operator. 
+
+This operator is used to perform matrix multiplication for input X and Y.
 
 The equation is:
 
-    Out = X * Y
+    $$Out = X * Y$$
 
 Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+or not. But the output only shares the LoD information with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index 4d86769026e4b3e3040bdcb3bc6dc2edea58b4b0..234fddcfd55ccc66f6378689dbc426499474b11f 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -66,7 +66,8 @@ class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "The candidate tensors of multiplex operator.")
         .AsDuplicable();
     AddOutput("Out", "The output tensor of multiplex operator.");
-    AddComment(R"DOC(Multiplex operator
+    AddComment(R"DOC(
+Multiplex Operator.
 
 Multiplex multiple tensors according to the index provided by the index tensor.
 
@@ -77,10 +78,11 @@ the (Ids[i])-th tensor.
 
 For i-th row of the output tensor:
 
-y[i] = x_{k}[i]
+$$y[i] = x_{k}[i]$$
 
-where y is the output tensor. `x_{k}` is the k-th input tensor
+where `y` is the output tensor, `x_{k}` is the k-th input tensor,
 and `k = Ids[i]`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/name_convention.md b/paddle/operators/name_convention.md
index 5a216907950100070ba57176c382eb659effb293..b5cb176e003b4584321142ac9f1c3380b7010936 100644
--- a/paddle/operators/name_convention.md
+++ b/paddle/operators/name_convention.md
@@ -4,10 +4,10 @@ To make the operator document itself more clear, we recommend operator names obe
 
 ### OpProtoMaker names
 
-When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator. 
+When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator.
 
 - Input/Output.
-  - Input/Output names follow the **CamelCase**. e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Input/Output much more like Variables, we prefer to meaningful English words. 
+  - Input/Output names follow the **CamelCase**. e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Input/Output much more like Variables, we prefer to meaningful English words.
   - If an operator's Input/Output are tensors in math, not match to any meaningful words, input name should starts from `X`. e.g. `X`, `Y`, and output name should starts from `Out`. e.g. `Out`. This rule intends making operators which have few inputs/outputs unified.
 
 - Attribute.
@@ -15,7 +15,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith
 
 - Comments.
   - Input/Output/Attr comment follow the format of **(type,default value) usage**, corresponding to which type it can be and how it will be used in the operator. e.g.  Attribute in Accumulator`"gamma" `,`(float, default 1.0) Accumulation multiplier`.
-  - Operator comment format of` R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is math calculation in this operator, you should write the equation in the comment. e.g. `Out = X + Y`. 
+  - Operator comment format of` R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is math calculation in this operator, you should write the equation in the comment. e.g. `Out = X + Y`.
 
 - Order.
   - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice.
@@ -24,7 +24,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith
 
 Here we give some examples to show how these rules will be used.
 
-- The operator has one input, one output. e.g.`relu`, inputs: `X`, outputs: `Out`. 
+- The operator has one input, one output. e.g.`relu`, inputs: `X`, outputs: `Out`.
 
 - The operator has two input, one output. e.g. `rowwise_add`, inputs : `X`, `Y`, outputs : `Out`.
 
@@ -38,23 +38,27 @@ public:
   AccumulateOpMaker(framework::OpProto *proto,
                             framework::OpAttrChecker *op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor. 
-    If the output size is not the same as input size, 
+    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
+    If the output size is not the same as input size,
     the output tensor is first reshaped and initialized to zero, and only then, accumulation is done.");
     AddOutput("Out", "(Tensor) Accumulated output tensor");
     AddAttr<float>("gamma", "(float, default 1.0) Accumulation multiplier").SetDefault(1.0f);
     AddComment(R"DOC(
-Accumulate operator accumulates the input tensor to the output tensor. If the
+Accumulate Operator.
+
+This operator accumulates the input tensor to the output tensor. If the
 output tensor already has the right size, we add to it; otherwise, we first
 initialize the output tensor to all zeros, and then do accumulation. Any
 further calls to the operator, given that no one else fiddles with the output
 in the interim, will do simple accumulations.
-Accumulation is done as shown:
+
+Accumulation is done as follows:
 
 Out = 1*X + gamma*Out
 
 where X is the input tensor, Out is the output tensor and gamma is the multiplier
 argument.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index d39cb2fcf9cc205edf86f8ab1d5e04b5672e00f6..66fcc09bc877867e66a37adc73230d8dabf4cbed 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -48,12 +48,17 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddOutput("Communicator",
               "Create Communicator for communicating between gpus");
-    AddAttr<std::vector<int>>("gpus", "gpu id lists");
-    AddAttr<int>("data_type", "output data type")
+    AddAttr<std::vector<int>>("gpus", "(vector<int>) GPU id lists");
+    AddAttr<int>("data_type",
+                 "(int, default 5 (FP32)) "
+                 "Output data type")
         .SetDefault(framework::DataType::FP32);
     AddComment(R"DOC(
-               create communicator.
-        )DOC");
+NCCLInit Operator.
+
+Create communicator.
+
+)DOC");
   }
 };
 
@@ -143,11 +148,15 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of AllReduce op");
     AddAttr<std::string>("reduction",
+                         "(string, default 'ncclSum') "
                          "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
         .SetDefault("ncclSum");
     AddComment(R"DOC(
-            AllReduce the input tensors.
-        )DOC");
+NCCLAllReduce Operator.
+
+AllReduce the input tensors.
+
+)DOC");
   }
 };
 
@@ -161,14 +170,20 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Reduce op");
     AddAttr<std::string>("reduction",
+                         "(string, default 'ncclSum') "
                          "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
         .SetDefault("ncclSum");
     AddAttr<int>("root",
-                 "root gpu of the parameter. if not "
-                 "set(platform::kInvalidGPUId). hashed by name.")
+                 "(int, default kInvalidGPUId) "
+                 "Root gpu of the parameter. If not, "
+                 "set(platform::kInvalidGPUId). Hashed by name.")
         .SetDefault(platform::kInvalidGPUId);
     AddComment(R"DOC(
-            Reduce the tensors)DOC");
+NCCLReduce Operator.
+
+Reduce the tensors.
+
+)DOC");
   }
 };
 
@@ -182,12 +197,16 @@ class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Bcast");
     AddAttr<int>("root",
-                 "root gpu of the parameter. if not "
-                 "set(platform::kInvalidGPUId). hashed by name.")
+                 "(int, default kInvalidGPUId) "
+                 "Root gpu of the parameter. If not, "
+                 "set(platform::kInvalidGPUId). Hashed by name.")
         .SetDefault(platform::kInvalidGPUId);
     AddComment(R"DOC(
-            Bcast the tensors.
-        )DOC");
+NCCLBcast Operator.
+
+Bcast the tensors.
+
+)DOC");
   }
 };
 
diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc
index 73a0b8baff530840ddd0d4c65cd4c060ab18e401..adb75df6ef10c59fc6f3db4d36e1ffb1ae0b4b1e 100644
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -54,41 +54,44 @@ class PadOpMaker : public framework::OpProtoAndCheckerMaker {
              "The input of pad op. "
              "The input should be a k-D tensor(k > 0 and k < 7)");
     AddOutput("Out",
-              "The output of pad op."
+              "The output of pad op. "
               "A tensor with the same shape as X.");
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "(vector<int>) "
+        "A list<int> to describe the padding rules for each dimension. "
+        "For 2-D image tensor, paddings=[0, 1, 2, 3] means "
+        "padding 0 row to top, 1 row to bottom, 2 columns to left "
+        "and 3 columns to right. Size of paddings should be equal to "
+        "2 * dimension size of the input tensor.");
+    AddAttr<float>("pad_value",
+                   "(float, default 0.0) "
+                   "The value to fill the padded areas.")
+        .SetDefault(0.0f);
     AddComment(R"DOC(
-Pad input into output, as specified by paddings and pad_value. The input should be a k-D tensor(k > 0 and k < 7). As an example:
+Pad Operator.
+
+Pad input into output, as specified by paddings and pad_value. 
+The input should be a k-D tensor(k > 0 and k < 7). As an example:
 
 Given:
 
 X = [[1, 2],
-   [3, 4]]
-
-and
+     [3, 4]],
 
-paddings = [0, 1, 1, 2]
+paddings = [0, 1, 1, 2],
 
 and
 
-pad_value = 0
+pad_value = 0,
 
-then we get
+we have:
 
 Out = [[0, 1, 2, 0, 0]
        [0, 3, 4, 0, 0]
        [0, 0, 0, 0, 0]]
+
 )DOC");
-    AddAttr<std::vector<int>>(
-        "paddings",
-        "A list<int> to describes padding rules for each dimension."
-        " For 2-D image tensor, paddings=[0, 1, 2, 3] means"
-        " padding 0 row to top, 1 row to bottom, 2 columns to left"
-        " and 3 columns to right.Size of paddings should be equal to"
-        " 2 * dimension size of input tensor.");
-    AddAttr<float>("pad_value",
-                   "(float) default to 0; "
-                   "The value to fill padded areas.")
-        .SetDefault(0.0f);
   }
 };
 
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
index 4d75c11bc8130343e95f75e687529303179caa93..f58aab733866973f477ca79e5e53ba58adbf3dc7 100644
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -73,125 +73,138 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
   AddInput(
       "X",
       "(Tensor) The input tensor of pooling operator. "
-      "The format of input tensor is NCHW. Where N is batch size, C is the "
-      "number of channels, H and W is the height and width of feature.");
+      "The format of input tensor is NCHW, where N is batch size, C is the "
+      "number of channels, H is the height of the feature, "
+      "and W is the width of the feature.");
   AddOutput("Out",
-            "(Tensor) The output tensor of pooling operator."
-            "The format of output tensor is also NCHW."
-            "Where N is batch size, C is "
-            "the number of channels, H and W is the height and "
-            "width of feature.");
+            "(Tensor) The output tensor of pooling operator. "
+            "The format of output tensor is also NCHW, "
+            "where N is batch size, C is the number of channels, "
+            "H is the height of the feature, "
+            "and W is the width of the feature.");
 
   AddAttr<std::string>("poolingType",
                        "(string), pooling type, can be \"max\" for max-pooling "
                        "and \"avg\" for average-pooling.")
       .InEnum({"max", "avg"});
   AddAttr<std::vector<int>>("ksize",
-                            "(vector ), the pooling window size(height, width) "
-                            "of pooling operator."
+                            "(vector<int>) The pooling window "
+                            "size(height, width) of the pooling operator. "
                             "If globalPooling = true, ksize and paddings will "
                             "be ignored.");  // TODO(Chengduo): Add checker.
                                              // (Currently,
   // TypedAttrChecker don't support vector type.)
   AddAttr<bool>("globalPooling",
-                "(bool default: false), whether to use the global pooling."
+                "(bool, default false) Whether to use the global pooling. "
                 "If globalPooling = true, ksize and paddings will be ignored.")
       .SetDefault(false);
-  AddAttr<std::vector<int>>(
-      "strides",
-      "(vector, default:{1, 1}), strides(height, width) of pooling operator.")
+  AddAttr<std::vector<int>>("strides",
+                            "(vector<int>, default {1, 1}), strides(height, "
+                            "width) of pooling operator.")
       .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
   // TypedAttrChecker don't support vector type.)
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector defalut:{0,0}), paddings(height, width) of pooling operator."
+      "(vector<int>, defalut {0,0}), paddings(height, width) of pooling "
+      "operator."
       "If globalPooling = true, paddings and ksize will be ignored.")
       .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
   // TypedAttrChecker don't support vector type.)
 
   AddComment(R"DOC(
+Pool2d Operator.
+
 The pooling2d operation calculates the output based on
 the input, poolingType and ksize, strides, paddings parameters.
-Input(X) and output(Out) are in NCHW format. Where N is batch size, C is the
-number of channels, H and W is the height and width of feature.
+Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
+number of channels, H is the height of the feature, and W is the width of the feature.
 Parameters(ksize, strides, paddings) are two elements.
 These two elements represent height and width, respectively.
 The input(X) size and output(Out) size may be different.
 
 Example:
   Input:
-       X shape: (N, C, H_in, W_in)
+       X shape: $(N, C, H_{in}, W_{in})$
   Output:
-       Out shape: (N, C, H_out, W_out)
-  where
-       H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-       W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+       Out shape: $(N, C, H_{out}, W_{out})$
+  where 
+       $$ 
+       H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
+       W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+       $$
+
 )DOC");
 }
 
 Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
                              framework::OpAttrChecker *op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
-  AddInput(
-      "X",
-      "(Tensor) The input tensor of pooling operator. "
-      "The format of input tensor is NCDHW. Where N is batch size, C is "
-      "the number of channels, D, H and W is the depth, height and width of "
-      "feature.");
+  AddInput("X",
+           "(Tensor) The input tensor of pooling operator. "
+           "The format of input tensor is NCDHW, where N is batch size, C is "
+           "the number of channels, and D, H and W is the depth, height and "
+           "width of "
+           "the feature, respectively.");
   AddOutput("Out",
             "(Tensor) The output tensor of pooling operator."
-            "The format of output tensor is also NCDHW."
-            "Where N is batch size, C is "
-            "the number of channels, D, H and W is the depth, height and "
-            "width of feature.");
+            "The format of output tensor is also NCDHW, "
+            "where N is batch size, C is "
+            "the number of channels, and D, H and W is the depth, height and "
+            "width of the feature, respectively.");
 
   AddAttr<std::string>("poolingType",
-                       "(string), pooling type, can be \"max\" for max-pooling "
+                       "(string) Pooling type, can be \"max\" for max-pooling "
                        "and \"avg\" for average-pooling.")
       .InEnum({"max", "avg"});
-  AddAttr<std::vector<int>>("ksize",
-                            "(vector ), the pooling window size(depth, height, "
-                            "width) of pooling "
-                            "operator."
-                            "If globalPooling = true, ksize and paddings wille "
-                            "be ignored.");  // TODO(Chengduo): Add checker.
-                                             // (Currently,
+  AddAttr<std::vector<int>>(
+      "ksize",
+      "(vector<int>) The pooling window size(depth, height, "
+      "width) of pooling operator. "
+      "If globalPooling = true, ksize and paddings will "
+      "be ignored.");  // TODO(Chengduo): Add checker.
+                       // (Currently,
   // TypedAttrChecker don't support vector type.)
   AddAttr<bool>("globalPooling",
-                "(bool default: false), whether to use the global pooling."
+                "(bool, default false) Whether to use the global pooling. "
                 "If globalPooling = true, ksize and paddings wille be ignored.")
       .SetDefault(false);
-  AddAttr<std::vector<int>>("strides",
-                            "(vector, default:{1,1,1}), strides(depth, height, "
-                            "width) of pooling operator.")
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector<int>, default {1,1,1}) Strides(depth, height, "
+      "width) of the pooling operator.")
       .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
                                // TypedAttrChecker don't support vector type.)
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector defalut:{0,0,0}), paddings(depth, height, "
-      "width) of pooling operator."
-      "If globalPooling = true, ksize and paddings wille be ignored.")
+      "(vector<int>, defalut {0,0,0}), paddings(depth, height, "
+      "width) of pooling operator. "
+      "If globalPooling = true, ksize and paddings will be ignored.")
       .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
                                // TypedAttrChecker don't support vector type.)
 
   AddComment(R"DOC(
+Pool3d Operator.
+
 The pooling3d operation calculates the output based on
-the input, poolingType and ksize, strides, paddings parameters.
-Input(X) and output(Out) are in NCDHW format. Where N is batch
-size, C is the number of channels, D, H and W is the depth, height and
-width of feature. Parameters(ksize, strides, paddings) are three elements.
-These three elements represent depth, height and width, respectively.
-The input(X) size and output(Out) size may be different.
+the input, poolingType, ksize, strides, and paddings parameters.
+Input(X) and output(Out) are in NCDHW format, where N is batch
+size, C is the number of channels, and D, H and W are the depth, height and
+width of the feature, respectively. Parameters(ksize, strides, paddings) 
+are three elements. These three elements represent depth, height and 
+width, respectively. The input(X) size and output(Out) size may be different.
 
 Example:
   Input:
-       X shape: (N, C, D_in, H_in, W_in)
+       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
   Output:
-       Out shape: (N, C, D_out, H_out, W_out)
+       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
   where
-       D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-       H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
-       W_out = (W_in - ksize[2] + 2 * paddings[2]) / strides[2] + 1;
+       $$
+       D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
+       H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\
+       W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+       $$
+
 )DOC");
 }
 }  // namespace operators
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc
index 95e896e7cc33b1aebe78d1af8746a25318048041..a31b3fcb7083ba5bfbcb6666c4641833ae1e69eb 100644
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -89,64 +89,73 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
-        "(Tensor), the input tensor of pooling operator. "
-        "The format of input tensor is NCHW. Where N is batch size, C is the "
-        "number of channels, H and W is the height and width of image.");
+        "(Tensor) The input tensor of pooling operator. "
+        "The format of input tensor is NCHW, where N is batch size, C is the "
+        "number of channels, H is the height of the image, "
+        "and W is the width of the image.");
     AddOutput("Out",
-              "(Tensor), the output tensor of pooling operator."
-              "The format of output tensor is also NCHW."
-              "Where N is batch size, C is "
-              "the number of channels, H and W is the height and "
-              "width of image.");
+              "(Tensor) The output tensor of pooling operator. "
+              "The format of output tensor is also NCHW, "
+              "where N is batch size, C is "
+              "the number of channels, H is the height of the image "
+              "and W is the width of the image.");
     AddOutput("Mask",
-              "(Tensor), the Mask tensor of pooling operator."
-              "The format of output tensor is also NCHW."
-              "Where N is batch size, C is the number of channels, H and W "
-              "is the height and width of image."
-              "The value in it is the index in current feature map");
+              "(Tensor) The Mask tensor of pooling operator."
+              "The format of output tensor is also NCHW, "
+              "where N is batch size, C is the number of channels, "
+              "H is the height of the image, "
+              "and W is the width of the image. "
+              "It represents the index in the current feature map.");
 
     AddAttr<std::vector<int>>("ksize",
-                              "(vector ), the pooling window size(height, "
-                              "width) of pooling operator."
+                              "(vector<int>) The pooling window size(height, "
+                              "width) of pooling operator. "
                               "If globalPooling = true, ksize and paddings "
                               "will be ignored.");  // TODO(Chengduo): Add
                                                     // checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<bool>(
         "globalPooling",
-        "(bool default: false), whether to use the global pooling."
+        "(bool, default false) Whether to use the global pooling. "
         "If globalPooling = true, ksize and paddings will be ignored.")
         .SetDefault(false);
-    AddAttr<std::vector<int>>(
-        "strides",
-        "(vector, default:{1, 1}), strides(height, width) of pooling operator.")
+    AddAttr<std::vector<int>>("strides",
+                              "(vector<int>, default {1, 1}), strides(height, "
+                              "width) of pooling operator.")
         .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<std::vector<int>>(
         "paddings",
-        "(vector defalut:{0, 0}), paddings(height, width) of pooling operator."
+        "(vector<int>, defalut {0, 0}), paddings(height, width) of pooling "
+        "operator. "
         "If globalPooling = true, paddings and will be ignored.")
         .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
 
     AddComment(R"DOC(
+MaxPool2d Operator.
+
 The maxPooling2d with index operation calculates the output and the mask
-based on the input and ksize, strides, paddings parameters. Input(X) and
-output(Out, Mask) are in NCHW format. Where N is batch size, C is the
-number of channels, H and W is the height and width of feature.
+based on the input, ksize, strides, and paddings parameters. Input(X) and
+output(Out, Mask) are in NCHW format, where N is batch size, C is the
+number of channels, H is the height of the feature, 
+and W is the width of the feature.
 Parameters(ksize, strides, paddings) are two elements.
 These two elements represent height and width, respectively.
 The input(X) size and output(Out, Mask) size may be different.
 
 Example:
   Input:
-       X shape: (N, C, H_in, W_in)
+       X shape: $(N, C, H_{in}, W_{in})$
   Output:
-       Out shape: (N, C, H_out, W_out)
-       Mask shape: (N, C, H_out, W_out)
+       Out shape: $(N, C, H_{out}, W_{out})$
+       Mask shape: $(N, C, H_{out}, W_{out})$
   where
-       H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-       W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+       $$
+       H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
+       W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+       $$
+
 )DOC");
   }
 };
@@ -156,70 +165,76 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
   MaxPool3dWithIndexOpMaker(framework::OpProto *proto,
                             framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "X",
-        "(Tensor), the input tensor of pooling operator. "
-        "The format of input tensor is NCDHW. Where N is batch size, C is "
-        "the number of channels, D, H and W is the depth, height and width of "
-        "image.");
+    AddInput("X",
+             "(Tensor) The input tensor of pooling operator. "
+             "The format of input tensor is NCDHW, where N is batch size, C is "
+             "the number of channels, and D, H and W are the depth, height and "
+             "width of "
+             "the image, respectively");
     AddOutput("Out",
-              "(Tensor), the output tensor of pooling operator."
-              "The format of output tensor is also NCDHW."
-              "Where N is batch size, C is "
-              "the number of channels, D, H and W is the depth, height and "
-              "width of image.");
+              "(Tensor) The output tensor of pooling operator. "
+              "The format of output tensor is also NCDHW, "
+              "where N is the batch size, C is the number of channels, "
+              "and D, H and W are the depth, height and "
+              "width of the image, respectively.");
     AddOutput("Mask",
-              "(Tensor), the Mask tensor of pooling operator."
-              "The format of output tensor is also NCDHW."
-              "Where N is batch size, C is the number of channels, D, H and W "
-              "is the depth, height and width of image."
-              "The value in it is the index in current feature map");
+              "(Tensor) The Mask tensor of pooling operator. "
+              "The format of output tensor is also NCDHW, "
+              "where N is the batch size, C is the number of channels, and "
+              "D, H and W are the depth, height and width "
+              "of the image, respectively. "
+              "It represents the index in the current feature map.");
 
     AddAttr<std::vector<int>>("ksize",
-                              "(vector), the pooling window size(depth, "
-                              "height, width) of pooling "
-                              "operator."
+                              "(vector<int>) The pooling window size(depth, "
+                              "height, width) of pooling operator. "
                               "If globalPooling = true, ksize and paddings "
                               "will be ignored.");  // TODO(Chengduo): Add
                                                     // checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<bool>(
         "globalPooling",
-        "(bool default: false), whether to use the global pooling."
+        "(bool, default false) Whether to use the global pooling. "
         "If globalPooling = true, ksize and paddings will be ignored.")
         .SetDefault(false);
     AddAttr<std::vector<int>>("strides",
-                              "(vector, default:{1,1,1}), strides(depth, "
+                              "(vector<int>, default {1,1,1}), strides(depth, "
                               "height, width) of pooling operator.")
         .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<std::vector<int>>(
         "paddings",
-        "(vector defalut:{0,0,0}), paddings(depth, "
-        "height, width) of pooling operator."
+        "(vector, defalut {0,0,0}), paddings(depth, "
+        "height, width) of pooling operator. "
         "If globalPooling = true, paddings and ksize will be ignored.")
         .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
 
     AddComment(R"DOC(
+MaxPool3d Operator.
+
 The maxpooling3d with index operation calculates the output and the mask
 based on the input and ksize, strides, paddings parameters.
-Input(X) and output(Out, Mask) are in NCDHW format. Where N is batch
-size, C is the number of channels, D, H and W is the depth, height and
-width of feature. Parameters(ksize, strides, paddings) are three elements.
+Input(X) and output(Out, Mask) are in NCDHW format, where N is batch
+size, C is the number of channels, and D, H and W are the depth, height and
+width of the feature, respectively. 
+Parameters(ksize, strides, paddings) are three elements.
 These three elements represent depth, height and width, respectively.
 The input(X) size and output(Out, Mask) size may be different.
 
 Example:
   Input:
-       X shape: (N, C, D_in, H_in, W_in)
+       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
   Output:
-       Out shape: (N, C, D_out, H_out, W_out)
-       Mask shape: (N, C, D_out, H_out, W_out)
+       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
+       Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$
   where
-       D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-       H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
-       W_out = (W_in - ksize[2] + 2 * paddings[2]) / strides[2] + 1;
+       $$
+       D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
+       H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\
+       W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+       $$
+
 )DOC");
   }
 };
diff --git a/paddle/operators/precision_recall_op.cc b/paddle/operators/precision_recall_op.cc
index 39da1e0bf89ce308de62d38a6cce6dbd4c7c7f83..641f7135ded159b1c7330e87c4b1d983e959b466 100644
--- a/paddle/operators/precision_recall_op.cc
+++ b/paddle/operators/precision_recall_op.cc
@@ -92,76 +92,78 @@ class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker {
                          framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("MaxProbs",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x 1, "
+             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. Each row contains the max probability "
              "of an instance which computed by the previous top_k (k=1) "
              "operator.");
     AddInput("Indices",
-             "(Tensor, default Tensor<int>), a 2-D tensor with shape N x 1, "
+             "(Tensor, default Tensor<int>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. Each row contains the corresponding "
              "index which computed by the previous top_k (k=1) operator.");
     AddInput("Labels",
-             "(Tensor, default Tensor<int>), a 2-D tensor with shape N x 1, "
+             "(Tensor, default Tensor<int>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. Each element is a label and the "
              "value should be in [0, class_number - 1].");
     AddInput("Weights",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x 1, "
+             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. This input is optional. If provided, "
              "weight of instance would be considered when computing metrics.")
         .AsDispensable();
     AddInput("StatesInfo",
-             "(Tensor, default Tensor<int>), a 2-D tensor with shape D x 4, "
+             "(Tensor, default Tensor<int>) A 2-D tensor with shape D x 4, "
              "where D is the number of classes. This input is optional. If "
              "provided, current state will be accumulated to this state and "
-             "the accumulation state will be as the output state.")
+             "the accumulation state will be the output state.")
         .AsDispensable();
     AddOutput("BatchMetrics",
-              "(Tensor, default Tensor<float>), a 1-D tensor with shape {6}."
-              "This output tensor contains metrics for current batch data."
+              "(Tensor, default Tensor<float>) A 1-D tensor with shape {6}. "
+              "This output tensor contains metrics for current batch data. "
               "The layout is [macro average precision, macro average recall, "
               "macro f1 score, micro average precision, micro average recall, "
-              "micro f1 score]");
+              "micro f1 score].");
     AddOutput("AccumMetrics",
-              "(Tensor, default Tensor<float>), a 1-D tensor with shape {6}."
-              "This output tensor contains metrics for accumulated data."
+              "(Tensor, default Tensor<float>) A 1-D tensor with shape {6}. "
+              "This output tensor contains metrics for accumulated data. "
               "The layout is [macro average precision, macro average recall, "
               "macro f1 score, micro average precision, micro average recall, "
-              "micro f1 score]");
+              "micro f1 score].");
     AddOutput("AccumStatesInfo",
-              "(Tensor, default Tensor<float>), a 2-D tensor with shape D x 4, "
+              "(Tensor, default Tensor<float>) A 2-D tensor with shape D x 4, "
               "where D is equal to class number. This output tensor contains "
               "accumulated state variables used to compute metrics. The layout "
               "for each class is [true positives, false positives, "
               "true negatives, false negatives].");
-    AddAttr<int>("class_number", "Number of classes to be evaluated.");
+    AddAttr<int>("class_number", "(int) Number of classes to be evaluated.");
     AddComment(R"DOC(
-When given 'Input(Indices)' and 'Input(Labels)', this operator can be used
+Precision Recall Operator.
+
+When given Input(Indices) and Input(Labels), this operator can be used
 to compute various metrics including:
-  - macro average precision
-  - macro average recall
-  - macro f1 score
-  - micro average precision
-  - micro average recall
-  - micro f1 score
+1. macro average precision
+2. macro average recall
+3. macro f1 score
+4. micro average precision
+5. micro average recall
+6. micro f1 score
 
 To compute the above metrics, we need to do statistics for true positives,
-false positives and false negatives. Here count of true negatives is not
+false positives and false negatives. Here the count of true negatives is not
 necessary, but counting it may provide potential usage and the cost is
-trivial, so the operator also provides count of true negatives.
+trivial, so the operator also provides the count of true negatives.
 
 We define state as a 2-D tensor with shape [class_number, 4]. Each row of a
 state contains statistic variables for corresponding class. Layout of each row
 is: TP(true positives), FP(false positives), TN(true negatives),
-FN(false negatives). If 'Input(Weights)' provided, TP, FP, TN, FN will be
-calculated by given weight instead of instance count.
+FN(false negatives). If Input(Weights) is provided, TP, FP, TN, FN will be
+calculated by given weight instead of the instance count.
 
 This operator also supports metrics computing for cross-batch situation. To
-achieve this, 'Input(StatesInfo)' should be provided. State of current batch
-data will be accumulated to 'Input(StatesInfo)' and 'Output(AccumStatesInfo)'
+achieve this, Input(StatesInfo) should be provided. State of current batch
+data will be accumulated to Input(StatesInfo) and Output(AccumStatesInfo)
 is the accumulation state.
 
-'Output(BatchMetrics)' is metrics of current batch data while
-'Output(AccumStatesInfo)' is metrics of accumulation data.
+Output(BatchMetrics) is metrics of current batch data while
+Output(AccumStatesInfo) is metrics of accumulation data.
 
 )DOC");
   }
diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc
index eef2e34eaacf59b9adacb343e9a0091ebabeaea3..055c471b4561e5fd3c7a65c6f81d66cdce1a5578 100644
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
@@ -41,17 +41,24 @@ class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
   PReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of prelu operator.");
-    AddInput("Alpha", "The alpha weight of PRelu operator.");
-    AddOutput("Out", "The output tensor of PRelu operator.");
-    AddComment(R"DOC(PRelu operator
+    AddInput("Alpha", "The alpha weight of prelu operator.");
+    AddOutput("Out", "The output tensor of prelu operator.");
+    AddComment(R"DOC(
+PRelu Operator.
 
 The equation is:
 
-  f(x) = alpha * x , for x < 0
-  f(x) = x         , for x >= 0
+$$
+f(x) =
+\begin{cases}
+\alpha * x, \quad  \text{if} \ x < 0 \\
+x,         \qquad  \text{if} \ x >= 0
+\end{cases}
+$$
 
 The input `X` can carry the LoD (Level of Details) information,
-or not. And the output shares the LoD with input `X`.
+or not. And the output shares the LoD information with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/operators/proximal_adagrad_op.cc
index 39fbf800031cd559a49654667e5a6f634384523d..36e460103ab46bf6f1408840a0699793e2be134d 100644
--- a/paddle/operators/proximal_adagrad_op.cc
+++ b/paddle/operators/proximal_adagrad_op.cc
@@ -83,22 +83,26 @@ class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
                    "L1 regularization strength.")
         .SetDefault(0.0f);
     AddAttr<float>("l2",
-                   "(float, default 0.0)"
+                   "(float, default 0.0) "
                    "L2 regularization strength.")
         .SetDefault(0.0f);
     AddComment(R"DOC(
+Proximal Adagrad Optimizer.
 
-Optimizer that implements the proximal adagrad algorithm.
+Optimizer that implements the proximal adagrad algorithm:
 
-moment = moment + grad * grad
-prox_param = param - learning_rate * grad * (1 / sqrt(moment))
-param = sign(prox_param) / (1 + learning_rate * l2) *
-        max { |prox_param| - learning_rate * l1 , 0 }
+$$
+moment = moment + grad * grad \\
+prox\_param = param - learning\_rate * grad * (1 / \sqrt{moment}) \\
+param = sign(prox\_param) / (1 + learning\_rate * l2) *
+        \max(|prox\_param| - learning\_rate * l1 , 0)
+$$
 
 The paper that proposed Proximal GD: 
 (http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)
 Here, we use the adagrad learning rate as specified here: 
 (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+
 )DOC");
   }
 };
diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc
index e4b014b9f5866ec0791cba9b3998b1734066eeeb..5693d0ec9ebf4c470dfa5141b6eeee431f24f2ea 100644
--- a/paddle/operators/proximal_gd_op.cc
+++ b/paddle/operators/proximal_gd_op.cc
@@ -67,19 +67,23 @@ class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
                    "L1 regularization strength.")
         .SetDefault(0.0f);
     AddAttr<float>("l2",
-                   "(float, default 0.0)"
+                   "(float, default 0.0) "
                    "L2 regularization strength.")
         .SetDefault(0.0f);
     AddComment(R"DOC(
+ProximalGD Operator.
 
-Optimizer that implements the proximal gradient descent algorithm.
+Optimizer that implements the proximal gradient descent algorithm:
 
-prox_param = param - learning_rate * grad
-param = sign(prox_param) / (1 + learning_rate * l2) *
-        max { |prox_param| - learning_rate * l1 , 0 }
+$$
+prox\_param = param - learning\_rate * grad \\
+param = sign(prox\_param) / (1 + learning\_rate * l2) *
+        \max(|prox\_param| - learning\_rate * l1, 0)
+$$        
 
 The paper that proposed Proximal Gradient Descent:
 (http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)
+
 )DOC");
   }
 };
diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc
index 17ef2b1d01bd37abf2ece97ed0a307c2f1bf7e6f..061e82412ea5f4f17fd26a7094e68b97138cc09c 100644
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -26,9 +26,9 @@ class RankLossOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     // input check
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null");
-    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null");
-    PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null.");
 
     auto label_dims = ctx->GetInputDim("Label");
     auto left_dims = ctx->GetInputDim("Left");
@@ -50,32 +50,32 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Label",
              "The label indicating A ranked higher than B or not, row vector.");
     AddInput("Left", "The output of RankNet for doc A, vector.");
-    AddInput("Right", "The output of RankNet for doc B, vetor");
+    AddInput("Right", "The output of RankNet for doc B, vetor.");
     AddOutput("Out", "The output loss of RankLoss operator, vector.");
-    AddComment(R"DOC(RankLoss operator
+    AddComment(R"DOC(
+RankLoss Operator.
 
-Rank loss operator for RankNet[1]. RankNet is a pairwise ranking model with
+RankLoss operator for RankNet
+(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf). 
+RankNet is a pairwise ranking model with
 one training sample consisting of a pair of doc A and B, and the label P
 indicating that A is ranked higher than B or not:
 
 P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of
 the input pair.
 
-The RankLoss operator contains three inputs: Left (o_i), Right (o_j) and Label
-(P_{i,j}), which represent the output of RankNet for two docs and the label
-respectively, and yields the rank loss C_{i,j} by following the expression
+The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label
+(P_{i,j}), which represent the output of RankNet for the two docs and the label, 
+respectively, and yields the rank loss C_{i,j} using the following equation:
 
-\f[
+\f$$
   C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
   o_{i,j} =  o_i - o_j  \\
   \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
-\f]
+\f$$
 
 The operator can take inputs of one sample or in batch.
 
-[1]. Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to
-     Rank using Gradient Descent.
-     http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf
 )DOC");
   }
 };
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 9eb2d79b4f65d23222e68ad2a439f7554469278b..b0e87b7059eab3772c179fe31cdb09477b589ed1 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -509,14 +509,14 @@ class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
     AddInput(kInitialStates, "rnn initial states").AsDuplicable();
     AddInput(kParameters,
              "Parameters are used by step block as its input. However, the "
-             "inputs is not a sequence tensor. Every time step, each operator "
-             "in step block just use the parameter directly")
+             "input is not a sequence tensor. Every time step, each operator "
+             "in step block just use the parameter directly.")
         .AsDuplicable();
     AddOutput(kOutputs,
-              "The output sequence of RNN. The sequence length must be same")
+              "The output sequence of RNN. The sequence length must be same.")
         .AsDuplicable();
     AddOutput(kStepScopes,
-              "StepScopes contains all local variables in each time step.");
+              "StepScopes contain all local variables in each time step.");
     AddAttr<std::vector<std::string>>(kExStates,
                                       string::Sprintf(
                                           R"DOC(The ex-state variable names.
@@ -556,10 +556,12 @@ if reverse is True
       o          o          o         o
 )DOC").SetDefault(false);
     AddAttr<bool>(kIsTrain, "").SetDefault(true);
-    AddComment(R"DOC(Static Length Recurrent Operator
+    AddComment(R"DOC(
+Static Length Recurrent Operator.
+
+The static length recurrent operator can only operate on fixed size sequence
+data, i.e. in each mini-batch, the sequence length of all inputs are the same.
 
-The static length recurrent operator can only operate on fix sized sequence
-data, i.e. in each mini-batch, the sequence length of all inputs are same.
 )DOC");
   }
 };
diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc
index 0599daa7688a5658ebea8902c4e15e63570539fb..2589a54cfc7fc5bc11ae983797d480a134e0eb25 100644
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -80,24 +80,27 @@ class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "X",
-        "(Tensor) The input tensor. Tensors with rank at most 6 are supported");
+    AddInput("X",
+             "(Tensor) The input tensor. Tensors with rank at most 6 are "
+             "supported.");
     AddOutput("Out", "(Tensor) The result tensor.");
     AddAttr<int>(
         "dim",
-        "(int, default 1) The dimension to reduce. "
+        "(int, default 0) The dimension to reduce. "
         "Must be in the range [-rank(input), rank(input)). "
         "If `dim < 0`, the dim to reduce is `rank + dim`. "
-        "Noting that reducing on the first dim will make the LoD info lost.")
+        "Note that reducing on the first dim will make the LoD info lost.")
         .SetDefault(0);
     AddAttr<bool>("keep_dim",
                   "(bool, default false) "
                   "If true, retain the reduced dimension with length 1.")
         .SetDefault(false);
     comment_ = R"DOC(
-{ReduceOP} operator computes the {reduce} of input tensor along the given dimension. 
-The result tensor has 1 fewer dimension than the input unless `keep_dim` is true.
+{ReduceOp} Operator.
+
+This operator computes the {reduce} of input tensor along the given dimension. 
+The result tensor has 1 fewer dimension than the input unless keep_dim is true.
+
 )DOC";
     AddComment(comment_);
   }
diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc
index 9213cc7a85822e4c78ef72aec2bf86d2edac023a..ba774ec2160c0460867de42f7ad9d5cd65ad8d6a 100644
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -71,8 +71,11 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of reshape operator.");
     AddOutput("Out", "The output tensor of reshape operator.");
-    AddAttr<std::vector<int>>("shape", "Target shape of reshape operator.");
-    AddComment(R"DOC(Reshape operator
+    AddAttr<std::vector<int>>("shape",
+                              "(vector<int>) "
+                              "Target shape of reshape operator.");
+    AddComment(R"DOC(
+Reshape Operator.
 
 Reshape Input(X) into the shape specified by Attr(shape).
 
@@ -81,7 +84,7 @@ Given a 2-D tensor X with 2 rows and 2 columns
 
     [[1, 2], [3, 4]]
 
-with target shape = [1, 4], the reshape operator will transform
+and target shape = [1, 4], the reshape operator will transform
 the tensor X into a 1-D tensor:
 
     [1, 2, 3, 4]
diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc
index fd5567a365c4c843de3b8aec7fa77164f16644a4..a9c45f639c6728ff2fd6de6fcdadfe5032a705d7 100644
--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/operators/rmsprop_op.cc
@@ -68,22 +68,22 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
-             "Input parameter value that has to be updated");
+             "Input parameter value that has to be updated.");
     AddInput("MeanSquare",
              "(Tensor, default Tensor<float>)"
-             " The mean square value that gets updated");
+             " The mean square value that gets updated.");
     AddInput("LearningRate",
              "(Tensor, default Tensor<float>) "
-             "The learning rate should be a tensor of size 1");
+             "The learning rate should be a tensor of size 1.");
     AddInput("Grad",
              "(Tensor, default Tensor<float>) "
-             "Input gradient of the parameter");
+             "Input gradient of the parameter.");
     AddInput("Moment",
-             "(Tensor, default Tensor<float>) The moment that gets updated");
+             "(Tensor, default Tensor<float>) The moment that gets updated.");
 
-    AddOutput("ParamOut", "(Tensor) Output updated parameter value");
-    AddOutput("MomentOut", "(Tensor) Output updated moment");
-    AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value");
+    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
+    AddOutput("MomentOut", "(Tensor) Output updated moment.");
+    AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
 
     AddAttr<float>("epsilon",
                    "(float, default 1e-10) Constant "
@@ -93,18 +93,19 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
                    "(float, default 0.9) "
                    "Discounting factor for coming gradient.")
         .SetDefault(0.9f);
-    AddAttr<float>("momentum", "(float, default 0.0) Constant value")
+    AddAttr<float>("momentum", "(float, default 0.0) Constant value.")
         .SetDefault(0.0f);
     AddComment(R"DOC(
+Rmsprop Optimizer. 
 
-RMSprop
-
-MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad
+$$
+MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad \\
 MomentOut = momentum * Moment +
-            LearningRate * Grad / sqrt(MeanSquareOut + epsilon)
+            \frac{LearningRate * Grad}{\sqrt{MeanSquareOut + epsilon}} \\
 ParamOut = Param -  MomentOut
+$$
 
-The original slides that proposed RMSprop: Slide 29 of
+The original slides that proposed Rmsprop: Slide 29 of
 http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
 
 )DOC");
diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc
index 08fda9b44564249634f0d1a570e8b2458f88fd41..b862056ad400290a60e8a75a23dceeb1d4422ea4 100644
--- a/paddle/operators/seq_expand_op.cc
+++ b/paddle/operators/seq_expand_op.cc
@@ -53,8 +53,10 @@ class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker {
               "(LodTensor)The output of seq_expand op."
               "The lod of output will be as same as input(Y)'s lod.");
     AddComment(R"DOC(
-Expand input(X) according to LOD of input(Y).
+Seq Expand Operator.
 
+This operator expands input(X) according to LOD of input(Y).
+Following are cases to better explain how this works:
 Case 1:
 
 Given 2-level a LoDTensor input(X)
diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc
index ec4ad50dab7af0f86e586f840dbced402ee14120..64097ef2525d734f79f22ddd7957b3216b06ee7b 100644
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
@@ -68,11 +68,12 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
                  "The level should be less than the level number of inputs.")
         .SetDefault(0);
     AddComment(R"DOC(
-Sequence Concat operator
+Sequence Concat Operator.
 
 The sequence_concat operator concatenates multiple LoDTensors.
-It only supports sequence (LoD Tensor with level number is 1)
+It supports a sequence (LoD Tensor with level number is 1)
 or a nested sequence (LoD tensor with level number is 2) as its input.
+The following examples explain how the operator works:
 - Case1:
   If the axis is other than 0(here, axis is 1 and level is 1),
   each input should have the same LoD information and the LoD
@@ -98,6 +99,7 @@ or a nested sequence (LoD tensor with level number is 2) as its input.
     LoD(Out) = {{0,5,9}, {0,2,5,7,9}}; Dims(Out) = (9,3,4)
 
 NOTE: The levels of all the inputs should be the same.
+
     )DOC");
   }
 };
diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc
index a3f2ed14439572e9723c3057d212bb773b2a4e44..41cadce4c603a9c14db79e2f6b30f8664cf72a38 100644
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
@@ -105,10 +105,10 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
-        "(LoDTensor) the input(X) is a LodTensor, which support "
+        "(LoDTensor) the input(X) is a LodTensor, which supports "
         "variable-time length input sequence. The underlying tensor in "
-        "this LoDTensor is a matrix with shape (T, N), where, T is the "
-        "total time steps in this mini-batch, N is the input_hidden_size.");
+        "this LoDTensor is a matrix with shape (T, N), where T is the "
+        "total time steps in this mini-batch and N is the input_hidden_size.");
     AddInput("PaddingData",
              "(Tensor, optional) the input(PaddingData) is an optional "
              "parameter, and it is learnable. "
@@ -157,14 +157,16 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
         .GreaterThan(0);
 
     AddComment(R"DOC(
-    SequenceConvOp performs convolution operation on features of
-    contextLength time-steps of each instance.
-    The convolution operation calculates the output based on the input, filter
-    and strides, paddings parameters. The size of each dimension of the
-    parameters is checked in the infer-shape. In order to ensure the equal
-    length of sequence before and after convolution, it is necessary to fill
-    the top and bottom of each sequence according to context_length,
-    context_stride and context_start.
+Sequence Conv Operator.
+
+SequenceConvOp performs convolution operation on features of contextLength
+time-steps of each instance. The convolution operation calculates the output
+based on the input, filter, strides and paddings parameters.
+The size of each dimension of the parameters is checked during infer-shape.
+In order to ensure the equal length of sequence before and after convolution,
+it is necessary to fill the top and bottom of each sequence based on
+context_length, context_stride and context_start.
+
     )DOC");
   }
 };
diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc
index dfe8de49858bffee77249ff745f483fdb08302cc..710f280017fa5e188b187a3e91b27e2bedc65d10 100644
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -27,6 +27,11 @@ class SequencePoolOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SequencePoolOp should not be null.");
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    if (ctx->Attrs().Get<std::string>("pooltype") == "MAX") {
+      PADDLE_ENFORCE(ctx->HasOutput("MaxIndex"),
+                     "Output(MaxIndex) of SequencePoolOp should not be null.");
+      ctx->SetOutputDim("MaxIndex", ctx->GetInputDim("X"));
+    }
   }
 };
 
@@ -35,43 +40,50 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
   SequencePoolOpMaker(framework::OpProto* proto,
                       framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(LoDTensor), the variable-length input of SequencePoolOp");
+    AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
     AddOutput("Out",
-              "(Tensor), output of SequencePoolOp, which does not contain LoD "
+              "(Tensor) The output of SequencePoolOp does not contain LoD "
               "infomation.");
+    AddOutput("MaxIndex",
+              "(Tensor<int>) This tensor is used for the sequence max-pooling "
+              "to record the max indexes.")
+        .AsIntermediate();
     AddAttr<std::string>(
         "pooltype",
         "(int, default AVERAGE) the pooling pooltype of SequencePoolOp.")
         .SetDefault("AVERAGE")
         .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"});
     AddComment(R"DOC(
-    SequencePoolOp pools features of all time-steps of each instance.
-
-    It supports six pooling pooltype:
-    - AVERAGE: Out[i] = average_{for each instance in i-th sequence}{X[i]}
-    - SUM:     Out[i] = sum_{for each instance in i-th sequence}{X[i]}
-    - SQRT:    Out[i] = sum_{for each instance in i-th sequence}{X[i]} 
-                        / sqrt(i-th sequence length)
-    - LAST:    Out[i] = last instance in i-th sequence X[i]
-    - FIRST:   Out[i] = first instance in i-th sequence X[i]
-    - MAX:     Out[i] = max_{for each instance in i-th sequence}{X[i]}
-
-    For a mini-batch of 3 variable-length sentences, containing 2, 3, and 2 time-steps:
-
-    Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2.
-    Besides, for the sake of simplicity, we assume M=1 and N=1,
-    and the value of X = [[1, 3], [2, 4, 6], [5, 1]].
-
-    Thus, Out is a [3,1,1] Tensor without LoD infomation.
-    And for different pooltype, the value of Out is as follows:
-
-    - AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
-    - SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
-    - SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
+Sequence Pool Operator.
+
+The SequencePoolOp pools features of all time-steps of each instance.
+It supports six pooling types:
+1. AVERAGE: Out[i] = $$avg(X_i)$$
+2. SUM:     Out[i] = $$\sum_jX_{ij}$$
+3. SQRT:    Out[i] = $$\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$
+4. LAST:    Out[i] = last instance in i-th sequence X[i]
+5. FIRST:   Out[i] = first instance in i-th sequence X[i]
+6. MAX:     Out[i] = $$max(X_i)$$
+
+The following example explains how this works:
+For a mini-batch of 3 variable-length sentences,
+containing 2, 3, and 2 time-steps:
+
+Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2.
+Besides, for the sake of simplicity, we assume M=1 and N=1,
+and the value of X = [[1, 3], [2, 4, 6], [5, 1]].
+
+Thus, Out is a [3,1,1] Tensor without LoD infomation.
+And for different pooltype, the value of Out is as follows:
+
+- AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
+- SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
+- SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
            6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
-    - MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
-    - LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
-    - FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
+- MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+- LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+- FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
+
     )DOC");
   }
 };
@@ -93,6 +105,12 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index e0e0493fe0ef7e1963ce5c2e3f37c164a605809b..2b8a25c2414c20efaffedfc8603697b3a104634f 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence_pooling.h"
 
 namespace paddle {
 namespace operators {
@@ -34,7 +35,7 @@ class SequencePoolKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
+    auto* out = context.Output<Tensor>("Out");
     std::string pooltype = context.Attr<std::string>("pooltype");
 
     auto dims = in->dims();
@@ -53,6 +54,16 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     auto lod_level_0 = lod[0];
 
     out->mutable_data<T>(context.GetPlace());
+
+    if (pooltype == "MAX") {
+      math::MaxSeqPoolFunctor<Place, T> max_pool;
+      auto* index = context.Output<Tensor>("MaxIndex");
+      index->Resize({dims});
+      index->mutable_data<int>(context.GetPlace());
+      max_pool(context.device_context(), *in, out, index);
+      return;
+    }
+
     auto place = context.GetEigenDevice<Place>();
     for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
       Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
@@ -69,8 +80,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
       } else if (pooltype == "SQRT") {
         out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                               std::sqrt(static_cast<T>(h));
-      } else if (pooltype == "MAX") {
-        out_e.device(place) = in_e.maximum(Eigen::array<int, 1>({{0}}));
       } else if (pooltype == "LAST") {
         out_e.device(place) = in_e.chip(h - 1, 0);
       } else if (pooltype == "FIRST") {
@@ -87,8 +96,8 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
+    auto* out_g = context.Input<Tensor>(framework::GradVarName("Out"));
     auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
     std::string pooltype = context.Attr<std::string>("pooltype");
 
     auto dims = in->dims();
@@ -96,6 +105,14 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
     int64_t w = in->numel() / dims[0];
 
     in_g->mutable_data<T>(context.GetPlace());
+
+    if (pooltype == "MAX") {
+      math::MaxSeqPoolGradFunctor<Place, T> max_pool_grad;
+      auto* index = context.Input<Tensor>("MaxIndex");
+      max_pool_grad(context.device_context(), *out_g, *index, in_g);
+      return;
+    }
+
     if (pooltype == "LAST" || pooltype == "FIRST") {
       // set X@Grad be zero at first when pooltype is LAST/FIRST
       math::SetConstant<Place, T> functor;
@@ -118,20 +135,6 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
       } else if (pooltype == "SQRT") {
         in_g_e.device(place) =
             (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
-      } else if (pooltype == "MAX") {
-        auto in_t =
-            in->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
-        Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
-            in_t_map(in_t.data<T>(), h, w);
-        int row_id;
-        Eigen::array<int, 2> extents{{1, 1}};
-        for (int col_id = 0; col_id < w; col_id++) {
-          in_t_map.col(col_id).maxCoeff(&row_id);
-          Eigen::array<int, 2> in_offsets{{row_id, col_id}};
-          Eigen::array<int, 2> out_offsets{{0, col_id}};
-          in_g_e.slice(in_offsets, extents).device(place) =
-              out_g_e.slice(out_offsets, extents);
-        }
       } else if (pooltype == "LAST") {
         in_g_e.chip(h - 1, 0).device(place) = out_g_e;
       } else if (pooltype == "FIRST") {
diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc
index c891ab1fdcbb167453462c45b00b4632e663dd0e..32c15025660ebf0baf317e269a33c047e6844219 100644
--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
@@ -43,20 +43,24 @@ class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
               "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
               "of length 1.");
     AddComment(R"DOC(
-SequenceSoftmaxOp computes softmax activation among all time-steps for each
+Sequence Softmax Operator.
+
+SequenceSoftmaxOp computes the softmax activation among all time-steps for each
 sequence. The dimension of each time-step should be 1. Thus, the shape of
-input Tensor can be either [N, 1] or [N], where N is the sum of all sequences'
-lengths.
+input Tensor can be either [N, 1] or [N], where N is the sum of the length
+of all sequences.
 
-Equation:
+The algorithm works as follows:
     for i-th sequence in a mini-batch:
-        Out(X[lod[i]:lod[i+1]], :) =
-            exp(X[lod[i]:lod[i+1], :]) / sum(exp(X[lod[i]:lod[i+1], :]))
+        $$Out(X[lod[i]:lod[i+1]], :) =
+            \frac{\exp(X[lod[i]:lod[i+1], :])}
+            {\sum(\exp(X[lod[i]:lod[i+1], :]))}$$
 
 For example, for a mini-batch of 3 sequences with variable-length,
 each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
 then softmax will be computed among X[0:2, :], X[2:5, :], X[5:7, :]
 and N turns out to be 7.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
index e781c8db208464cb94d94d1914e50f5aba3db2c6..d9e40546523c60b0a7eec2e0593446258996ba58 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -107,26 +107,28 @@ class SigmoidCrossEntropyWithLogitsOpMaker
     AddComment(R"DOC(
 SigmoidCrossEntropyWithLogits Operator.
 
-This measures the elementwise probability error in discrete classification tasks
+This measures the element-wise probability error in classification tasks
 in which each class is independent. This can be thought of as predicting labels
-for a data-point that are not mutually exclusive. For example, a news article
-can be about politics, technology or sports at the same time or none of these.
+for a data-point, where labels are not mutually exclusive.
+For example, a news article can be about politics, technology or sports
+at the same time or none of these.
 
 The logistic loss is given as follows:
 
-       loss = -Labels * log(sigmoid(X)) - (1 - Labels) * log(1 - sigmoid(X))
+       $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$
 
-We know that sigmoid(X) = (1 / (1 + exp(-X))). By substituting this we get
+We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get:
 
-       loss = X - X * Labels + log(1 + exp(-X))
+       $$loss = X - X * Labels + \log(1 + \exp(-X))$$
 
-For stability and to prevent overflow of exp(-X) when X < 0,
-we can reformulate the loss as follows:
+For stability and to prevent overflow of $$\exp(-X)$$ when X < 0,
+we reformulate the loss as follows:
 
-       loss = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
+       $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$
 
 Both the input `X` and `Labels` can carry the LoD (Level of Details) information.
 However the output only shares the LoD with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc
index 758481943d463f22eb6c6e0be9a99ad99161da5b..ebf7b43700a7498aa18b5f648b0b8c2c4e7b442b 100644
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -77,14 +77,17 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
                       "A float scalar with default value 3.0.")
         .SetDefault(3.0);
     AddComment(R"DOC(
-Compute smooth l1 loss for input and target. The operator take the 1st
-dimension of input as batch size. For each instance, it will compute
-smooth l1 loss element by element first and sum all losses to one value.
-So the output shape is [batch_size, 1].
+Smooth L1 Loss Operator.
+
+This operator computes the smooth l1 loss for input and target.
+The operator takes the first dimension of input as the batch size.
+For each instance, it computes the smooth l1 loss element by element first
+and then sums all the losses. So the resulting output shape
+is [batch_size, 1].
 
 The equation is:
-loss = 0.5 * (sigma * (x-y))^2    if abs(x - y) < 1 / sigma^2
-       abs(x - y) - 0.5 / sigma^2 otherwise
+loss = $$0.5 * (\sigma * (x-y))^2$$   if $$|x - y| < 1 /({\sigma}^2)$$
+       $$\frac{|x - y| - 0.5}{{\sigma}^2}$$ otherwise
 
 )DOC");
   }
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 00fd0b32a9b3c0dd9fedf7b7621b1f15e5c4ce93..93f89e33a73c5f4c6c0e5a8793a0abe7c692b656 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -44,20 +44,23 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
              "2-D with shape [batch_size, input_feature_dimensions].");
     AddOutput("Y", "The normalized values with the same shape as X.");
     AddComment(R"DOC(
-The input of softmax operator is a 2-D tensor with shape N x K (N is the
+Softmax Operator.
+
+The input of the softmax operator is a 2-D tensor with shape N x K (N is the
 batch_size, K is the dimension of input feature). The output tensor has the
 same shape as the input tensor.
 
 For each row of the input tensor, the softmax operator squashes the
 K-dimensional vector of arbitrary real values to a K-dimensional vector of real
-values in the range [0, 1] that add up to 1. Specifically, it computes the
-exponential of the given dimension and the sum of exponential values of all
-the other dimensions in the K-dimensional vector input. Then the ratio of the
-exponential of the given dimension and the sum of exponential values of all
-the other dimensions is the output of the softmax operator.
+values in the range [0, 1] that add up to 1.
+It computes the exponential of the given dimension and the sum of exponential
+values of all the other dimensions in the K-dimensional vector input.
+Then the ratio of the exponential of the given dimension and the sum of
+exponential values of all the other dimensions is the output of the softmax
+operator.
 
 For each row `i` and each column `j` in input X, we have:
-    Y[i, j] = exp(X[i, j]) / sum_j(exp(X[i, j]))
+    $$Y[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$
 
 )DOC");
   }
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc
index 50497da1b70d39d2638240dd91035c9181124af9..c6b94f5cc947ccb86315fd9058b8c57d1a996927 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -4,13 +4,13 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/softmax_with_cross_entropy_op.h"
 #include <paddle/function/TensorType.h>
@@ -30,12 +30,10 @@ class SoftmaxWithCrossEntropyOpMaker
              "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
              "and K is the class number.");
     AddInput("Label",
-             "(Tensor, default: Tensor<int>), The ground truth which is a 2-D "
-             "tensor. "
-             "If softLabel is set to false, Label is a Tensor<int> with shape "
-             "[N x 1]."
-             "If softLabel is set to true, Label is a Tensor<float/double> "
-             "with shape [N x K].");
+             "(Tensor) The ground truth which is a 2-D tensor. If soft_label "
+             "is set to false, Label is a Tensor<int64> with shape [N x 1]. If "
+             "soft_label is set to true, Label is a Tensor<float/double> with "
+             "shape [N x K].");
     AddOutput(
         "Softmax",
         "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N x K]. "
@@ -51,32 +49,34 @@ class SoftmaxWithCrossEntropyOpMaker
         "the given labels as soft labels.")
         .SetDefault(false);
     AddComment(R"DOC(
-Cross entropy loss with softmax are used as the output layer extensively. This
+Softmax With Cross Entropy Operator.
+
+Cross entropy loss with softmax is used as the output layer extensively. This
 operator computes the softmax normalized values for each row of the input
-tensor, after which cross-entropy loss is then computed. This provides a more
+tensor, after which cross-entropy loss is computed. This provides a more
 numerically stable gradient.
 
-Because this operators performs a softmax on logits internally, it expects
-unscaled logits. Please do not call this op with the output of softmax operator,
-which will produce incorrect results.
+Because this operator performs a softmax on logits internally, it expects
+unscaled logits. This operator should not be used with the output of
+softmax operator since that would produce incorrect results.
 
-When the attribute softLabel is set false, this operators expects mutually
-exclusive hard labels, each sample in a batch is in exactly one class with
-probabilities 1. Each sample in the batch with one and only one label.
+When the attribute soft_label is set false, this operators expects mutually
+exclusive hard labels, each sample in a batch is in exactly one class with a
+probability of 1.0. Each sample in the batch will have a single label.
 
-Equation:
+The equation is as follows:
 
-1) hard label (one-hot label)
+1) Hard label (one-hot label, so every sample has exactly one class)
 
-Loss_j = \f$ -\text{Logit}_{Label_j} +
+$$Loss_j = \f$ -\text{Logit}_{Label_j} +
 \log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right),
-j = 1, ..., K $\f
+j = 1, ..., K $\f$$
 
-2) soft label (a distribution over all classes)
+2) Soft label (each sample can have a distribution over all classes)
 
-Loss_j = \f$ -\sum_{i=0}^{K}\text{Label}_i\left(\text{Logit}_i -
+$$Loss_j = \f$ -\sum_{i=0}^{K}\text{Label}_i\left(\text{Logit}_i -
 \log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right),
-j = 1,...,K $\f
+j = 1,...,K $\f$$
 
 )DOC");
   }
@@ -196,6 +196,8 @@ REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
 REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
                   ops::SoftmaxWithCrossEntropyOpGrad);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
-                       ops::SoftmaxWithCrossEntropyKernel<float>);
+                       ops::SoftmaxWithCrossEntropyKernel<float>,
+                       ops::SoftmaxWithCrossEntropyKernel<double>);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad,
-                       ops::SoftmaxWithCrossEntropyGradKernel<float>);
+                       ops::SoftmaxWithCrossEntropyGradKernel<float>,
+                       ops::SoftmaxWithCrossEntropyGradKernel<double>);
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu
index 7602918bb39312db3c4d1a4064801712ef94ec72..b1faddac3fd21aaf817caf9d3e57e664f4e0e2d5 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
@@ -4,13 +4,13 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 
@@ -24,7 +24,7 @@ using Tensor = framework::Tensor;
 namespace {
 template <typename T>
 __global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad,
-                                 const int* labels, const int batch_size,
+                                 const int64_t* labels, const int batch_size,
                                  const int class_num) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int sample_idx = tid / class_num;
@@ -50,7 +50,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
   int ids = blockIdx.x * blockDim.x + threadIdx.x;
   if (ids < batch_size * class_num) {
     int row_ids = ids / class_num;
-    logit_grad[ids] = logit_grad[ids] * (loss_grad[row_ids] - labels[ids]);
+    logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]);
   }
 }
 }  // namespace
@@ -104,7 +104,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
                               .stream()>>>(logit_grad_data, loss_grad_data,
                                            label_data, batch_size, class_num);
     } else {
-      const int* label_data = labels->data<int>();
+      const int64_t* label_data = labels->data<int64_t>();
       CrossEntropyGrad<T><<<
           grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
                               context.device_context())
@@ -119,6 +119,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy,
-                       ops::SoftmaxWithCrossEntropyCUDAKernel<float>);
+                       ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
+                       ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
 REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad,
-                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>);
+                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
+                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h
index 7f3f9e23aa9455437cfa893363b3e59a0699dbea..c4ab3f74b4b07d13957d99e01aa4868fac719f61 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
@@ -4,13 +4,13 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
@@ -60,25 +60,25 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
     logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
 
     const int class_num = logit_grad->dims()[1];
+    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
+    auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
+
     if (context.Attr<bool>("soft_label")) {
-      auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
-      auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
       auto lbl_mat = EigenMatrix<T>::From(*labels);
-
       logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
-          logit_grad_mat *
-          (out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) -
-           lbl_mat);
+          out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) *
+          (logit_grad_mat - lbl_mat);
     } else {
+      logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
+          logit_grad_mat *
+          out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num));
+
       const int batch_size = logit_grad->dims()[0];
-      const int* label_data = labels->data<int>();
-      const T* out_grad_data = out_grad->data<T>();
+      const int64_t* label_data = labels->data<int64_t>();
       T* logit_grad_data = logit_grad->data<T>();
-
+      const T* out_grad_data = out_grad->data<T>();
       for (int i = 0; i < batch_size; ++i) {
-        int index = i * class_num + label_data[i];
-        logit_grad_data[index] =
-            out_grad_data[i] * (logit_grad_data[index] - 1.);
+        logit_grad_data[i * class_num + label_data[i]] -= out_grad_data[i];
       }
     }
   }
diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc
index d785e57c830439ad80005d9a3d4bb77faf1ae1b9..94de3d5069017a7ca818e246ad574c4db92d8006 100644
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
@@ -32,7 +32,7 @@ class TransposeOp : public framework::OperatorWithKernel {
     size_t axis_size = axis.size();
 
     PADDLE_ENFORCE_EQ(x_rank, axis_size,
-                      "the input tensor's rank(%d) "
+                      "The input tensor's rank(%d) "
                       "should be equal to the axis's size(%d)",
                       x_rank, axis_size);
 
@@ -64,12 +64,14 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "(Tensor)The output tensor");
     AddAttr<std::vector<int>>(
         "axis",
-        "(vector<int>)a list of values, and the size of the list should be "
+        "(vector<int>)A list of values, and the size of the list should be "
         "the same with the input tensor rank, the tensor will "
         "permute the axes according the the values given");
     AddComment(R"DOC(
-The Tensor will be permuted according to the axis values given.
-The op is very much like the numpy.transpose function in python
+Transpose Operator.
+
+The input tensor will be permuted according to the axis values given.
+The op functions similar to how numpy.transpose works in python.
 For example:
  >> input = numpy.arange(6).reshape((2,3))
  >> input
@@ -83,6 +85,7 @@ For example:
 		[2, 5]])
 So, given a input tensor of shape(N, C, H, W) and the axis is {0, 2, 3, 1},
 the output tensor shape will be (N, H, W, C)
+
 )DOC");
   }
 };
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index d3fc544ec712a753304de8568534c78c154774a5..5462e6c6c7a840f75f8c15195c6d6910f30ef733 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -239,7 +239,8 @@ void BindVarDsec(py::module &m) {
       .value("FEED_MINIBATCH", VarDesc::FEED_MINIBATCH)
       .value("FETCH_LIST", VarDesc::FETCH_LIST)
       .value("STEP_SCOPES", VarDesc::STEP_SCOPES)
-      .value("LOD_RANK_TABLE", VarDesc::LOD_RANK_TABLE);
+      .value("LOD_RANK_TABLE", VarDesc::LOD_RANK_TABLE)
+      .value("LOD_TENSOR_ARRAY", VarDesc::LOD_TENSOR_ARRAY);
 }
 
 void BindOpDesc(py::module &m) {
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 78dc7943b3644e7014da427e6aca640d94746f68..0c528174b2b2b3a27869ed0083fc96b8d90e723b 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/prune.h"
 #include "paddle/framework/selected_rows.h"
 #include "paddle/framework/tensor_array.h"
@@ -233,6 +234,9 @@ All parameter, weight, gradient are variables in Paddle.
              return self.GetMutable<SelectedRows>();
            },
            py::return_value_policy::reference)
+      .def("get_lod_tensor_array",
+           [](Variable &self) { return self.GetMutable<LoDTensorArray>(); },
+           py::return_value_policy::reference)
 #ifdef PADDLE_WITH_CUDA
       .def("get_communicator",
            [](Variable &self) -> platform::Communicator * {
@@ -505,6 +509,23 @@ All parameter, weight, gradient are variables in Paddle.
         return res;
       });
 
+  py::class_<LoDTensorArray>(m, "LoDTensorArray")
+      .def("__getitem__",
+           [](LoDTensorArray &self, size_t i) { return &self.at(i); },
+           py::return_value_policy::reference)
+      .def("__len__", [](LoDTensorArray &self) { return self.size(); })
+      .def("__setitem__",
+           [](LoDTensorArray &self, size_t i, const LoDTensor &t) {
+             PADDLE_ENFORCE_LT(i, self.size());
+             self[i].ShareDataWith(t);
+             self[i].set_lod(t.lod());
+           })
+      .def("append", [](LoDTensorArray &self, const LoDTensor &t) {
+        self.emplace_back();
+        self.back().ShareDataWith(t);
+        self.back().set_lod(t.lod());
+      });
+
   m.def("op_support_gpu", OpSupportGPU);
 #ifdef PADDLE_WITH_CUDA
   m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 5bdf8c833522564e6b1027ca5dad8c0bb481cdc4..73da7dfa6f997be5d34bbe9fb99254aa6e6b28f2 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -2,171 +2,184 @@
 
 set -xe
 
-# Set BASE_IMAGE according to env variables
-if [[ ${WITH_GPU} == "ON" ]]; then
-  BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
-else
-  BASE_IMAGE="ubuntu:16.04"
-fi
-
-DOCKERFILE_GPU_ENV=""
-DOCKERFILE_CUDNN_DSO=""
-if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-    DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
-    DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
-fi
-
-mkdir -p /paddle/build
-cd /paddle/build
-
-# build script will not fail if *.deb does not exist
-rm *.deb 2>/dev/null || true
-# delete previous built whl packages
-rm -rf /paddle/paddle/dist 2>/dev/null || true
-
-cat <<EOF
-========================================
-Configuring cmake in /paddle/build ...
-      -DCMAKE_BUILD_TYPE=Release
-      -DWITH_DOC=OFF
-      -DWITH_GPU=${WITH_GPU:-OFF}
-      -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
-      -DWITH_MKLML=${WITH_MKLML:-ON}
-      -DWITH_AVX=${WITH_AVX:-OFF}
-      -DWITH_GOLANG=${WITH_GOLANG:-ON}
-      -DWITH_SWIG_PY=ON
-      -DWITH_C_API=${WITH_C_API:-OFF}
-      -DWITH_PYTHON=${WITH_PYTHON:-ON}
-      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-      -DCUDNN_ROOT=/usr/
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-      -DWITH_TESTING=${WITH_TESTING:-ON}
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-========================================
-EOF
 
-# Disable UNITTEST_USE_VIRTUALENV in docker because
-# docker environment is fully controlled by this script.
-# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
-cmake .. \
-      -DCMAKE_BUILD_TYPE=Release \
-      -DWITH_DOC=OFF \
-      -DWITH_GPU=${WITH_GPU:-OFF} \
-      -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
-      -DWITH_MKLML=${WITH_MKLML:-ON} \
-      -DWITH_AVX=${WITH_AVX:-OFF} \
-      -DWITH_GOLANG=${WITH_GOLANG:-ON} \
-      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-      -DWITH_C_API=${WITH_C_API:-OFF} \
-      -DWITH_PYTHON=${WITH_PYTHON:-ON} \
-      -DCUDNN_ROOT=/usr/ \
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-      -DWITH_TESTING=${WITH_TESTING:-ON} \
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-
-cat <<EOF
-============================================
-Building in /paddle/build ...
-============================================
-EOF
-make -j `nproc`
+function cmake_gen() {
+    # Set BASE_IMAGE according to env variables
+    if [[ ${WITH_GPU} == "ON" ]]; then
+    BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
+    else
+    BASE_IMAGE="ubuntu:16.04"
+    fi
+
+    DOCKERFILE_GPU_ENV=""
+    DOCKERFILE_CUDNN_DSO=""
+    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
+        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
+        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
+    fi
+
+    mkdir -p /paddle/build
+    cd /paddle/build
+
+    # build script will not fail if *.deb does not exist
+    rm *.deb 2>/dev/null || true
+    # delete previous built whl packages
+    rm -rf /paddle/paddle/dist 2>/dev/null || true
 
-if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
-cat <<EOF
-========================================
-Running unit tests ...
-========================================
+    cat <<EOF
+    ========================================
+    Configuring cmake in /paddle/build ...
+        -DCMAKE_BUILD_TYPE=Release
+        -DWITH_DOC=OFF
+        -DWITH_GPU=${WITH_GPU:-OFF}
+        -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
+        -DWITH_MKLML=${WITH_MKLML:-ON}
+        -DWITH_AVX=${WITH_AVX:-OFF}
+        -DWITH_GOLANG=${WITH_GOLANG:-ON}
+        -DWITH_SWIG_PY=ON
+        -DWITH_C_API=${WITH_C_API:-OFF}
+        -DWITH_PYTHON=${WITH_PYTHON:-ON}
+        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
+        -DCUDNN_ROOT=/usr/
+        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
+        -DWITH_TESTING=${WITH_TESTING:-ON}
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+    ========================================
 EOF
-    ctest --output-on-failure
-    # make install should also be test when unittest
-    make install -j `nproc`
-    pip install /usr/local/opt/paddle/share/wheels/*.whl
-    paddle version
-fi
-
 
-if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
+    # Disable UNITTEST_USE_VIRTUALENV in docker because
+    # docker environment is fully controlled by this script.
+    # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
+    cmake .. \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DWITH_DOC=OFF \
+        -DWITH_GPU=${WITH_GPU:-OFF} \
+        -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
+        -DWITH_MKLML=${WITH_MKLML:-ON} \
+        -DWITH_AVX=${WITH_AVX:-OFF} \
+        -DWITH_GOLANG=${WITH_GOLANG:-ON} \
+        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
+        -DWITH_C_API=${WITH_C_API:-OFF} \
+        -DWITH_PYTHON=${WITH_PYTHON:-ON} \
+        -DCUDNN_ROOT=/usr/ \
+        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
+        -DWITH_TESTING=${WITH_TESTING:-ON} \
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+}
+
+function run_build() {
     cat <<EOF
-========================================
-Building documentation ...
-   In /paddle/build_doc
-========================================
+    ============================================
+    Building in /paddle/build ...
+    ============================================
 EOF
-    mkdir -p /paddle/build_doc
-    pushd /paddle/build_doc
-    cmake .. \
-          -DWITH_DOC=ON \
-          -DWITH_GPU=OFF \
-          -DWITH_AVX=${WITH_AVX:-ON} \
-          -DWITH_SWIG_PY=ON \
-          -DWITH_STYLE_CHECK=OFF
-    make -j `nproc` gen_proto_py
-    make -j `nproc` paddle_docs paddle_docs_cn
-    popd
-fi
-
-
-if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
+    make -j `nproc`
+}
+
+function run_test() {
+    if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
     cat <<EOF
-========================================
-Converting C++ source code into HTML ...
-========================================
+    ========================================
+    Running unit tests ...
+    ========================================
 EOF
-    export WOBOQ_OUT=/paddle/build/woboq_out
-    mkdir -p $WOBOQ_OUT
-    cp -rv /woboq/data $WOBOQ_OUT/../data
-    /woboq/generator/codebrowser_generator \
-        -b /paddle/build \
-        -a \
-        -o $WOBOQ_OUT \
-        -p paddle:/paddle
-    /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-fi
-
-cat <<EOF
-========================================
-Generate /paddle/build/Dockerfile ...
-========================================
+        ctest --output-on-failure
+        # make install should also be test when unittest
+        make install -j `nproc`
+        pip install /usr/local/opt/paddle/share/wheels/*.whl
+        paddle version
+    fi
+}
+
+
+function gen_docs() {
+    if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
+        cat <<EOF
+    ========================================
+    Building documentation ...
+    In /paddle/build_doc
+    ========================================
 EOF
+        mkdir -p /paddle/build_doc
+        pushd /paddle/build_doc
+        cmake .. \
+            -DWITH_DOC=ON \
+            -DWITH_GPU=OFF \
+            -DWITH_AVX=${WITH_AVX:-ON} \
+            -DWITH_SWIG_PY=ON \
+            -DWITH_STYLE_CHECK=OFF
+        make -j `nproc` gen_proto_py
+        make -j `nproc` paddle_docs paddle_docs_cn
+        popd
+    fi
+
+
+    if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
+        cat <<EOF
+    ========================================
+    Converting C++ source code into HTML ...
+    ========================================
+EOF
+        export WOBOQ_OUT=/paddle/build/woboq_out
+        mkdir -p $WOBOQ_OUT
+        cp -rv /woboq/data $WOBOQ_OUT/../data
+        /woboq/generator/codebrowser_generator \
+            -b /paddle/build \
+            -a \
+            -o $WOBOQ_OUT \
+            -p paddle:/paddle
+        /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
+    fi
+}
+
 
-cat > /paddle/build/Dockerfile <<EOF
-FROM ${BASE_IMAGE}
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-ENV HOME /root
+function gen_dockerfile() {
+
+    cat <<EOF
+    ========================================
+    Generate /paddle/build/Dockerfile ...
+    ========================================
 EOF
 
-if [[ -n ${APT_MIRROR} ]]; then
-cat >> /paddle/build/Dockerfile <<EOF
-RUN sed -i '${APT_MIRROR}' /etc/apt/sources.list
+    cat > /paddle/build/Dockerfile <<EOF
+    FROM ${BASE_IMAGE}
+    MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+    ENV HOME /root
 EOF
-fi
-
-if [[ ${WITH_GPU} == "ON"  ]]; then
-  NCCL_DEPS="apt-get install -y libnccl-dev &&"
-else
-  NCCL_DEPS="" 
-fi
-
-cat >> /paddle/build/Dockerfile <<EOF
-ADD python/dist/*.whl /
-# run paddle version to install python packages first
-RUN apt-get update &&\
-    ${NCCL_DEPS}\
-    apt-get install -y wget python-pip && pip install -U pip && \
-    pip install /*.whl; apt-get install -f -y && \
-    apt-get clean -y && \
-    rm -f /*.whl && \
-    paddle version && \
-    ldconfig
-${DOCKERFILE_CUDNN_DSO}
-${DOCKERFILE_GPU_ENV}
-ADD go/cmd/pserver/pserver /usr/bin/
-ADD go/cmd/master/master /usr/bin/
-ADD paddle/pybind/print_operators_doc /usr/bin/
-# default command shows the paddle version and exit
-CMD ["paddle", "version"]
+
+    if [[ ${WITH_GPU} == "ON"  ]]; then
+        NCCL_DEPS="apt-get install -y libnccl-dev &&"
+    else
+        NCCL_DEPS="" 
+    fi
+
+    cat >> /paddle/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
+    # run paddle version to install python packages first
+    RUN apt-get update &&\
+        ${NCCL_DEPS}\
+        apt-get install -y wget python-pip && pip install -U pip && \
+        pip install /*.whl; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f /*.whl && \
+        paddle version && \
+        ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_GPU_ENV}
+    ADD go/cmd/pserver/pserver /usr/bin/
+    ADD go/cmd/master/master /usr/bin/
+    # default command shows the paddle version and exit
+    CMD ["paddle", "version"]
 EOF
+}
 
 set +xe
+
+cmake_gen
+run_build
+run_test
+gen_docs
+gen_dockerfile
+
 printf "If you need to install PaddlePaddle in develop docker image,"
 printf "please make install or pip install build/python/dist/*.whl.\n"
diff --git a/paddle/utils/Excepts.h b/paddle/utils/Excepts.h
index 0add66da7464293795927431daf0e90359f40b52..5c2c504f53a586f2991ccfae891991465fdb39b6 100644
--- a/paddle/utils/Excepts.h
+++ b/paddle/utils/Excepts.h
@@ -17,8 +17,7 @@ limitations under the License. */
 
 #include <fenv.h>
 
-#if (defined(__APPLE__) || defined(__OSX__)) && !defined(__arm__) && \
-    !defined(__aarch64__)
+#if defined(__APPLE__) || defined(__OSX__)
 
 int fegetexcept(void);
 int feenableexcept(unsigned int excepts);
diff --git a/paddle/utils/arch/osx/Excepts.cpp b/paddle/utils/arch/osx/Excepts.cpp
index 42ecaa06d256c9d259a20c648626605d77ce0308..ac444615786fa9f89f96504a31b2289eae7bb643 100644
--- a/paddle/utils/arch/osx/Excepts.cpp
+++ b/paddle/utils/arch/osx/Excepts.cpp
@@ -14,9 +14,13 @@ limitations under the License. */
 
 #include "paddle/utils/Excepts.h"
 
-#if (defined(__APPLE__) || defined(__OSX__)) && !defined(__arm__) && \
-    !defined(__aarch64__)
-
+#if defined(__APPLE__) || defined(__OSX__)
+#if defined(__arm__) || defined(__arm64__)
+// TODO(liuyiqun): implement the arm version
+int fegetexcept(void) { return -1; }
+int feenableexcept(unsigned int excepts) { return -1; }
+int fedisableexcept(unsigned int excepts) { return -1; }
+#else
 int fegetexcept(void) {
   static fenv_t fenv;
   return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
@@ -49,5 +53,5 @@ int fedisableexcept(unsigned int excepts) {
 
   return (fesetenv(&fenv) ? -1 : old_excepts);
 }
-
+#endif
 #endif
diff --git a/paddle/utils/tests/test_StringUtils.cpp b/paddle/utils/tests/test_StringUtils.cpp
index fdc914d1bcc3c74e0f05ef475069abc315bdc306..248f58a7f26e26e82b55110930964cee04fb558b 100644
--- a/paddle/utils/tests/test_StringUtils.cpp
+++ b/paddle/utils/tests/test_StringUtils.cpp
@@ -18,6 +18,6 @@ limitations under the License. */
 
 TEST(StringUtil, to) {
   ASSERT_NEAR(paddle::str::to<double>("12.45"), 12.45, 1e-5);
-  ASSERT_DEATH(paddle::str::to<double>("12.45x23"), ".*");
-  ASSERT_DEATH(paddle::str::to<int>(""), ".*");
+  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<double>("12.45x23"), ".*");
+  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<int>(""), ".*");
 }
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 7bd6d59b0096c23bb791b9b50702130057628879..32578ad7799c0a276972ccef7770c2eae8438069 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -44,6 +44,7 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/pad
 
 
 add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+    COMMAND touch stub.cc
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index e88e962cff5bbfcb8be1014dbaab85568d2625ff..0e65598485d8785b3f5b2f1bc7e87f377b35792e 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2775,9 +2775,15 @@ class NCELayer(LayerBase):
 
 @config_layer('addto')
 class AddToLayer(LayerBase):
+    layer_type = 'addto'
+
     def __init__(self, name, inputs, bias=True, **xargs):
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        if self.layer_type == "mkldnn_addto":
+            config_assert(use_mkldnn, "mkldnn_addto only support MKLDNN")
+        self.layer_type = 'mkldnn_addto' if use_mkldnn else 'addto'
         super(AddToLayer, self).__init__(
-            name, 'addto', 0, inputs=inputs, **xargs)
+            name, self.layer_type, 0, inputs=inputs, **xargs)
         config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer')
 
         if len(self.inputs) > 1:
@@ -2796,6 +2802,11 @@ class AddToLayer(LayerBase):
         self.create_bias_parameter(bias, self.config.size)
 
 
+@config_layer('mkldnn_addto')
+class MKLDNNAddtoLayer(AddToLayer):
+    layer_type = 'mkldnn_addto'
+
+
 @config_layer('agent')
 class AgentLayer(LayerBase):
     def __init__(self, name, size, device=None):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 6e8ac8838bf3012969bf4f597cdf4430693fe313..169e201046a0d7b8c3e85f60946d8c1c762c88f4 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -143,6 +143,7 @@ __all__ = [
     'scale_shift_layer',
     'img_conv3d_layer',
     'resize_layer',
+    'sub_seq_layer',
 ]
 
 
@@ -252,6 +253,7 @@ class LayerType(object):
     SCALE_SHIFT_LAYER = 'scale_shift'
 
     RESIZE = 'resize'
+    SUB_SEQ_LAYER = 'subseq'
 
     @staticmethod
     def is_layer_type(type_name):
@@ -6980,3 +6982,58 @@ def resize_layer(input, size, name=None):
     """
     Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
     return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size)
+
+
+@wrap_act_default(act=LinearActivation())
+@wrap_name_default('sub_seq')
+def sub_seq_layer(input, offsets, sizes, act=None, bias_attr=None, name=None):
+    """
+    sub_seq_layer will return sub-sequences from the input sequences. For each
+    sequence in the input sequence layer, sub_seq_layer will slice it by given
+    offset and size. Please notice that, number of offset value and size value
+    both are equal to the number of sequence in the input layer.
+
+    .. code-block:: python
+
+        sub_seq = sub_seq_layer(input=input_seq, offsets=offsets, sizes=sizes)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer, which should be sequence.
+    :type input: LayerOutput
+    :param offsets: offset indices to slice the input sequence, which should be
+                    sequence type.
+    :type offsets: LayerOutput
+    :param sizes: sizes of the sub-sequences, which should be sequence type.
+    :type sizes: LayerOutput
+    :param act: Layer activation, default is LinearActivation
+    :type act: BaseActivation.
+    :param bias_attr: The Bias Attribute. If the parameter is set to
+                      False or something not type of ParameterAttribute,
+                      no bias is defined. If the parameter is set to
+                      True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of sub_seq_layer layer must be a PaddlePaddle layer.')
+    assert isinstance(offsets, LayerOutput), (
+        'The offset indices for sub_seq_layer, '
+        'must be a PaddlePaddle layer.')
+    assert isinstance(sizes, LayerOutput), (
+        'The sizes of sub-sequences, must be a PaddlePaddle layer.')
+
+    Layer(
+        name=name,
+        type=LayerType.SUB_SEQ_LAYER,
+        inputs=[input.name, offsets.name, sizes.name],
+        active_type=act.name,
+        bias=ParamAttr.to_bias(bias_attr))
+
+    return LayerOutput(
+        name,
+        LayerType.SUB_SEQ_LAYER,
+        parents=[input, offsets, sizes],
+        size=input.size)
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index c3495ee110bfaf91a47637a52e88b3bb56dce7a9..c3cd4cf8c32e20f3ef86305489fc415397dec1b8 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -116,7 +116,7 @@ class AdamOptimizer(BaseSGDOptimizer):
 
         m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
         v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
 
     :param beta1: the :math:`\\beta_1` in equation.
     :type beta1: float
diff --git a/python/paddle/utils/merge_model.py b/python/paddle/utils/merge_model.py
index 48e5087cc281bd3a3d0b4a403372456ebbf39c62..421e953d2775f145800cf7179ec644697a265060 100644
--- a/python/paddle/utils/merge_model.py
+++ b/python/paddle/utils/merge_model.py
@@ -23,32 +23,32 @@ from paddle.v2.topology import Topology
 
 
 def merge_v2_model(net, param_file, output_file):
-    '''Integrate the model config and model parameters into one file.
-    
+    '''Merge the model config and parameters into one file.
+
     The model configuration file describes the model structure which
     ends with .py. The parameters file stores the parameters of the model
     which ends with .tar.gz.
-    
-    @param  net            The output layer of the network.
-    @param  param_file     Path of the model parameters(.tar.gz) which is stored by v2 api.
+
+    @param  net            The output layer of the network for inference.
+    @param  param_file     Path of the parameters (.tar.gz) which is stored by v2 api.
     @param  output_file    Path of the merged file which will be generated.
-    
+
     Usage:
 
-        from paddle.util.merge_model import merge_v2_model
+        from paddle.utils.merge_model import merge_v2_model
         # import your network configuration
-        from mobilenet import mobile_net
-        
-        net = mobile_net(3*224*224, 102)
+        from example_net import net_conf
+
+        net = net_conf(is_predict=True)
         param_file = './param_pass_00000.tar.gz'
         output_file = './output.paddle'
-        
+
         merge_v2_model(net, param_file, output_file)
 
     '''
 
     assert isinstance(net, LayerOutput), \
-            "The net should be the output of the network"
+            "The net should be the output of the network for inference"
     assert os.path.exists(param_file), \
             "The model parameters file %s does not exists " % (param_file)
 
diff --git a/python/paddle/v2/framework/backward.py b/python/paddle/v2/framework/backward.py
index 6827792cb351243f926aeca5f37324dc987d6a79..678efd5d20585355a684bb2df16fdb57a69e0eeb 100644
--- a/python/paddle/v2/framework/backward.py
+++ b/python/paddle/v2/framework/backward.py
@@ -19,8 +19,20 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
     :rtype: list[Variable]
     """
     assert isinstance(loss, framework.Variable)
-    param_grad_map = loss.block.program.append_backward(loss, no_grad_set or
-                                                        set())
+
+    if no_grad_set is None:
+        program = loss.block.program
+        assert isinstance(program, framework.Program)
+        no_grad_set = list()
+        for block in program.blocks:
+            assert isinstance(block, framework.Block)
+            for var in block.vars.itervalues():
+                assert isinstance(var, framework.Variable)
+                if var.stop_gradient:
+                    no_grad_set.append(var.name)
+        no_grad_set = set(no_grad_set)
+
+    param_grad_map = loss.block.program.append_backward(loss, no_grad_set)
     if parameter_list is not None:
         parameters = parameter_list
     else:
diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/framework/framework.py
index 4e737549c929c6d5b2f2784f4fe9bcfeedae66e2..dd23c47961b6b27ffca32dc7fe496d715411c858 100644
--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/framework/framework.py
@@ -21,6 +21,7 @@ class Variable(object):
                  dtype=None,
                  lod_level=None,
                  persistable=None,
+                 stop_gradient=False,
                  **kwargs):
         self.block = block
 
@@ -89,6 +90,7 @@ class Variable(object):
 
         self.block.vars[name] = self
         self.op = None
+        self.stop_gradient = stop_gradient
 
     def __str__(self):
         protostr = self.desc.serialize_to_string()
@@ -550,5 +552,5 @@ class Parameter(Variable):
 
 
 # program is a global instance.
-g_program = Program()
-g_init_program = Program()
+g_main_program = Program()
+g_startup_program = Program()
diff --git a/python/paddle/v2/framework/io.py b/python/paddle/v2/framework/io.py
index f3ba719bde086f696a27b806228a8c97466a681e..5c247904a330e25b1a9f53db431947840db3f615 100644
--- a/python/paddle/v2/framework/io.py
+++ b/python/paddle/v2/framework/io.py
@@ -1,7 +1,7 @@
 import os
 import cPickle as pickle
 
-from paddle.v2.framework.framework import Program, Parameter, g_program, \
+from paddle.v2.framework.framework import Program, Parameter, g_main_program, \
     Variable
 
 __all__ = [
@@ -29,13 +29,13 @@ def _clone_var_in_block_(block, var):
         persistable=True)
 
 
-def save_vars(executor, dirname, program=None, vars=None, predicate=None):
+def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     """
     Save variables to directory by executor.
 
     :param executor: executor that save variable
     :param dirname: directory path
-    :param program: program. If vars is None, then filter all variables in this 
+    :param main_program: program. If vars is None, then filter all variables in this 
     program which fit `predicate`. Default g_program.
     :param predicate: The Predicate describes a callable that returns a variable
     as a bool. If it returns true, the variables will be saved.
@@ -44,15 +44,15 @@ def save_vars(executor, dirname, program=None, vars=None, predicate=None):
     :return: None
     """
     if vars is None:
-        if program is None:
-            program = g_program
-        if not isinstance(program, Program):
+        if main_program is None:
+            main_program = g_main_program
+        if not isinstance(main_program, Program):
             raise TypeError("program should be as Program type or None")
 
         save_vars(
             executor,
             dirname=dirname,
-            vars=filter(predicate, program.list_vars()))
+            vars=filter(predicate, main_program.list_vars()))
     else:
         save_program = Program()
         save_block = save_program.global_block()
@@ -66,37 +66,37 @@ def save_vars(executor, dirname, program=None, vars=None, predicate=None):
         executor.run(save_program)
 
 
-def save_params(executor, dirname, program=None):
+def save_params(executor, dirname, main_program=None):
     """
     Save all parameters to directory with executor.
     """
     save_vars(
         executor,
         dirname=dirname,
-        program=program,
+        main_program=main_program,
         vars=None,
         predicate=is_parameter)
 
 
-def save_persistables(executor, dirname, program=None):
+def save_persistables(executor, dirname, main_program=None):
     """
     Save all persistables to directory with executor.
     """
     save_vars(
         executor,
         dirname=dirname,
-        program=program,
+        main_program=main_program,
         vars=None,
         predicate=is_persistable)
 
 
-def load_vars(executor, dirname, program=None, vars=None, predicate=None):
+def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     """
     Load variables from directory by executor.
 
     :param executor: executor that save variable
     :param dirname: directory path
-    :param program: program. If vars is None, then filter all variables in this 
+    :param main_program: program. If vars is None, then filter all variables in this 
     program which fit `predicate`. Default g_program.
     :param predicate: The Predicate describes a callable that returns a variable
     as a bool. If it returns true, the variables will be loaded.
@@ -105,15 +105,15 @@ def load_vars(executor, dirname, program=None, vars=None, predicate=None):
     :return: None
     """
     if vars is None:
-        if program is None:
-            program = g_program
-        if not isinstance(program, Program):
+        if main_program is None:
+            main_program = g_main_program
+        if not isinstance(main_program, Program):
             raise TypeError("program's type should be Program")
 
         load_vars(
             executor,
             dirname=dirname,
-            vars=filter(predicate, program.list_vars()))
+            vars=filter(predicate, main_program.list_vars()))
     else:
         load_prog = Program()
         load_block = load_prog.global_block()
@@ -129,27 +129,33 @@ def load_vars(executor, dirname, program=None, vars=None, predicate=None):
         executor.run(load_prog)
 
 
-def load_params(executor, dirname, program=None):
+def load_params(executor, dirname, main_program=None):
     """
     load all parameters from directory by executor.
     """
     load_vars(
-        executor, dirname=dirname, program=program, predicate=is_parameter)
+        executor,
+        dirname=dirname,
+        main_program=main_program,
+        predicate=is_parameter)
 
 
-def load_persistables(executor, dirname, program=None):
+def load_persistables(executor, dirname, main_program=None):
     """
     load all persistables from directory by executor.
     """
     load_vars(
-        executor, dirname=dirname, program=program, predicate=is_persistable)
+        executor,
+        dirname=dirname,
+        main_program=main_program,
+        predicate=is_persistable)
 
 
 def save_inference_model(dirname,
                          feeded_var_names,
                          target_vars,
                          executor,
-                         program=None):
+                         main_program=None):
     """
     Build a model especially for inference, 
     and save it to directory by the executor.
@@ -158,20 +164,20 @@ def save_inference_model(dirname,
     :param feeded_var_names: Names of variables that need to be feeded data during inference
     :param target_vars: Variables from which we can get inference results.
     :param executor: executor that save inference model
-    :param program: original program, which will be pruned to build the inference model. 
+    :param main_program: original program, which will be pruned to build the inference model. 
     Default g_program.
 
     :return: None
     """
-    if program is None:
-        program = g_program
+    if main_program is None:
+        main_program = g_main_program
     if not isinstance(target_vars, list):
         target_vars = [target_vars]
 
     if not os.path.isdir(dirname):
         os.makedirs(dirname)
 
-    pruned_program = program.prune(target_vars)
+    pruned_program = main_program.prune(target_vars)
     fetch_var_names = [v.name for v in target_vars]
 
     model_file_name = dirname + "/__model__"
@@ -182,10 +188,10 @@ def save_inference_model(dirname,
             "fetch_var_names": fetch_var_names
         }, f, -1)
 
-    save_params(executor, dirname, program)
+    save_params(executor, dirname, main_program)
 
 
-def load_persistables_if_exist(executor, dirname, program=None):
+def load_persistables_if_exist(executor, dirname, main_program=None):
     filenames = next(os.walk(dirname))[2]
     filenames = set(filenames)
 
@@ -198,7 +204,7 @@ def load_persistables_if_exist(executor, dirname, program=None):
     load_vars(
         executor,
         dirname,
-        program=program,
+        main_program=main_program,
         vars=None,
         predicate=_is_presistable_and_exist_)
 
diff --git a/python/paddle/v2/framework/layer_helper.py b/python/paddle/v2/framework/layer_helper.py
index 9e80eaa647366d13f8d9d55745067b7b29091d32..c38346b79fecfb2f82a60b360c505da16ecdf3c0 100644
--- a/python/paddle/v2/framework/layer_helper.py
+++ b/python/paddle/v2/framework/layer_helper.py
@@ -1,8 +1,8 @@
 import copy
 import itertools
 
-from paddle.v2.framework.framework import Variable, g_program, \
-    g_init_program, unique_name, Program
+from paddle.v2.framework.framework import Variable, g_main_program, \
+    g_startup_program, unique_name, Program
 from paddle.v2.framework.initializer import ConstantInitializer, \
     UniformInitializer
 
@@ -20,23 +20,23 @@ class LayerHelper(object):
         return self.kwargs['name']
 
     @property
-    def program(self):
-        prog = self.kwargs.get('program', None)
+    def main_program(self):
+        prog = self.kwargs.get('main_program', None)
         if prog is None:
-            return g_program
+            return g_main_program
         else:
             return prog
 
     @property
-    def init_program(self):
-        prog = self.kwargs.get('init_program', None)
+    def startup_program(self):
+        prog = self.kwargs.get('startup_program', None)
         if prog is None:
-            return g_init_program
+            return g_startup_program
         else:
             return prog
 
     def append_op(self, *args, **kwargs):
-        return self.program.current_block().append_op(*args, **kwargs)
+        return self.main_program.current_block().append_op(*args, **kwargs)
 
     def multiple_input(self, input_param_name='input'):
         inputs = self.kwargs.get(input_param_name, [])
@@ -120,27 +120,27 @@ class LayerHelper(object):
             attr_copy['initializer'] = initializer
         if attr_copy['name'] is None:
             attr_copy['name'] = unique_name(".".join([self.name, suffix]))
-        self.init_program.global_block().create_parameter(
+        self.startup_program.global_block().create_parameter(
             dtype=dtype, shape=shape, **attr_copy)
-        return self.program.global_block().create_parameter(
+        return self.main_program.global_block().create_parameter(
             name=attr_copy['name'], dtype=dtype, shape=shape)
 
     def create_tmp_variable(self, dtype):
-        return self.program.current_block().create_var(
+        return self.main_program.current_block().create_var(
             name=unique_name(".".join([self.name, 'tmp'])),
             dtype=dtype,
             persistable=False)
 
     def create_variable(self, *args, **kwargs):
-        return self.program.current_block().create_var(*args, **kwargs)
+        return self.main_program.current_block().create_var(*args, **kwargs)
 
     def create_global_variable(self, persistable=False, *args, **kwargs):
-        return self.program.global_block().create_var(
+        return self.main_program.global_block().create_var(
             *args, persistable=persistable, **kwargs)
 
     def set_variable_initializer(self, var, initializer):
         assert isinstance(var, Variable)
-        self.init_program.global_block().create_var(
+        self.startup_program.global_block().create_var(
             name=var.name,
             type=var.type,
             dtype=var.data_type,
diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py
index 8b7d6fc32bff716d12c685180a25112f5573207b..b7e468fb51c19d33db1a68df641e3aff20f04f45 100644
--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/framework/layers.py
@@ -18,8 +18,8 @@ def fc(input,
        name=None,
        act=None,
        num_flatten_dims=1,
-       program=None,
-       init_program=None):
+       main_program=None,
+       startup_program=None):
     # create helper
     helper = LayerHelper('fc', **locals())
 
@@ -64,8 +64,8 @@ def embedding(input,
               data_type='float32',
               is_sparse=False,
               param_attr=None,
-              program=None,
-              init_program=None):
+              main_program=None,
+              startup_program=None):
     helper = LayerHelper('embedding', **locals())
     w = helper.create_parameter(
         attr=helper.param_attr, shape=size, dtype=data_type)
@@ -84,8 +84,8 @@ def data(name,
          data_type='float32',
          type=core.VarDesc.VarType.LOD_TENSOR,
          append_batch_size=True,
-         program=None,
-         init_program=None):
+         main_program=None,
+         startup_program=None):
     helper = LayerHelper('data', **locals())
     shape = list(shape)
     for i in xrange(len(shape)):
@@ -99,7 +99,7 @@ def data(name,
         shape = [-1] + shape  # append batch size as -1
 
     return helper.create_global_variable(
-        name=name, shape=shape, dtype=data_type, type=type)
+        name=name, shape=shape, dtype=data_type, type=type, stop_gradient=True)
 
 
 def _convert_(name):
@@ -178,7 +178,7 @@ _create_op_func_('sigmoid')
 _create_op_func_('scale')
 
 
-def cast(x, data_type, program=None):
+def cast(x, data_type, main_program=None):
     helper = LayerHelper('cast', **locals())
     out = helper.create_tmp_variable(dtype=data_type)
     helper.append_op(
@@ -190,7 +190,7 @@ def cast(x, data_type, program=None):
     return out
 
 
-def concat(input, axis, program=None, init_program=None):
+def concat(input, axis, main_program=None, startup_program=None):
     helper = LayerHelper('concat', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
     helper.append_op(
@@ -201,7 +201,7 @@ def concat(input, axis, program=None, init_program=None):
     return out
 
 
-def sums(input, program=None, init_program=None):
+def sums(input, main_program=None, startup_program=None):
     helper = LayerHelper('sum', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
     helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
@@ -281,8 +281,8 @@ def sequence_conv(input,
                   padding=None,
                   bias_attr=None,
                   param_attr=None,
-                  program=None,
-                  init_program=None):
+                  main_program=None,
+                  startup_program=None):
     # FIXME(dzh) : want to unify the argument of python layer
     # function. So we ignore some unecessary attributes.
     # such as, padding_trainable, context_start.
@@ -321,8 +321,8 @@ def conv2d(input,
            padding=None,
            bias_attr=None,
            param_attr=None,
-           program=None,
-           init_program=None):
+           main_program=None,
+           startup_program=None):
     helper = LayerHelper('conv2d', **locals())
     dtype = helper.input_dtype()
 
@@ -372,11 +372,13 @@ def sequence_pool(input, pool_type, **kwargs):
     helper = LayerHelper('sequence_pool', input=input, **kwargs)
     dtype = helper.input_dtype()
     pool_out = helper.create_tmp_variable(dtype)
+    max_index = helper.create_tmp_variable(dtype)
 
     helper.append_op(
         type="sequence_pool",
-        inputs={"X": [input]},
-        outputs={"Out": [pool_out]},
+        inputs={"X": input},
+        outputs={"Out": pool_out,
+                 "MaxIndex": max_index},
         attrs={"pooltype": pool_type.upper()})
 
     return pool_out
@@ -388,8 +390,8 @@ def pool2d(input,
            pool_stride=[1, 1],
            pool_padding=[0, 0],
            global_pooling=False,
-           program=None,
-           init_program=None):
+           main_program=None,
+           startup_program=None):
     if pool_type not in ["max", "avg"]:
         raise ValueError(
             "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
@@ -428,8 +430,8 @@ def batch_norm(input,
                param_attr=None,
                bias_attr=None,
                data_layout='NCHW',
-               program=None,
-               init_program=None):
+               main_program=None,
+               startup_program=None):
     helper = LayerHelper('batch_norm', **locals())
     dtype = helper.input_dtype()
 
@@ -505,16 +507,16 @@ class BlockGuard(object):
     keyword.
     """
 
-    def __init__(self, program):
-        if not isinstance(program, Program):
+    def __init__(self, main_program):
+        if not isinstance(main_program, Program):
             raise TypeError("BlockGuard takes a program")
-        self.program = program
+        self.main_program = main_program
 
     def __enter__(self):
-        self.program.create_block()
+        self.main_program.create_block()
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        self.program.rollback()
+        self.main_program.rollback()
         if exc_type is not None:
             return False  # re-raise exception
         return True
@@ -524,7 +526,7 @@ class StaticRNNGuard(BlockGuard):
     def __init__(self, rnn):
         if not isinstance(rnn, StaticRNN):
             raise TypeError("StaticRNNGuard takes an StaticRNN")
-        super(StaticRNNGuard, self).__init__(rnn.helper.program)
+        super(StaticRNNGuard, self).__init__(rnn.helper.main_program)
         self.rnn = rnn
 
     def __enter__(self):
@@ -560,8 +562,9 @@ class StaticRNN(object):
     IN_RNN_BLOCK = 1
     AFTER_RNN_BLOCK = 2
 
-    def __init__(self, name=None, program=None):
-        self.helper = LayerHelper("static_rnn", name=name, program=program)
+    def __init__(self, name=None, main_program=None):
+        self.helper = LayerHelper(
+            "static_rnn", name=name, main_program=main_program)
         self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
         self.inputs = []  # input variable list in current block
         self.outputs = []  # output variable list in parent block
@@ -653,7 +656,7 @@ class StaticRNN(object):
         self.memories[mem.name].mem = var
 
     def parent_block(self):
-        prog = self.helper.program
+        prog = self.helper.main_program
         parent_idx = prog.current_block().parent_idx
         assert parent_idx >= 0
         parent_block = prog.block(parent_idx)
@@ -670,8 +673,8 @@ class StaticRNN(object):
             return self.outputs
 
     def complete_rnn_op(self):
-        program = self.helper.program
-        rnn_block = program.current_block()
+        main_program = self.helper.main_program
+        rnn_block = main_program.current_block()
         parent_block = self.parent_block()
 
         local_inputs = set()
@@ -737,7 +740,7 @@ class StaticRNN(object):
             })
 
 
-def lod_rank_table(x, level=0, program=None):
+def lod_rank_table(x, level=0, main_program=None):
     helper = LayerHelper("lod_rank_table", **locals())
     table = helper.create_variable(
         type=core.VarDesc.VarType.LOD_RANK_TABLE,
diff --git a/python/paddle/v2/framework/net_drawer.py b/python/paddle/v2/framework/net_drawer.py
index aa30e2a6ca231667186529aec30adcdbc6efcff9..045e267c253e2485e75df3fb95cc0e591ee29ea5 100644
--- a/python/paddle/v2/framework/net_drawer.py
+++ b/python/paddle/v2/framework/net_drawer.py
@@ -80,7 +80,7 @@ def parse_graph(program, graph, var_dict, **kwargs):
                         graph.edge(**draw_edge(var_dict, op, e, arg))
 
 
-def draw_graph(init_program, program, **kwargs):
+def draw_graph(startup_program, main_program, **kwargs):
     if kwargs.has_key("graph_attr"):
         GRAPH_STYLE.update(kwargs[graph_attr])
     if kwargs.has_key("node_attr"):
@@ -101,8 +101,8 @@ def draw_graph(init_program, program, **kwargs):
         **kwargs)
 
     var_dict = {}
-    parse_graph(init_program, g, var_dict)
-    parse_graph(program, g, var_dict)
+    parse_graph(startup_program, g, var_dict)
+    parse_graph(main_program, g, var_dict)
 
     if filename != None:
         g.save()
diff --git a/python/paddle/v2/framework/nets.py b/python/paddle/v2/framework/nets.py
index f5a2c27676a02b953026be0893cd49b832bf2c6b..725d2fa7f5e7a862eea0ef9172a9e63858ebd0dd 100644
--- a/python/paddle/v2/framework/nets.py
+++ b/python/paddle/v2/framework/nets.py
@@ -10,23 +10,23 @@ def simple_img_conv_pool(input,
                          pool_stride,
                          act,
                          pool_type='max',
-                         program=None,
-                         init_program=None):
+                         main_program=None,
+                         startup_program=None):
     conv_out = layers.conv2d(
         input=input,
         num_filters=num_filters,
         filter_size=filter_size,
         act=act,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     pool_out = layers.pool2d(
         input=conv_out,
         pool_size=pool_size,
         pool_type=pool_type,
         pool_stride=pool_stride,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
     return pool_out
 
 
@@ -40,8 +40,8 @@ def img_conv_group(input,
                    conv_batchnorm_drop_rate=None,
                    pool_stride=1,
                    pool_type=None,
-                   program=None,
-                   init_program=None):
+                   main_program=None,
+                   startup_program=None):
     """
     Image Convolution Group, Used for vgg net.
     """
@@ -71,30 +71,30 @@ def img_conv_group(input,
             filter_size=conv_filter_size[i],
             padding=conv_padding[i],
             act=local_conv_act,
-            program=program,
-            init_program=init_program)
+            main_program=main_program,
+            startup_program=startup_program)
 
         if conv_with_batchnorm[i]:
             tmp = layers.batch_norm(
                 input=tmp,
                 act=conv_act,
-                program=program,
-                init_program=init_program)
+                main_program=main_program,
+                startup_program=startup_program)
             drop_rate = conv_batchnorm_drop_rate[i]
             if abs(drop_rate) > 1e-5:
                 tmp = layers.dropout(
                     x=tmp,
                     dropout_prob=drop_rate,
-                    program=program,
-                    init_program=init_program)
+                    main_program=main_program,
+                    startup_program=startup_program)
 
     pool_out = layers.pool2d(
         input=tmp,
         pool_size=pool_size,
         pool_type=pool_type,
         pool_stride=pool_stride,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
     return pool_out
 
 
@@ -103,19 +103,19 @@ def sequence_conv_pool(input,
                        filter_size,
                        act="sigmoid",
                        pool_type="max",
-                       program=None,
-                       init_program=None):
+                       main_program=None,
+                       startup_program=None):
     conv_out = layers.sequence_conv(
         input=input,
         num_filters=num_filters,
         filter_size=filter_size,
         act=act,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     pool_out = layers.sequence_pool(
         input=conv_out,
         pool_type=pool_type,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
     return pool_out
diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/framework/optimizer.py
index 902442297eb54df91a46db68a66e9208ece3e71c..f20865d604f68c8398f299bf8edfd020bfa4e4c5 100644
--- a/python/paddle/v2/framework/optimizer.py
+++ b/python/paddle/v2/framework/optimizer.py
@@ -132,7 +132,7 @@ class Optimizer(object):
     def create_optimization_pass(self,
                                  parameters_and_grads,
                                  loss,
-                                 init_program=None):
+                                 startup_program=None):
         """Add optimization operators to update gradients to variables.
 
         Args:
@@ -144,7 +144,7 @@ class Optimizer(object):
           optimization. This will include parameter update ops, global step
           update ops and any other custom ops required by subclasses to manage
           their internal state.
-          :param init_program: 
+          :param startup_program: 
         """
         # This is a default implementation of create_optimization_pass that
         # can be shared by most optimizers. This implementation assumes that
@@ -156,7 +156,9 @@ class Optimizer(object):
         # Create any accumulators
         program = loss.block.program
         self.helper = LayerHelper(
-            self.__class__.__name__, program=program, init_program=init_program)
+            self.__class__.__name__,
+            main_program=program,
+            startup_program=startup_program)
         self._create_accumulators(loss.block,
                                   [p[0] for p in parameters_and_grads])
         # Create any necessary tensors
@@ -185,7 +187,7 @@ class Optimizer(object):
 
     def minimize(self,
                  loss,
-                 init_program=None,
+                 startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
         """Add operations to minimize `loss` by updating `parameter_list`.
@@ -198,7 +200,7 @@ class Optimizer(object):
         # Add regularization if any 
         params_grads = append_regularization_ops(params_grads)
         optimize_ops = self.create_optimization_pass(params_grads, loss,
-                                                     init_program)
+                                                     startup_program)
         return optimize_ops
 
 
diff --git a/python/paddle/v2/framework/tests/test_executor_and_mul.py b/python/paddle/v2/framework/tests/test_executor_and_mul.py
index 35f775711167ce0d210044ab4cb382db802f39a5..c885cfbebd4b665ddf50adbc43673942dc949a0b 100644
--- a/python/paddle/v2/framework/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/framework/tests/test_executor_and_mul.py
@@ -2,7 +2,7 @@ import unittest
 from paddle.v2.framework.layers import mul, data
 import paddle.v2.framework.core as core
 from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_program
+from paddle.v2.framework.framework import g_main_program
 import numpy
 
 
@@ -23,7 +23,7 @@ class TestExecutor(unittest.TestCase):
         tensor_b = core.LoDTensor()
         tensor_b.set(b_np, place)
         exe = Executor(place)
-        outs = exe.run(g_program,
+        outs = exe.run(g_main_program,
                        feed={'a': tensor_a,
                              'b': tensor_b},
                        fetch_list=[out])
diff --git a/python/paddle/v2/framework/tests/test_fit_a_line.py b/python/paddle/v2/framework/tests/test_fit_a_line.py
index 944240629ca0c2ef8ee0d881f48bdfc6b5b485d3..174ee74c3bc8952d296a2ffa205108f2a8e12cfb 100644
--- a/python/paddle/v2/framework/tests/test_fit_a_line.py
+++ b/python/paddle/v2/framework/tests/test_fit_a_line.py
@@ -3,40 +3,44 @@ import paddle.v2.framework.layers as layers
 import paddle.v2.framework.core as core
 import paddle.v2.framework.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.framework import Program, g_main_program
 from paddle.v2.framework.io import save_persistables, load_persistables
 from paddle.v2.framework.executor import Executor
 
 import numpy as np
 
-init_program = Program()
-program = Program()
+startup_program = Program()
+main_program = Program()
 x = layers.data(
     name='x',
     shape=[13],
     data_type='float32',
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 
 y_predict = layers.fc(input=x,
                       size=1,
                       act=None,
-                      program=program,
-                      init_program=init_program)
+                      main_program=main_program,
+                      startup_program=startup_program)
 
 y = layers.data(
     name='y',
     shape=[1],
     data_type='float32',
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 
 cost = layers.square_error_cost(
-    input=y_predict, label=y, program=program, init_program=init_program)
-avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
+    input=y_predict,
+    label=y,
+    main_program=main_program,
+    startup_program=startup_program)
+avg_cost = layers.mean(
+    x=cost, main_program=main_program, startup_program=startup_program)
 
 sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost, init_program)
+opts = sgd_optimizer.minimize(avg_cost, startup_program)
 
 BATCH_SIZE = 20
 
@@ -48,12 +52,12 @@ train_reader = paddle.batch(
 place = core.CPUPlace()
 exe = Executor(place)
 
-exe.run(init_program, feed={}, fetch_list=[])
+exe.run(startup_program, feed={}, fetch_list=[])
 
 PASS_NUM = 100
 for pass_id in range(PASS_NUM):
-    save_persistables(exe, "./fit_a_line.model/", program=program)
-    load_persistables(exe, "./fit_a_line.model/", program=program)
+    save_persistables(exe, "./fit_a_line.model/", main_program=main_program)
+    load_persistables(exe, "./fit_a_line.model/", main_program=main_program)
     for data in train_reader():
         x_data = np.array(map(lambda x: x[0], data)).astype("float32")
         y_data = np.array(map(lambda x: x[1], data)).astype("float32")
@@ -65,7 +69,7 @@ for pass_id in range(PASS_NUM):
         tensor_y = core.LoDTensor()
         tensor_y.set(y_data, place)
         # print tensor_y.get_dims()
-        outs = exe.run(program,
+        outs = exe.run(main_program,
                        feed={'x': tensor_x,
                              'y': tensor_y},
                        fetch_list=[avg_cost])
diff --git a/python/paddle/v2/framework/tests/test_gru_op.py b/python/paddle/v2/framework/tests/test_gru_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2474cff94c6c71cc62bc8e69a5d83e38d51c511
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_gru_op.py
@@ -0,0 +1,156 @@
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+from test_lstm_op import identity, sigmoid, tanh, relu
+
+
+class TestGRUOp(OpTest):
+    batch_size = 9
+    frame_size = 5
+    activate = {
+        'identity': identity,
+        'sigmoid': sigmoid,
+        'tanh': tanh,
+        'relu': relu
+    }
+
+    @staticmethod
+    def seq_to_batch(lod, is_reverse):
+        idx_in_seq_list = []
+        seq_starts = lod[0]
+        seq_lens = []
+        for i in range(len(seq_starts) - 1):
+            seq_lens.append(seq_starts[i + 1] - seq_starts[i])
+        sorted_seqs = sorted(
+            range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x])
+        num_batch = seq_lens[sorted_seqs[0]]
+        for batch_idx in range(num_batch):
+            idx_in_seq = []
+            for i in range(len(seq_lens)):
+                if seq_lens[sorted_seqs[i]] <= batch_idx:
+                    break
+                idx = (seq_starts[sorted_seqs[i] + 1] - 1 - batch_idx
+                       ) if is_reverse else (
+                           seq_starts[sorted_seqs[i]] + batch_idx)
+                idx_in_seq.append(idx)
+            idx_in_seq_list.append(idx_in_seq)
+        return idx_in_seq_list
+
+    def gru_step(self, x, h_p, w, b):
+        batch_size = x.shape[0]
+        frame_size = w.shape[0]
+        g = x + np.tile(b, (batch_size, 1))
+        w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
+            (frame_size, frame_size * 2))
+        u_r = self.activate[self.attrs['gate_activation']](np.dot(
+            h_p, w_u_r) + g[:, :frame_size * 2])
+        u = u_r[:, :frame_size]
+        r = u_r[:, frame_size:frame_size * 2]
+        r_h_p = r * h_p
+        w_c = w.flatten()[frame_size * frame_size * 2:].reshape(
+            (frame_size, frame_size))
+        c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
+                                                    g[:, frame_size * 2:])
+        g = np.hstack((u_r, c))
+        h = u * c + (1 - u) * h_p
+        return g, r_h_p, h
+
+    def gru(self):
+        input, lod = self.inputs['Input']
+        w = self.inputs['Weight']
+        b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros(
+            (1, self.frame_size * 3))
+        batch_gate = self.outputs['BatchGate']
+        batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev']
+        batch_hidden = self.outputs['BatchHidden']
+        hidden = self.outputs['Hidden']
+        idx_in_seq_list = self.idx_in_seq_list
+        h_p = self.inputs['H0'] if self.inputs.has_key('H0') else np.zeros(
+            (len(idx_in_seq_list[0]), self.frame_size))
+        num_batch = len(idx_in_seq_list)
+        end_idx = 0
+        for batch_idx in range(num_batch):
+            x = input[idx_in_seq_list[batch_idx]]
+            g, r_h_p, h = self.gru_step(x, h_p, w, b)
+            if batch_idx < (num_batch - 1):
+                h_p = h[:len(idx_in_seq_list[batch_idx + 1])]
+            start_idx = end_idx
+            end_idx = start_idx + len(idx_in_seq_list[batch_idx])
+            batch_gate[start_idx:end_idx] = g
+            batch_reset_hidden_prev[start_idx:end_idx] = r_h_p
+            batch_hidden[start_idx:end_idx] = h
+            hidden[idx_in_seq_list[batch_idx]] = h
+        return batch_gate, batch_reset_hidden_prev, hidden
+
+    def set_data(self):
+        lod = [[0, 2, 6, self.batch_size]]
+        self.idx_in_seq_list = self.seq_to_batch(lod, self.is_reverse)
+        batch_size = self.batch_size
+        frame_size = self.frame_size
+        input = np.random.rand(batch_size, frame_size * 3).astype('float64')
+        h0 = np.random.rand(len(self.idx_in_seq_list[0]),
+                            frame_size).astype('float64')
+        weight = np.random.rand(frame_size, frame_size * 3).astype('float64')
+        bias = np.random.rand(1, frame_size * 3).astype('float64')
+
+        self.inputs = {
+            'Input': (input, lod),
+            'H0': h0,
+            'Weight': weight,
+            'Bias': bias
+        }
+
+        self.outputs = {
+            'BatchGate': np.zeros(
+                (batch_size, frame_size * 3), dtype='float64'),
+            'BatchResetHiddenPrev': np.zeros(
+                (batch_size, frame_size), dtype='float64'),
+            'BatchHidden': np.zeros(
+                (batch_size, frame_size), dtype='float64'),
+            'Hidden': np.zeros(
+                (batch_size, frame_size), dtype='float64')
+        }
+
+    def set_confs(self):
+        self.is_reverse = False
+        self.attrs = {
+            'activation': 'tanh',
+            'gate_activation': 'sigmoid',
+            'is_reverse': self.is_reverse
+        }
+
+    def setUp(self):
+        self.op_type = "gru"
+        self.set_confs()
+        self.set_data()
+        self.gru()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
+
+
+class TestGRUOpNoInitial(TestGRUOp):
+    def set_data(self):
+        super(TestGRUOpNoInitial, self).set_data()
+        self.inputs.pop('H0')
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden'])
+
+
+class TestGRUOpReverse(TestGRUOp):
+    def set_confs(self):
+        self.is_reverse = True
+        self.attrs = {
+            'activation': 'identity',
+            'gate_activation': 'sigmoid',
+            'is_reverse': self.is_reverse
+        }
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_huber_loss_op.py b/python/paddle/v2/framework/tests/test_huber_loss_op.py
index 003e7d7ed7ccdfc48b0aa8db0a6765b5c93e7c14..a24fcbec6cc4801118ce4ef97eb4692cd2351c28 100644
--- a/python/paddle/v2/framework/tests/test_huber_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_huber_loss_op.py
@@ -21,7 +21,8 @@ class TestHuberLossOp(OpTest):
             'Y': np.random.uniform(0, 1., (samples_num, 1)).astype('float32'),
         }
         residual = self.inputs['Y'] - self.inputs['X']
-        loss = np.vectorize(huber_loss_forward)(residual, delta)
+        loss = np.vectorize(huber_loss_forward)(residual,
+                                                delta).astype('float32')
         self.attrs = {'delta': delta}
         self.outputs = {
             'Residual': residual,
@@ -43,6 +44,5 @@ class TestHuberLossOp(OpTest):
             ['X'], 'Out', max_relative_error=0.008, no_grad_set=set('residual'))
 
 
-# TODO(typhoonzero): should add this back till we fix it
-#if __name__ == '__main__':
-#    unittest.main()
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_image_classification_layer.py b/python/paddle/v2/framework/tests/test_image_classification_layer.py
index b4eda13552e60f009ec910e3d21e9d77107133a1..b1a267ec32b1c937b946bee82e41b846ebbf1288 100644
--- a/python/paddle/v2/framework/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/framework/tests/test_image_classification_layer.py
@@ -9,8 +9,8 @@ def conv_block(input,
                num_filter,
                groups,
                dropouts,
-               program=None,
-               init_program=None):
+               main_program=None,
+               startup_program=None):
     return nets.img_conv_group(
         input=input,
         pool_size=2,
@@ -21,77 +21,81 @@ def conv_block(input,
         conv_with_batchnorm=True,
         conv_batchnorm_drop_rate=dropouts,
         pool_type='max',
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
 
 class TestLayer(unittest.TestCase):
     def test_batch_norm_layer(self):
-        program = Program()
-        init_program = Program()
+        main_program = Program()
+        startup_program = Program()
         images = layers.data(
             name='pixel',
             shape=[3, 48, 48],
             data_type='float32',
-            program=program)
+            main_program=main_program)
         layers.batch_norm(
-            input=images, program=program, init_program=init_program)
+            input=images,
+            main_program=main_program,
+            startup_program=startup_program)
 
-        # print str(program)
+        # print str(main_program)
 
     def test_dropout_layer(self):
-        program = Program()
-        init_program = Program()
+        main_program = Program()
+        startup_program = Program()
         images = layers.data(
             name='pixel',
             shape=[3, 48, 48],
             data_type='float32',
-            program=program)
+            main_program=main_program)
         layers.dropout(
             x=images,
             dropout_prob=0.5,
-            program=program,
-            init_program=init_program)
+            main_program=main_program,
+            startup_program=startup_program)
 
-        # print str(program)
+        # print str(main_program)
 
     def test_img_conv_group(self):
-        program = Program()
-        init_program = Program()
+        main_program = Program()
+        startup_program = Program()
 
         images = layers.data(
             name='pixel',
             shape=[3, 48, 48],
             data_type='float32',
-            program=program,
-            init_program=init_program)
-        conv1 = conv_block(images, 64, 2, [0.3, 0], program, init_program)
-        conv2 = conv_block(conv1, 256, 3, [0.4, 0.4, 0], program, init_program)
+            main_program=main_program,
+            startup_program=startup_program)
+        conv1 = conv_block(images, 64, 2, [0.3, 0], main_program,
+                           startup_program)
+        conv2 = conv_block(conv1, 256, 3, [0.4, 0.4, 0], main_program,
+                           startup_program)
 
-        # print str(program)
+        # print str(main_program)
 
     def test_elementwise_add_with_act(self):
-        program = Program()
-        init_program = Program()
+        main_program = Program()
+        startup_program = Program()
         image1 = layers.data(
             name='pixel1',
             shape=[3, 48, 48],
             data_type='float32',
-            program=program,
-            init_program=init_program)
+            main_program=main_program,
+            startup_program=startup_program)
         image2 = layers.data(
             name='pixel2',
             shape=[3, 48, 48],
             data_type='float32',
-            program=program,
-            init_program=init_program)
+            main_program=main_program,
+            startup_program=startup_program)
         out = layers.elementwise_add(
             x=image1,
             y=image2,
             act='relu',
-            program=program,
-            init_program=init_program)
-        # print(program)
+            main_program=main_program,
+            startup_program=startup_program)
+        # print(main_program)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/framework/tests/test_image_classification_train.py b/python/paddle/v2/framework/tests/test_image_classification_train.py
index 7189adbf8fc55bb188cd9c99ba8b17c6ad6a4794..a4165da9703c55ae3347123409407f0cae30856f 100644
--- a/python/paddle/v2/framework/tests/test_image_classification_train.py
+++ b/python/paddle/v2/framework/tests/test_image_classification_train.py
@@ -5,19 +5,19 @@ import paddle.v2.framework.layers as layers
 import paddle.v2.framework.nets as nets
 import paddle.v2.framework.optimizer as optimizer
 from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_init_program, g_program
+from paddle.v2.framework.framework import g_startup_program, g_main_program
 from paddle.v2.framework.initializer import XavierInitializer
 
 
-def resnet_cifar10(input, depth=32, program=None, init_program=None):
+def resnet_cifar10(input, depth=32, main_program=None, startup_program=None):
     def conv_bn_layer(input,
                       ch_out,
                       filter_size,
                       stride,
                       padding,
                       act='relu',
-                      program=None,
-                      init_program=None):
+                      main_program=None,
+                      startup_program=None):
         tmp = layers.conv2d(
             input=input,
             filter_size=filter_size,
@@ -26,10 +26,13 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
             padding=padding,
             act=None,
             bias_attr=False,
-            program=program,
-            init_program=init_program)
+            main_program=main_program,
+            startup_program=startup_program)
         return layers.batch_norm(
-            input=tmp, act=act, program=program, init_program=init_program)
+            input=tmp,
+            act=act,
+            main_program=main_program,
+            startup_program=startup_program)
 
     def shortcut(input, ch_in, ch_out, stride, program, init_program):
         if ch_in != ch_out:
@@ -42,16 +45,16 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
                    ch_in,
                    ch_out,
                    stride,
-                   program=program,
-                   init_program=init_program):
+                   main_program=main_program,
+                   startup_program=startup_program):
         tmp = conv_bn_layer(
             input,
             ch_out,
             3,
             stride,
             1,
-            program=program,
-            init_program=init_program)
+            main_program=main_program,
+            startup_program=startup_program)
         tmp = conv_bn_layer(
             tmp,
             ch_out,
@@ -59,21 +62,22 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
             1,
             1,
             act=None,
-            program=program,
-            init_program=init_program)
-        short = shortcut(input, ch_in, ch_out, stride, program, init_program)
+            main_program=main_program,
+            startup_program=startup_program)
+        short = shortcut(input, ch_in, ch_out, stride, main_program,
+                         startup_program)
         return layers.elementwise_add(
             x=tmp,
             y=short,
             act='relu',
-            program=program,
-            init_program=init_program)
+            main_program=main_program,
+            startup_program=startup_program)
 
     def layer_warp(block_func, input, ch_in, ch_out, count, stride, program,
-                   init_program):
-        tmp = block_func(input, ch_in, ch_out, stride, program, init_program)
+                   startup_program):
+        tmp = block_func(input, ch_in, ch_out, stride, program, startup_program)
         for i in range(1, count):
-            tmp = block_func(tmp, ch_out, ch_out, 1, program, init_program)
+            tmp = block_func(tmp, ch_out, ch_out, 1, program, startup_program)
         return tmp
 
     assert (depth - 2) % 6 == 0
@@ -84,8 +88,8 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
         filter_size=3,
         stride=1,
         padding=1,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
     res1 = layer_warp(
         basicblock,
         conv1,
@@ -93,8 +97,8 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
         16,
         n,
         1,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
     res2 = layer_warp(
         basicblock,
         res1,
@@ -102,8 +106,8 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
         32,
         n,
         2,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
     res3 = layer_warp(
         basicblock,
         res2,
@@ -111,25 +115,25 @@ def resnet_cifar10(input, depth=32, program=None, init_program=None):
         64,
         n,
         2,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
     pool = layers.pool2d(
         input=res3,
         pool_size=8,
         pool_type='avg',
         pool_stride=1,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
     return pool
 
 
-def vgg16_bn_drop(input, program=None, init_program=None):
+def vgg16_bn_drop(input, main_program=None, startup_program=None):
     def conv_block(input,
                    num_filter,
                    groups,
                    dropouts,
-                   program=None,
-                   init_program=None):
+                   main_program=None,
+                   startup_program=None):
         return nets.img_conv_group(
             input=input,
             pool_size=2,
@@ -140,38 +144,50 @@ def vgg16_bn_drop(input, program=None, init_program=None):
             conv_with_batchnorm=True,
             conv_batchnorm_drop_rate=dropouts,
             pool_type='max',
-            program=program,
-            init_program=init_program)
+            main_program=main_program,
+            startup_program=startup_program)
 
-    conv1 = conv_block(input, 64, 2, [0.3, 0], program, init_program)
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0], program, init_program)
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0], program, init_program)
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0], program, init_program)
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0], program, init_program)
+    conv1 = conv_block(input, 64, 2, [0.3, 0], main_program, startup_program)
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0], main_program, startup_program)
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0], main_program,
+                       startup_program)
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0], main_program,
+                       startup_program)
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0], main_program,
+                       startup_program)
 
     drop = layers.dropout(
-        x=conv5, dropout_prob=0.5, program=program, init_program=init_program)
+        x=conv5,
+        dropout_prob=0.5,
+        main_program=main_program,
+        startup_program=startup_program)
     fc1 = layers.fc(input=drop,
                     size=512,
                     act=None,
                     param_attr={"initializer": XavierInitializer()},
-                    program=program,
-                    init_program=init_program)
+                    main_program=main_program,
+                    startup_program=startup_program)
     reshape1 = layers.reshape(
         x=fc1,
         shape=list(fc1.shape + (1, 1)),
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
     bn = layers.batch_norm(
-        input=reshape1, act='relu', program=program, init_program=init_program)
+        input=reshape1,
+        act='relu',
+        main_program=main_program,
+        startup_program=startup_program)
     drop2 = layers.dropout(
-        x=bn, dropout_prob=0.5, program=program, init_program=init_program)
+        x=bn,
+        dropout_prob=0.5,
+        main_program=main_program,
+        startup_program=startup_program)
     fc2 = layers.fc(input=drop2,
                     size=512,
                     act=None,
                     param_attr={"initializer": XavierInitializer()},
-                    program=program,
-                    init_program=init_program)
+                    main_program=main_program,
+                    startup_program=startup_program)
     return fc2
 
 
@@ -209,7 +225,7 @@ train_reader = paddle.batch(
 place = core.CPUPlace()
 exe = Executor(place)
 
-exe.run(g_init_program, feed={}, fetch_list=[])
+exe.run(g_startup_program, feed={}, fetch_list=[])
 
 for pass_id in range(PASS_NUM):
     batch_id = 0
@@ -227,7 +243,7 @@ for pass_id in range(PASS_NUM):
         tensor_img.set(img_data, place)
         tensor_y.set(y_data, place)
 
-        outs = exe.run(g_program,
+        outs = exe.run(g_main_program,
                        feed={"pixel": tensor_img,
                              "label": tensor_y},
                        fetch_list=[avg_cost, accuracy])
diff --git a/python/paddle/v2/framework/tests/test_inference_model_io.py b/python/paddle/v2/framework/tests/test_inference_model_io.py
index e9c9cd27d9ead5b45a2e708a669035cc1ce9cb0c..d273387a35820a27f87ef4569f158db4fe670864 100644
--- a/python/paddle/v2/framework/tests/test_inference_model_io.py
+++ b/python/paddle/v2/framework/tests/test_inference_model_io.py
@@ -3,7 +3,7 @@ import paddle.v2.framework.layers as layers
 import paddle.v2.framework.core as core
 import paddle.v2.framework.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.framework import Program, g_main_program
 from paddle.v2.framework.io import save_inference_model, load_inference_model
 import paddle.v2.framework.executor as executor
 import unittest
@@ -20,28 +20,28 @@ class TestBook(unittest.TestCase):
             name='x',
             shape=[2],
             data_type='float32',
-            program=program,
-            init_program=init_program)
+            main_program=program,
+            startup_program=init_program)
         y = layers.data(
             name='y',
             shape=[1],
             data_type='float32',
-            program=program,
-            init_program=init_program)
+            main_program=program,
+            startup_program=init_program)
 
         y_predict = layers.fc(input=x,
                               size=1,
                               act=None,
-                              program=program,
-                              init_program=init_program)
+                              main_program=program,
+                              startup_program=init_program)
 
         cost = layers.square_error_cost(
             input=y_predict,
             label=y,
-            program=program,
-            init_program=init_program)
+            main_program=program,
+            startup_program=init_program)
         avg_cost = layers.mean(
-            x=cost, program=program, init_program=init_program)
+            x=cost, main_program=program, startup_program=init_program)
 
         sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
         opts = sgd_optimizer.minimize(avg_cost, init_program)
diff --git a/python/paddle/v2/framework/tests/test_layers.py b/python/paddle/v2/framework/tests/test_layers.py
index 5cbe790e3f019f5dcf6b201c4744e7502141ed99..716963fb431a8aca70e1b87efaf268595af2b09e 100644
--- a/python/paddle/v2/framework/tests/test_layers.py
+++ b/python/paddle/v2/framework/tests/test_layers.py
@@ -1,6 +1,6 @@
 import paddle.v2.framework.layers as layers
 import paddle.v2.framework.nets as nets
-from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.framework import Program, g_main_program
 import paddle.v2.framework.core as core
 import unittest
 
@@ -9,15 +9,15 @@ class TestBook(unittest.TestCase):
     def test_fit_a_line(self):
         program = Program()
         x = layers.data(
-            name='x', shape=[13], data_type='float32', program=program)
-        y_predict = layers.fc(input=x, size=1, act=None, program=program)
+            name='x', shape=[13], data_type='float32', main_program=program)
+        y_predict = layers.fc(input=x, size=1, act=None, main_program=program)
 
         y = layers.data(
-            name='y', shape=[1], data_type='float32', program=program)
+            name='y', shape=[1], data_type='float32', main_program=program)
         cost = layers.square_error_cost(
-            input=y_predict, label=y, program=program)
+            input=y_predict, label=y, main_program=program)
 
-        avg_cost = layers.mean(x=cost, program=program)
+        avg_cost = layers.mean(x=cost, main_program=program)
         self.assertIsNotNone(avg_cost)
         program.append_backward(avg_cost)
         print str(program)
@@ -27,26 +27,42 @@ class TestBook(unittest.TestCase):
 
         # Change g_program, so the rest layers use `g_program`
         images = layers.data(
-            name='pixel', shape=[784], data_type='float32', program=program)
+            name='pixel',
+            shape=[784],
+            data_type='float32',
+            main_program=program)
         label = layers.data(
-            name='label', shape=[1], data_type='int32', program=program)
-        hidden1 = layers.fc(input=images, size=128, act='relu', program=program)
-        hidden2 = layers.fc(input=hidden1, size=64, act='relu', program=program)
+            name='label', shape=[1], data_type='int32', main_program=program)
+        hidden1 = layers.fc(input=images,
+                            size=128,
+                            act='relu',
+                            main_program=program)
+        hidden2 = layers.fc(input=hidden1,
+                            size=64,
+                            act='relu',
+                            main_program=program)
         predict = layers.fc(input=hidden2,
                             size=10,
                             act='softmax',
-                            program=program)
-        cost = layers.cross_entropy(input=predict, label=label, program=program)
-        avg_cost = layers.mean(x=cost, program=program)
+                            main_program=program)
+        cost = layers.cross_entropy(
+            input=predict, label=label, main_program=program)
+        avg_cost = layers.mean(x=cost, main_program=program)
         self.assertIsNotNone(avg_cost)
         print str(program)
 
     def test_simple_conv2d(self):
         program = Program()
         images = layers.data(
-            name='pixel', shape=[3, 48, 48], data_type='int32', program=program)
+            name='pixel',
+            shape=[3, 48, 48],
+            data_type='int32',
+            main_program=program)
         layers.conv2d(
-            input=images, num_filters=3, filter_size=[4, 4], program=program)
+            input=images,
+            num_filters=3,
+            filter_size=[4, 4],
+            main_program=program)
 
         print str(program)
 
@@ -57,9 +73,9 @@ class TestBook(unittest.TestCase):
             name='pixel',
             shape=[1, 28, 28],
             data_type='float32',
-            program=program)
+            main_program=program)
         label = layers.data(
-            name='label', shape=[1], data_type='int32', program=program)
+            name='label', shape=[1], data_type='int32', main_program=program)
         conv_pool_1 = nets.simple_img_conv_pool(
             input=images,
             filter_size=5,
@@ -67,7 +83,7 @@ class TestBook(unittest.TestCase):
             pool_size=2,
             pool_stride=2,
             act="relu",
-            program=program)
+            main_program=program)
         conv_pool_2 = nets.simple_img_conv_pool(
             input=conv_pool_1,
             filter_size=5,
@@ -75,14 +91,15 @@ class TestBook(unittest.TestCase):
             pool_size=2,
             pool_stride=2,
             act="relu",
-            program=program)
+            main_program=program)
 
         predict = layers.fc(input=conv_pool_2,
                             size=10,
                             act="softmax",
-                            program=program)
-        cost = layers.cross_entropy(input=predict, label=label, program=program)
-        avg_cost = layers.mean(x=cost, program=program)
+                            main_program=program)
+        cost = layers.cross_entropy(
+            input=predict, label=label, main_program=program)
+        avg_cost = layers.mean(x=cost, main_program=program)
 
         program.append_backward(avg_cost)
 
@@ -93,58 +110,58 @@ class TestBook(unittest.TestCase):
         dict_size = 10000
         embed_size = 32
         first_word = layers.data(
-            name='firstw', shape=[1], data_type='int64', program=program)
+            name='firstw', shape=[1], data_type='int64', main_program=program)
         second_word = layers.data(
-            name='secondw', shape=[1], data_type='int64', program=program)
+            name='secondw', shape=[1], data_type='int64', main_program=program)
         third_word = layers.data(
-            name='thirdw', shape=[1], data_type='int64', program=program)
+            name='thirdw', shape=[1], data_type='int64', main_program=program)
         forth_word = layers.data(
-            name='forthw', shape=[1], data_type='int64', program=program)
+            name='forthw', shape=[1], data_type='int64', main_program=program)
         next_word = layers.data(
-            name='nextw', shape=[1], data_type='int64', program=program)
+            name='nextw', shape=[1], data_type='int64', main_program=program)
 
         embed_first = layers.embedding(
             input=first_word,
             size=[dict_size, embed_size],
             data_type='float32',
             param_attr={'name': 'shared_w'},
-            program=program)
+            main_program=program)
         embed_second = layers.embedding(
             input=second_word,
             size=[dict_size, embed_size],
             data_type='float32',
             param_attr={'name': 'shared_w'},
-            program=program)
+            main_program=program)
 
         embed_third = layers.embedding(
             input=third_word,
             size=[dict_size, embed_size],
             data_type='float32',
             param_attr={'name': 'shared_w'},
-            program=program)
+            main_program=program)
         embed_forth = layers.embedding(
             input=forth_word,
             size=[dict_size, embed_size],
             data_type='float32',
             param_attr={'name': 'shared_w'},
-            program=program)
+            main_program=program)
 
         concat_embed = layers.concat(
             input=[embed_first, embed_second, embed_third, embed_forth],
             axis=1,
-            program=program)
+            main_program=program)
 
         hidden1 = layers.fc(input=concat_embed,
                             size=256,
                             act='sigmoid',
-                            program=program)
+                            main_program=program)
         predict_word = layers.fc(input=hidden1,
                                  size=dict_size,
                                  act='softmax',
-                                 program=program)
+                                 main_program=program)
         cost = layers.cross_entropy(
-            input=predict_word, label=next_word, program=program)
-        avg_cost = layers.mean(x=cost, program=program)
+            input=predict_word, label=next_word, main_program=program)
+        avg_cost = layers.mean(x=cost, main_program=program)
         self.assertIsNotNone(avg_cost)
 
         print str(program)
diff --git a/python/paddle/v2/framework/tests/test_lod_rank_table.py b/python/paddle/v2/framework/tests/test_lod_rank_table.py
index f635e716bcd0b338159235c1c66c490a14d53b07..2242d4391dc7c12e59f4f157b34b0525efcbc9a1 100644
--- a/python/paddle/v2/framework/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/framework/tests/test_lod_rank_table.py
@@ -1,6 +1,6 @@
 from paddle.v2.framework.layers import lod_rank_table, data
 from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_program
+from paddle.v2.framework.framework import g_main_program
 import paddle.v2.framework.core as core
 import numpy
 import unittest
@@ -19,7 +19,7 @@ class TestLoDRankTable(unittest.TestCase):
         tensor.set(numpy.random.random(size=(17, 100)), cpu)
         tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
 
-        exe.run(g_program, scope=scope, feed={'x': tensor})
+        exe.run(g_main_program, scope=scope, feed={'x': tensor})
         var = scope.find_var(rank_table.name)
         table = var.get_lod_rank_table()
         self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())
diff --git a/python/paddle/v2/framework/tests/test_lod_tensor_array.py b/python/paddle/v2/framework/tests/test_lod_tensor_array.py
new file mode 100644
index 0000000000000000000000000000000000000000..a433bcf622b14a1d2d33b5b98d555e1a21e4b9e8
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_lod_tensor_array.py
@@ -0,0 +1,38 @@
+import unittest
+import paddle.v2.framework.core as core
+import numpy
+
+
+class TestLoDTensorArray(unittest.TestCase):
+    def test_get_set(self):
+        scope = core.Scope()
+        arr = scope.var('tmp_lod_tensor_array')
+        tensor_array = arr.get_lod_tensor_array()
+        self.assertEqual(0, len(tensor_array))
+        cpu = core.CPUPlace()
+        for i in xrange(10):
+            t = core.LoDTensor()
+            t.set(numpy.array([i], dtype='float32'), cpu)
+            t.set_lod([[0, 1]])
+            tensor_array.append(t)
+
+        self.assertEqual(10, len(tensor_array))
+
+        for i in xrange(10):
+            t = tensor_array[i]
+            self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32'))
+            self.assertEqual([[0, 1]], t.lod())
+
+            t = core.LoDTensor()
+            t.set(numpy.array([i + 10], dtype='float32'), cpu)
+            t.set_lod([[0, 2]])
+            tensor_array[i] = t
+            t = tensor_array[i]
+            self.assertEqual(
+                numpy.array(t), numpy.array(
+                    [i + 10], dtype='float32'))
+            self.assertEqual([[0, 2]], t.lod())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_operator_desc.py b/python/paddle/v2/framework/tests/test_operator_desc.py
index 7355f72455ca4f821c9520d97162e3e0050383af..a0bc4e0b91602cfc90f91a1e2dd4bce22c0dbf6d 100644
--- a/python/paddle/v2/framework/tests/test_operator_desc.py
+++ b/python/paddle/v2/framework/tests/test_operator_desc.py
@@ -1,11 +1,11 @@
 import unittest
-from paddle.v2.framework.framework import Variable, Program, g_program
+from paddle.v2.framework.framework import Variable, Program, g_main_program
 import paddle.v2.framework.core as core
 
 
 class TestOperator(unittest.TestCase):
     def test_error_type(self):
-        block = g_program.create_block()
+        block = g_main_program.create_block()
         try:
             block.append_op()
             self.assertFail()
diff --git a/python/paddle/v2/framework/tests/test_parameter.py b/python/paddle/v2/framework/tests/test_parameter.py
index 1ac0cdd99f1b7c15d64ae9d2c465d5a9d563bd80..f04eb4cf27276b0f7da0793c97742ac42e4583be 100644
--- a/python/paddle/v2/framework/tests/test_parameter.py
+++ b/python/paddle/v2/framework/tests/test_parameter.py
@@ -1,11 +1,11 @@
 import unittest
-from paddle.v2.framework.framework import g_program
+from paddle.v2.framework.framework import g_main_program
 import paddle.v2.framework.core as core
 
 
 class TestParameter(unittest.TestCase):
     def test_param(self):
-        b = g_program.create_block()
+        b = g_main_program.create_block()
         param = b.create_parameter(
             name='fc.w',
             shape=[784, 100],
diff --git a/python/paddle/v2/framework/tests/test_program.py b/python/paddle/v2/framework/tests/test_program.py
index be020573b7dcd9f8dcd0f99d654dc8b2106abb2b..7be67b6614ee3302a319289b821a214a81b6f64e 100644
--- a/python/paddle/v2/framework/tests/test_program.py
+++ b/python/paddle/v2/framework/tests/test_program.py
@@ -2,35 +2,35 @@ import unittest
 
 import paddle.v2.framework.core as core
 from paddle.v2.framework.framework import Program
-from paddle.v2.framework.framework import g_program
+from paddle.v2.framework.framework import g_main_program
 
 
 class TestProgram(unittest.TestCase):
     def test_program(self):
-        b = g_program.current_block()
+        b = g_main_program.current_block()
         self.assertEqual(-1, b.parent_idx)
         self.assertEqual(0, b.idx)
 
-        b = g_program.create_block()
+        b = g_main_program.create_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
-        b = g_program.create_block()
+        b = g_main_program.create_block()
         self.assertEqual(2, b.idx)
         self.assertEqual(1, b.parent_idx)
 
-        g_program.rollback()
+        g_main_program.rollback()
 
-        b = g_program.current_block()
+        b = g_main_program.current_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
-        b = g_program.create_block()
+        b = g_main_program.create_block()
         self.assertEqual(3, b.idx)
         self.assertEqual(1, b.parent_idx)
 
-        g_program.rollback()
-        b = g_program.current_block()
+        g_main_program.rollback()
+        b = g_main_program.current_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
index 695236f3df6d34446038756055df83ebc86becd9..c3186e25b37fe112987017521daa336199085134 100644
--- a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
@@ -4,26 +4,26 @@ import paddle.v2.framework.nets as nets
 import paddle.v2.framework.core as core
 import paddle.v2.framework.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.framework import Program, g_main_program
 from paddle.v2.framework.executor import Executor
 
 import numpy as np
 
-init_program = Program()
-program = Program()
+startup_program = Program()
+main_program = Program()
 
 images = layers.data(
     name='pixel',
     shape=[1, 28, 28],
     data_type='float32',
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 label = layers.data(
     name='label',
     shape=[1],
     data_type='int64',
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 conv_pool_1 = nets.simple_img_conv_pool(
     input=images,
     filter_size=5,
@@ -31,8 +31,8 @@ conv_pool_1 = nets.simple_img_conv_pool(
     pool_size=2,
     pool_stride=2,
     act="relu",
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 conv_pool_2 = nets.simple_img_conv_pool(
     input=conv_pool_1,
     filter_size=5,
@@ -40,24 +40,30 @@ conv_pool_2 = nets.simple_img_conv_pool(
     pool_size=2,
     pool_stride=2,
     act="relu",
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 
 predict = layers.fc(input=conv_pool_2,
                     size=10,
                     act="softmax",
-                    program=program,
-                    init_program=init_program)
+                    main_program=main_program,
+                    startup_program=startup_program)
 cost = layers.cross_entropy(
-    input=predict, label=label, program=program, init_program=init_program)
-avg_cost = layers.mean(x=cost, program=program)
+    input=predict,
+    label=label,
+    main_program=main_program,
+    startup_program=startup_program)
+avg_cost = layers.mean(x=cost, main_program=main_program)
 accuracy = layers.accuracy(
-    input=predict, label=label, program=program, init_program=init_program)
+    input=predict,
+    label=label,
+    main_program=main_program,
+    startup_program=startup_program)
 
 # optimizer = optimizer.MomentumOptimizer(learning_rate=0.1 / 128.0,
 # momentum=0.9)
 optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
-opts = optimizer.minimize(avg_cost, init_program)
+opts = optimizer.minimize(avg_cost, startup_program)
 
 BATCH_SIZE = 50
 PASS_NUM = 3
@@ -69,7 +75,7 @@ train_reader = paddle.batch(
 place = core.CPUPlace()
 exe = Executor(place)
 
-exe.run(init_program, feed={}, fetch_list=[])
+exe.run(startup_program, feed={}, fetch_list=[])
 
 for pass_id in range(PASS_NUM):
     count = 0
@@ -84,7 +90,7 @@ for pass_id in range(PASS_NUM):
         tensor_img.set(img_data, place)
         tensor_y.set(y_data, place)
 
-        outs = exe.run(program,
+        outs = exe.run(main_program,
                        feed={"pixel": tensor_img,
                              "label": tensor_y},
                        fetch_list=[avg_cost, accuracy])
diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
index e848db170167944a975906b25ad54177ef2de5da..076cf882160cd53f45ef291d82ba57ada843a287 100644
--- a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
@@ -11,14 +11,14 @@ from paddle.v2.framework.initializer import UniformInitializer
 import numpy as np
 
 BATCH_SIZE = 128
-init_program = Program()
-program = Program()
+startup_program = Program()
+main_program = Program()
 image = layers.data(
     name='x',
     shape=[784],
     data_type='float32',
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 
 param_attr = {
     'name': None,
@@ -30,38 +30,45 @@ param_attr = {
 hidden1 = layers.fc(input=image,
                     size=128,
                     act='relu',
-                    program=program,
-                    init_program=init_program,
+                    main_program=main_program,
+                    startup_program=startup_program,
                     param_attr=param_attr)
 hidden2 = layers.fc(input=hidden1,
                     size=64,
                     act='relu',
-                    program=program,
-                    init_program=init_program,
+                    main_program=main_program,
+                    startup_program=startup_program,
                     param_attr=param_attr)
 
 predict = layers.fc(input=hidden2,
                     size=10,
                     act='softmax',
-                    program=program,
-                    init_program=init_program,
+                    main_program=main_program,
+                    startup_program=startup_program,
                     param_attr=param_attr)
 
 label = layers.data(
     name='y',
     shape=[1],
     data_type='int64',
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 
 cost = layers.cross_entropy(
-    input=predict, label=label, program=program, init_program=init_program)
-avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
+    input=predict,
+    label=label,
+    main_program=main_program,
+    startup_program=startup_program)
+avg_cost = layers.mean(
+    x=cost, main_program=main_program, startup_program=startup_program)
 accuracy = layers.accuracy(
-    input=predict, label=label, program=program, init_program=init_program)
+    input=predict,
+    label=label,
+    main_program=main_program,
+    startup_program=startup_program)
 
 optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-opts = optimizer.minimize(avg_cost, init_program)
+opts = optimizer.minimize(avg_cost, startup_program)
 
 train_reader = paddle.batch(
     paddle.reader.shuffle(
@@ -71,7 +78,7 @@ train_reader = paddle.batch(
 place = core.CPUPlace()
 exe = Executor(place)
 
-exe.run(init_program, feed={}, fetch_list=[])
+exe.run(startup_program, feed={}, fetch_list=[])
 
 PASS_NUM = 100
 for pass_id in range(PASS_NUM):
@@ -86,7 +93,7 @@ for pass_id in range(PASS_NUM):
         tensor_y = core.LoDTensor()
         tensor_y.set(y_data, place)
 
-        outs = exe.run(program,
+        outs = exe.run(main_program,
                        feed={'x': tensor_x,
                              'y': tensor_y},
                        fetch_list=[avg_cost, accuracy])
diff --git a/python/paddle/v2/framework/tests/test_recommender_system.py b/python/paddle/v2/framework/tests/test_recommender_system.py
index 7bc3f84a935884d4b7532a848f90a4648e92896a..7e54f0d1b86461954b05d1e3c1d5f49f71ff9068 100644
--- a/python/paddle/v2/framework/tests/test_recommender_system.py
+++ b/python/paddle/v2/framework/tests/test_recommender_system.py
@@ -4,13 +4,13 @@ import paddle.v2.framework.nets as nets
 import paddle.v2.framework.core as core
 import paddle.v2.framework.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.framework import Program, g_main_program
 from paddle.v2.framework.executor import Executor
 
 import numpy as np
 
-init_program = Program()
-program = Program()
+startup_program = Program()
+main_program = Program()
 is_sparse = True
 use_gpu = False
 BATCH_SIZE = 256
@@ -26,8 +26,8 @@ def get_usr_combined_features():
         name='user_id',
         shape=[1],
         data_type='int64',
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     usr_emb = layers.embedding(
         input=uid,
@@ -35,13 +35,13 @@ def get_usr_combined_features():
         size=[USR_DICT_SIZE, 32],
         param_attr={'name': 'user_table'},
         is_sparse=is_sparse,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     usr_fc = layers.fc(input=usr_emb,
                        size=32,
-                       program=program,
-                       init_program=init_program)
+                       main_program=main_program,
+                       startup_program=startup_program)
 
     USR_GENDER_DICT_SIZE = 2
 
@@ -49,75 +49,75 @@ def get_usr_combined_features():
         name='gender_id',
         shape=[1],
         data_type='int64',
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     usr_gender_emb = layers.embedding(
         input=usr_gender_id,
         size=[USR_GENDER_DICT_SIZE, 16],
         param_attr={'name': 'gender_table'},
         is_sparse=is_sparse,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     usr_gender_fc = layers.fc(input=usr_gender_emb,
                               size=16,
-                              program=program,
-                              init_program=init_program)
+                              main_program=main_program,
+                              startup_program=startup_program)
 
     USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
     usr_age_id = layers.data(
         name='age_id',
         shape=[1],
         data_type="int64",
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     usr_age_emb = layers.embedding(
         input=usr_age_id,
         size=[USR_AGE_DICT_SIZE, 16],
         is_sparse=is_sparse,
         param_attr={'name': 'age_table'},
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     usr_age_fc = layers.fc(input=usr_age_emb,
                            size=16,
-                           program=program,
-                           init_program=init_program)
+                           main_program=main_program,
+                           startup_program=startup_program)
 
     USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
     usr_job_id = layers.data(
         name='job_id',
         shape=[1],
         data_type="int64",
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     usr_job_emb = layers.embedding(
         input=usr_job_id,
         size=[USR_JOB_DICT_SIZE, 16],
         param_attr={'name': 'job_table'},
         is_sparse=is_sparse,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     usr_job_fc = layers.fc(input=usr_job_emb,
                            size=16,
-                           program=program,
-                           init_program=init_program)
+                           main_program=main_program,
+                           startup_program=startup_program)
 
     concat_embed = layers.concat(
         input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc],
         axis=1,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     usr_combined_features = layers.fc(input=concat_embed,
                                       size=200,
                                       act="tanh",
-                                      program=program,
-                                      init_program=init_program)
+                                      main_program=main_program,
+                                      startup_program=startup_program)
 
     return usr_combined_features
 
@@ -130,8 +130,8 @@ def get_mov_combined_features():
         name='movie_id',
         shape=[1],
         data_type='int64',
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     mov_emb = layers.embedding(
         input=mov_id,
@@ -139,13 +139,13 @@ def get_mov_combined_features():
         size=[MOV_DICT_SIZE, 32],
         param_attr={'name': 'movie_table'},
         is_sparse=is_sparse,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     mov_fc = layers.fc(input=mov_emb,
                        size=32,
-                       program=program,
-                       init_program=init_program)
+                       main_program=main_program,
+                       startup_program=startup_program)
 
     CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
 
@@ -153,21 +153,21 @@ def get_mov_combined_features():
         name='category_id',
         shape=[1],
         data_type='int64',
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     mov_categories_emb = layers.embedding(
         input=category_id,
         size=[CATEGORY_DICT_SIZE, 32],
         is_sparse=is_sparse,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     mov_categories_hidden = layers.sequence_pool(
         input=mov_categories_emb,
         pool_type="sum",
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
 
@@ -175,15 +175,15 @@ def get_mov_combined_features():
         name='movie_title',
         shape=[1],
         data_type='int64',
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     mov_title_emb = layers.embedding(
         input=mov_title_id,
         size=[MOV_TITLE_DICT_SIZE, 32],
         is_sparse=is_sparse,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     mov_title_conv = nets.sequence_conv_pool(
         input=mov_title_emb,
@@ -191,21 +191,21 @@ def get_mov_combined_features():
         filter_size=3,
         act="tanh",
         pool_type="sum",
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     concat_embed = layers.concat(
         input=[mov_fc, mov_categories_hidden, mov_title_conv],
         axis=1,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     # FIXME(dzh) : need tanh operator
     mov_combined_features = layers.fc(input=concat_embed,
                                       size=200,
                                       act="tanh",
-                                      program=program,
-                                      init_program=init_program)
+                                      main_program=main_program,
+                                      startup_program=startup_program)
 
     return mov_combined_features
 
@@ -218,24 +218,26 @@ def model():
     inference = layers.cos_sim(
         X=usr_combined_features,
         Y=mov_combined_features,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     label = layers.data(
         name='score',
         shape=[1],
         data_type='float32',
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     square_cost = layers.square_error_cost(
         input=inference,
         label=label,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     avg_cost = layers.mean(
-        x=square_cost, program=program, init_program=init_program)
+        x=square_cost,
+        main_program=main_program,
+        startup_program=startup_program)
 
     return avg_cost
 
@@ -243,8 +245,8 @@ def model():
 def main():
     cost = model()
     sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.2)
-    opts = sgd_optimizer.minimize(cost, init_program=init_program)
-    block = program.block(0)
+    opts = sgd_optimizer.minimize(cost, startup_program=startup_program)
+    block = main_program.block(0)
 
     if use_gpu:
         place = core.GPUPlace(0)
@@ -252,7 +254,7 @@ def main():
         place = core.CPUPlace()
 
     exe = Executor(place)
-    exe.run(init_program, feed={}, fetch_list=[])
+    exe.run(startup_program, feed={}, fetch_list=[])
 
     train_reader = paddle.batch(
         paddle.reader.shuffle(
@@ -301,7 +303,7 @@ def main():
     PASS_NUM = 100
     for pass_id in range(PASS_NUM):
         for data in train_reader():
-            outs = exe.run(program,
+            outs = exe.run(main_program,
                            feed=func_feed(feeding, data),
                            fetch_list=[cost])
             out = np.array(outs[0])
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
index 157befd2effa252356fdb51b273fefd4bb7ae31c..001de349d1f62e661335e63f7519c59697a1030c 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -99,17 +99,17 @@ class RecurrentOpTest1(unittest.TestCase):
     batch_size = 1
     sent_len = 1
 
-    def init_program(self):
-        self.program = Program()
-        self.init_program = Program()
+    def setup_program(self):
+        self.main_program = Program()
+        self.startup_program = Program()
         self.p_info = {
-            "program": self.program,
-            "init_program": self.init_program
+            "main_program": self.main_program,
+            "startup_program": self.startup_program
         }
         self.place = core.CPUPlace()
 
     def setUp(self):
-        self.init_program()
+        self.setup_program()
         self.data_field = {"x", "h_boot"}
 
         self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
@@ -125,13 +125,15 @@ class RecurrentOpTest1(unittest.TestCase):
             name='x',
             append_batch_size=False,
             **self.p_info)
+        x.stop_gradient = False
         h_boot = data(
             shape=[self.input_dim],
             data_type='float32',
             name='h_boot',
             **self.p_info)
+        h_boot.stop_gradient = False
 
-        rnn = StaticRNN(program=self.program)
+        rnn = StaticRNN(main_program=self.main_program)
         with rnn.step():
             h_pre = rnn.memory(init=h_boot)
             x_t = rnn.step_input(x)
@@ -153,7 +155,7 @@ class RecurrentOpTest1(unittest.TestCase):
             for x in self.data_field
         }
         exe = Executor(self.place)
-        out = exe.run(self.program,
+        out = exe.run(self.main_program,
                       feed=self.feed_map,
                       fetch_list=[self.output])
 
@@ -165,12 +167,14 @@ class RecurrentOpTest1(unittest.TestCase):
             for x in self.data_field
         }
         fetch_list = [
-            self.program.global_block().var(x + "@GRAD")
+            self.main_program.global_block().var(x + "@GRAD")
             for x in self.data_field
         ]
 
         exe = Executor(self.place)
-        return exe.run(self.program, feed=self.feed_map, fetch_list=fetch_list)
+        return exe.run(self.main_program,
+                       feed=self.feed_map,
+                       fetch_list=fetch_list)
 
     def test_backward(self):
         self.check_forward()
@@ -237,7 +241,7 @@ class RecurrentOpTest2(RecurrentOpTest1):
     sent_len = 2
 
     def setUp(self):
-        self.init_program()
+        self.setup_program()
 
         self.data_field = {"x", "h_boot", "W", "U"}
 
@@ -254,13 +258,15 @@ class RecurrentOpTest2(RecurrentOpTest1):
             name='x',
             append_batch_size=False,
             **self.p_info)
+        x.stop_gradient = False
         h_boot = data(
             shape=[self.input_dim],
             data_type='float32',
             name='h_boot',
             **self.p_info)
+        h_boot.stop_gradient = False
 
-        rnn = StaticRNN(program=self.program)
+        rnn = StaticRNN(main_program=self.main_program)
         with rnn.step():
             h_pre = rnn.memory(init=h_boot)
             x_t = rnn.step_input(x)
@@ -333,7 +339,7 @@ class RecurrentOpTest3(RecurrentOpTest1):
     sent_len = 2
 
     def setUp(self):
-        self.init_program()
+        self.setup_program()
 
         self.data_field = {"x", "h_boot1", "h_boot2"}
 
@@ -351,20 +357,23 @@ class RecurrentOpTest3(RecurrentOpTest1):
             name='x',
             append_batch_size=False,
             **self.p_info)
+        x.stop_gradient = False
         h_boot1 = data(
             shape=[self.batch_size, self.input_dim],
             data_type='float32',
             name='h_boot1',
             append_batch_size=False,
             **self.p_info)
+        h_boot1.stop_gradient = False
         h_boot2 = data(
             shape=[self.batch_size, self.input_dim],
             data_type='float32',
             name='h_boot2',
             append_batch_size=False,
             **self.p_info)
+        h_boot2.stop_gradient = False
 
-        rnn = StaticRNN(program=self.program)
+        rnn = StaticRNN(main_program=self.main_program)
         with rnn.step():
             h_pre1 = rnn.memory(init=h_boot1)
             h_pre2 = rnn.memory(init=h_boot2)
diff --git a/python/paddle/v2/framework/tests/test_seq_pool.py b/python/paddle/v2/framework/tests/test_seq_pool.py
index efc4920124afb539017a3b3f211c7320da68ffef..512d8b315f29cecf79ae274dca491c240f3447a1 100644
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
@@ -29,6 +29,9 @@ class TestSeqAvgPool(OpTest):
         self.check_output()
 
     def test_check_grad(self):
+        # Remove MaxIndex after check_grad is refined.
+        self.outputs['MaxIndex'] = \
+            np.zeros(self.outputs['Out'].shape).astype('int32')
         self.check_grad(["X"], "Out")
 
 
@@ -85,31 +88,53 @@ class TestSeqSqrtPool2D(TestSeqAvgPool2D):
             out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(len), (3, 17))
 
     def test_check_grad(self):
+        # Remove MaxIndex after check_grad is refined.
+        self.outputs['MaxIndex'] = \
+            np.zeros(self.outputs['Out'].shape).astype('int32')
         self.check_grad(["X"], "Out", max_relative_error=0.06)
 
 
 class TestSeqMaxPool(TestSeqAvgPool):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
+        lod = [[0, 4, 5, 8, 13]]
+        for i in range(4):
+            l = lod[0][i + 1] - lod[0][i]
+            x[lod[0][i] + np.random.randint(l), :] += 2.0
+
+        self.inputs = {'X': (x, lod)}
+
+        out = np.zeros((4, 23)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+
     def compute(self, x, lod, out):
         self.attrs = {'pooltype': "MAX"}
         for i in range(4):
             sub_x = x[lod[0][i]:lod[0][i + 1], :]
             out[i] = np.amax(sub_x, axis=0)
 
-    def test_check_grad(self):
-        # Remove MaxPool2D from gradient check to confirm the success of CI.
-        return
-
 
 class TestSeqMaxPool2D(TestSeqAvgPool2D):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
+        lod = [[0, 4, 5, 8, 13]]
+        self.inputs = {'X': (x, lod)}
+        for i in range(4):
+            l = lod[0][i + 1] - lod[0][i]
+            x[lod[0][i] + np.random.randint(l), :] += 1.0
+
+        out = np.zeros((4, 3, 11)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+
     def compute(self, x, lod, out):
         self.attrs = {'pooltype': "MAX"}
         for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
-            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 17))
-
-    def test_check_grad(self):
-        # Remove MaxPool2D from gradient check to confirm the success of CI.
-        return
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 11))
+            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
 
 
 class TestSeqLastPool(TestSeqAvgPool):
diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
index f93feb20696f126423bc9412eab3b4aa41b19426..c2f07f9096c69f3d4977f9444bdd5dcda8028973 100644
--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
@@ -12,30 +12,30 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
 
     def setUp(self):
         self.op_type = "softmax_with_cross_entropy"
-        batch_size = 3
+        batch_size = 2
         class_num = 37
 
         logits = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float32")
+                                   [batch_size, class_num]).astype("float64")
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
-        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int32")
+        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
 
         cross_entropy = np.asmatrix(
             [[-np.log(softmax[i][labels[i][0]])]
              for i in range(softmax.shape[0])],
-            dtype="float32")
+            dtype="float64")
 
         self.inputs = {"Logits": logits, "Label": labels}
         self.outputs = {
-            "Softmax": softmax.astype('float32'),
-            "Loss": cross_entropy.astype('float32')
+            "Softmax": softmax.astype("float64"),
+            "Loss": cross_entropy.astype("float64")
         }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
+        self.check_grad(["Logits"], "Loss")
 
 
 class TestSoftmaxWithCrossEntropyOp2(OpTest):
@@ -49,19 +49,19 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
         class_num = 37
 
         logits = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float32")
+                                   [batch_size, class_num]).astype("float64")
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
         labels = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float32")
+                                   [batch_size, class_num]).astype("float64")
         labels /= np.sum(labels, axis=1, keepdims=True)
 
         cross_entropy = (-labels * np.log(softmax)).sum(
-            axis=1, keepdims=True).astype("float32")
+            axis=1, keepdims=True).astype("float64")
 
         self.inputs = {"Logits": logits, "Label": labels}
         self.outputs = {
-            "Softmax": softmax.astype('float32'),
-            "Loss": cross_entropy.astype('float32')
+            "Softmax": softmax.astype("float64"),
+            "Loss": cross_entropy.astype("float64")
         }
         self.attrs = {"soft_label": True}
 
@@ -69,9 +69,8 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
+        self.check_grad(["Logits"], "Loss")
 
 
 if __name__ == "__main__":
-    exit(0)  # FIXME: xe has bug
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py b/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py
index dcbb34ccfcff65086dff1cb1ffd859c4c1e0d7ca..eb377e9264b6031e9bf484a90b7c2b39442407f1 100644
--- a/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py
@@ -4,7 +4,7 @@ import paddle.v2.framework.nets as nets
 import paddle.v2.framework.core as core
 import paddle.v2.framework.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program, g_program, g_init_program
+from paddle.v2.framework.framework import Program, g_main_program, g_startup_program
 from paddle.v2.framework.executor import Executor
 
 import numpy as np
@@ -70,7 +70,7 @@ def main():
     place = core.CPUPlace()
     exe = Executor(place)
 
-    exe.run(g_init_program)
+    exe.run(g_startup_program)
 
     for pass_id in xrange(PASS_NUM):
         for data in train_data():
@@ -82,7 +82,7 @@ def main():
             tensor_label = core.LoDTensor()
             tensor_label.set(label, place)
 
-            outs = exe.run(g_program,
+            outs = exe.run(g_main_program,
                            feed={"words": tensor_words,
                                  "label": tensor_label},
                            fetch_list=[cost, acc])
diff --git a/python/paddle/v2/framework/tests/test_variable.py b/python/paddle/v2/framework/tests/test_variable.py
index c670ca19afbd778747303cb002666aa2a5e62c37..03115f10a5a494424c6f8310c544c569be818e5b 100644
--- a/python/paddle/v2/framework/tests/test_variable.py
+++ b/python/paddle/v2/framework/tests/test_variable.py
@@ -1,5 +1,5 @@
 import unittest
-from paddle.v2.framework.framework import Variable, g_program, Program
+from paddle.v2.framework.framework import Variable, g_main_program, Program
 import paddle.v2.framework.core as core
 import numpy as np
 
@@ -18,7 +18,7 @@ class TestVariable(unittest.TestCase):
         self.assertRaises(ValueError, lambda: convert("int8"))
 
     def test_var(self):
-        b = g_program.current_block()
+        b = g_main_program.current_block()
         w = b.create_var(
             dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
         self.assertNotEqual(str(w), "")
diff --git a/python/paddle/v2/framework/tests/test_word2vec.py b/python/paddle/v2/framework/tests/test_word2vec.py
index 2aaf8d6a2b2023416ed8daf93d9a252bd4b0b05c..6c3a448ec79717da6b81f51d9a4dccf038539b79 100644
--- a/python/paddle/v2/framework/tests/test_word2vec.py
+++ b/python/paddle/v2/framework/tests/test_word2vec.py
@@ -3,13 +3,13 @@ import paddle.v2.framework.layers as layers
 import paddle.v2.framework.core as core
 import paddle.v2.framework.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.framework import Program, g_main_program
 from paddle.v2.framework.executor import Executor
 
 import numpy as np
 
-init_program = Program()
-program = Program()
+startup_program = Program()
+main_program = Program()
 
 embed_size = 32
 hidden_size = 256
@@ -24,32 +24,32 @@ first_word = layers.data(
     name='firstw',
     shape=[1],
     data_type='int64',
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 second_word = layers.data(
     name='secondw',
     shape=[1],
     data_type='int64',
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 third_word = layers.data(
     name='thirdw',
     shape=[1],
     data_type='int64',
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 forth_word = layers.data(
     name='forthw',
     shape=[1],
     data_type='int64',
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 next_word = layers.data(
     name='nextw',
     shape=[1],
     data_type='int64',
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 
 embed_first = layers.embedding(
     input=first_word,
@@ -57,16 +57,16 @@ embed_first = layers.embedding(
     data_type='float32',
     is_sparse=is_sparse,
     param_attr={'name': 'shared_w'},
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 embed_second = layers.embedding(
     input=second_word,
     size=[dict_size, embed_size],
     data_type='float32',
     is_sparse=is_sparse,
     param_attr={'name': 'shared_w'},
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 
 embed_third = layers.embedding(
     input=third_word,
@@ -74,42 +74,43 @@ embed_third = layers.embedding(
     data_type='float32',
     is_sparse=is_sparse,
     param_attr={'name': 'shared_w'},
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 embed_forth = layers.embedding(
     input=forth_word,
     size=[dict_size, embed_size],
     data_type='float32',
     is_sparse=is_sparse,
     param_attr={'name': 'shared_w'},
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 
 concat_embed = layers.concat(
     input=[embed_first, embed_second, embed_third, embed_forth],
     axis=1,
-    program=program,
-    init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
 
 hidden1 = layers.fc(input=concat_embed,
                     size=hidden_size,
                     act='sigmoid',
-                    program=program,
-                    init_program=init_program)
+                    main_program=main_program,
+                    startup_program=startup_program)
 predict_word = layers.fc(input=hidden1,
                          size=dict_size,
                          act='softmax',
-                         program=program,
-                         init_program=init_program)
+                         main_program=main_program,
+                         startup_program=startup_program)
 cost = layers.cross_entropy(
     input=predict_word,
     label=next_word,
-    program=program,
-    init_program=init_program)
-avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
+    main_program=main_program,
+    startup_program=startup_program)
+avg_cost = layers.mean(
+    x=cost, main_program=main_program, startup_program=startup_program)
 
 sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost, init_program)
+opts = sgd_optimizer.minimize(avg_cost, startup_program)
 
 train_reader = paddle.batch(
     paddle.dataset.imikolov.train(word_dict, N), batch_size)
@@ -117,7 +118,7 @@ train_reader = paddle.batch(
 place = core.CPUPlace()
 exe = Executor(place)
 
-exe.run(init_program, feed={}, fetch_list=[])
+exe.run(startup_program, feed={}, fetch_list=[])
 PASS_NUM = 100
 for pass_id in range(PASS_NUM):
     for data in train_reader():
@@ -145,7 +146,7 @@ for pass_id in range(PASS_NUM):
         next_tensor = core.LoDTensor()
         next_tensor.set(next_data, place)
 
-        outs = exe.run(program,
+        outs = exe.run(main_program,
                        feed={
                            'firstw': first_tensor,
                            'secondw': second_tensor,
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index 29f0945eb4c88eab8fa9ee83f455190dfd473aa4..94d706b1d6289a7bffbdfb161c35d44c78fdf46f 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -11,11 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-Optimizers(update equation) for SGD method.
-
-TODO(yuyang18): Complete comments.
-"""
 
 import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
 import paddle.trainer_config_helpers.optimizers as v1_optimizers
@@ -101,32 +96,37 @@ class Optimizer(object):
 
 class Momentum(Optimizer):
     """
-    SGD Optimizer.
-
-    SGD is an optimization method, trying to find a neural network that
-    minimize the "cost/error" of it by iteration. In paddle's implementation
-    SGD Optimizer is synchronized, which means all gradients will be wait to
-    calculate and reduced into one gradient, then do optimize operation.
+    Momentum Optimizer.
 
-    The neural network consider the learning problem of minimizing an objective
-    function, that has the form of a sum
+    When sparse=False, the momentum update formula is as follows:
 
     ..  math::
 
-        Q(w) = \\sum_{i}^{n} Q_i(w)
+        v_{t} &= k * v_{t-1} - \\gamma_t / (g_{t} + \\lambda w_{t-1}) \\\\
+        w_{t} &= w_{t-1} + v_{t} \\\\
 
-    The value of function Q sometimes is the cost of neural network (Mean
-    Square Error between prediction and label for example). The function Q is
-    parametrised by w, the weight/bias of neural network. And weights is what to
-    be learned. The i is the i-th observation in (trainning) data.
+    where, :math:`k` is momentum, :math:`\\lambda` is decay rate,
+    :math:`\\gamma_t` is learning rate at the t'th iteration.
+    :math:`w_{t}` is the weight as the t'th iteration.
+    And the :math:`v_{t}` is the history momentum variable.
 
-    So, the SGD method will optimize the weight by
+    When sparse=True, the update scheme:
 
     ..  math::
 
-        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
-
-    where :math:`\\eta` is learning rate. And :math:`n` is batch size.
+        \\alpha_t &= \\alpha_{t-1} / k \\\\
+        \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\
+        u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\
+        v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\
+        \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t
+    
+    where :math:`k` is momentum, :math:`\\lambda` is decay rate, 
+    :math:`\\gamma_t` is learning rate at the t'th iteration.
+
+    :param momentum: the momentum factor.
+    :type momentum: float
+    :param sparse: with sparse support or not, False by default.
+    :type sparse: bool
     """
 
     def __init__(self, momentum=None, sparse=False, **kwargs):
@@ -146,7 +146,7 @@ class Adam(Optimizer):
 
         m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
         v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
 
     :param beta1: the :math:`\\beta_1` in equation.
     :type beta1: float
diff --git a/python/setup.py.in b/python/setup.py.in
index 87b3823e52604b889cdee76bc696a1ae9b9de802..5348c2d8d7e9b5adc5fe93e2943bef149ba047cc 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,4 +1,4 @@
-from setuptools import setup, Distribution
+from setuptools import setup, Distribution, Extension
 class BinaryDistribution(Distribution):
     def has_ext_modules(foo):
         return True
@@ -41,6 +41,7 @@ setup(name='paddlepaddle',
       description='Parallel Distributed Deep Learning',
       install_requires=setup_requires,
       packages=packages,
+      ext_modules=[Extension('_foo', ['stub.cc'])],
       package_data={
         'paddle.v2.master': ['libpaddle_master.so'],
         'paddle.v2.framework': ['core.so'],
@@ -54,6 +55,5 @@ setup(name='paddlepaddle',
           'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
       },
       scripts=paddle_bins,
-      distclass=BinaryDistribution,
       data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
 )