diff --git a/CMakeLists.txt b/CMakeLists.txt
index fd3582a1bca199d62d19550ffdd1efe9db520fa7..ae8728f4d4c22f45f13a283a448e907337f37f7a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,8 +36,7 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
+option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -82,10 +81,8 @@ if(ANDROID OR IOS)
         "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
     set(WITH_RDMA OFF CACHE STRING
         "Disable RDMA when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKLDNN OFF CACHE STRING
-        "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKLML OFF CACHE STRING
-        "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
+    set(WITH_MKL OFF CACHE STRING
+        "Disable MKL when cross-compiling for Android and iOS" FORCE)
 
     # Compile PaddlePaddle mobile inference library
     if (NOT WITH_C_API)
@@ -111,6 +108,14 @@ else()
     set(THIRD_PARTY_BUILD_TYPE Release)
 endif()
 
+set(WITH_MKLML ${WITH_MKL})  # WITH_MKL is the master switch; MKLML follows it directly
+if (WITH_MKL AND AVX2_FOUND)  # bare name on purpose: an empty/undefined AVX2_FOUND is simply false
+    set(WITH_MKLDNN ON)
+else()
+    message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
+    set(WITH_MKLDNN OFF)  # MKL-DNN is enabled only when both WITH_MKL and AVX2 are available
+endif()
+
 ########################################################################################
 
 include(external/mklml)     # download mklml package
@@ -158,14 +163,15 @@ set(EXTERNAL_LIBS
 )
 
 if(WITH_GPU)
-    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
-    if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
-    endif(NOT WITH_DSO)
+  include(cuda)
 endif(WITH_GPU)
 
+if(WITH_MKLML)
+    list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
+endif()
+
 if(WITH_MKLDNN)
-    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
 endif()
 
 if(USE_NNPACK)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 24ddb24399dabeec9b8e5faf36be3eb21f420111..e550ec285668ea25757eeee9e7c5dc48fc9d339d 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -76,27 +76,14 @@ else()
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
 
-if(WITH_MKLDNN)
-    add_definitions(-DPADDLE_USE_MKLDNN)
-    if (WITH_MKLML AND MKLDNN_IOMP_DIR)
-        message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
-        set(OPENMP_FLAGS "-fopenmp")
-        set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-        set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
-    else()
-        find_package(OpenMP)
-        if(OPENMP_FOUND)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-        else()
-            message(WARNING "Can not find OpenMP."
-                 "Some performance features in MKLDNN may not be available")
-        endif()
-    endif()
-
-endif(WITH_MKLDNN)
+if (WITH_MKLML AND MKLML_IOMP_LIB)
+    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
+    set(OPENMP_FLAGS "-fopenmp")
+    set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+    set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
+endif()
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake
index 310450f7d009dc0cdae9c0079a96445af8ec8f95..d3f5bf6852b3b295f3b5806b0577a880b0ce6ba6 100644
--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
 # Set the architecture for iOS
 if(NOT DEFINED IOS_ARCH)
   if(IOS_PLATFORM STREQUAL "OS")
-    # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
-    set(IOS_ARCH "arm64")
+    set(IOS_ARCH "armv7;armv7s;arm64")
   elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
-    # FIXME(liuyiqun): support "i386;x86_64" future
-    set(IOS_ARCH "x86_64")
+    set(IOS_ARCH "i386;x86_64")
   endif()
 endif()
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string  "Build architecture for iOS")
@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_
 
 # Hidden visibilty is required for cxx on iOS 
 set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
-set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
+set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
 
 set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
 
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..6bea7cf3022242ce48cc882915f7e71810937283
--- /dev/null
+++ b/cmake/cuda.cmake
@@ -0,0 +1,188 @@
+if(NOT WITH_GPU)
+    return()
+endif()
+
+set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
+set(paddle_known_gpu_archs7 "30 35 50 52")
+set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
+
+######################################################################################
+# A function for automatic detection of GPUs installed  (if autodetection is enabled)
+# Usage:
+#   detect_installed_gpus(out_variable)
+function(detect_installed_gpus out_variable)
+  if(NOT CUDA_gpu_detect_output)
+    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
+
+    file(WRITE ${cufile} ""
+      "#include <cstdio>\n"
+      "int main() {\n"
+      "  int count = 0;\n"
+      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+      "  if (count == 0) return -1;\n"
+      "  for (int device = 0; device < count; ++device) {\n"
+      "    cudaDeviceProp prop;\n"
+      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
+      "  }\n"
+      "  return 0;\n"
+      "}\n")
+
+    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
+                    "--run" "${cufile}"
+                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
+                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    if(nvcc_res EQUAL 0)
+      # only keep the last line of nvcc_out
+      STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
+      STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
+      list(GET nvcc_out -1 nvcc_out)
+      string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
+      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
+    endif()
+  endif()
+
+  if(NOT CUDA_gpu_detect_output)
+    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
+    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+  else()
+    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
+  endif()
+endfunction()
+
+
+########################################################################
+# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
+# Usage:
+#   select_nvcc_arch_flags(out_variable)
+function(select_nvcc_arch_flags out_variable)
+  # Accepted CUDA_ARCH_NAME values; each one needs a matching branch in the arch selection below
+  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual")
+  set(archs_name_default "All")
+  if(NOT CMAKE_CROSSCOMPILING)
+    list(APPEND archs_names "Auto")  # "Auto" probes the GPUs of the build machine, so only offer it for native builds
+  endif()
+
+  # register the allowed CUDA_ARCH_NAME values (so cmake-gui shows them as a drop-down list)
+  set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
+  set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
+  mark_as_advanced(CUDA_ARCH_NAME)
+
+  # verify CUDA_ARCH_NAME value
+  if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
+    string(REPLACE ";" ", " archs_names "${archs_names}")
+    message(FATAL_ERROR "Only ${archs_names} architeture names are supported.")
+  endif()
+
+  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
+    set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
+    set(CUDA_ARCH_PTX "50"                     CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
+    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
+  else()
+    unset(CUDA_ARCH_BIN CACHE)
+    unset(CUDA_ARCH_PTX CACHE)
+  endif()
+
+  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
+    set(cuda_arch_bin "30 35")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
+    set(cuda_arch_bin "50")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    set(cuda_arch_bin "60 61")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
+    set(cuda_arch_bin "70")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
+    set(cuda_arch_bin ${paddle_known_gpu_archs})
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
+    detect_installed_gpus(cuda_arch_bin)
+  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
+    set(cuda_arch_bin ${CUDA_ARCH_BIN})
+  endif()
+
+  # remove dots and convert to lists
+  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
+  string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")
+  list(REMOVE_DUPLICATES cuda_arch_bin)
+  list(REMOVE_DUPLICATES cuda_arch_ptx)
+
+  set(nvcc_flags "")
+  set(nvcc_archs_readable "")
+
+  # Tell NVCC to add binaries for the specified GPUs
+  foreach(arch ${cuda_arch_bin})
+    if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
+      # User explicitly specified PTX for the concrete BIN
+      list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
+      list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
+    else()
+      # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
+      list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
+      list(APPEND nvcc_archs_readable sm_${arch})
+    endif()
+  endforeach()
+
+  # Tell NVCC to add PTX intermediate code for the specified architectures
+  foreach(arch ${cuda_arch_ptx})
+    list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
+    list(APPEND nvcc_archs_readable compute_${arch})
+  endforeach()
+
+  string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
+  set(${out_variable}          ${nvcc_flags}          PARENT_SCOPE)
+  set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
+endfunction()
+
+message(STATUS "CUDA detected: " ${CUDA_VERSION})
+if (CUDA_VERSION VERSION_LESS 7.0)
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs})  # no-op, full list kept; NOTE(review): list has 60/61/70 - confirm pre-7.0 nvcc accepts them
+elseif (CUDA_VERSION VERSION_LESS 8.0) # CUDA 7.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+elseif (CUDA_VERSION VERSION_LESS 9.0) # CUDA 8.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
+  # warning for now.
+  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
+endif()
+
+include_directories(${CUDA_INCLUDE_DIRS})
+list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+if(NOT WITH_DSO)
+    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+endif(NOT WITH_DSO)
+
+# setting nvcc arch flags
+select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
+list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
+message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
+
+# Do not forward host compiler flags to nvcc; the nvcc flags (incl. -std=c++11) are appended explicitly below
+set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
+# So, don't set these flags here.
+list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
+list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+# Set --expt-relaxed-constexpr to suppress Eigen warnings
+list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
+
+if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+endif()
+
+mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
+mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 5a06825beb73e85d8a55b7b578b187bee2c4340c..fc52d339d7a336b44c97f2e0a9fc8d6604854365 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
 
 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
     SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
-    SET(MKLDNN_MKLROOT   ${MKLML_ROOT})
-    SET(MKLDNN_IOMP_LIB  ${MKLML_IOMP_LIB})
-    SET(MKLDNN_IOMP_DIR  ${MKLML_LIB_DIR})
-    MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
+    MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
+ELSE()
+    MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()
 
 SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
@@ -57,15 +56,16 @@ ExternalProject_Add(
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
-    CMAKE_ARGS          -DMKLROOT=${MKLDNN_MKLROOT}
+    CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
     CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
     CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-                        -DMKLROOT:PATH=${MKLDNN_MKLROOT}
+                        -DMKLROOT:PATH=${MKLML_ROOT}
 )
 
 ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
 ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
-MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}")
+MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
+add_definitions(-DPADDLE_USE_MKLDNN)
 LIST(APPEND external_project_dependencies mkldnn)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 324e29f931ecbb6beab2d363daa01a19b1a56b3e..4c4f59656dae68739f2f07f3febd510e727fe2dd 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND})
         "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
         CACHE FILEPATH "openblas library." FORCE)
 
-    SET(OPENBLAS_CC "${CMAKE_C_COMPILER}")
+    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
 
     IF(CMAKE_CROSSCOMPILING)
         SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
                 SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
             ENDIF()
         ELSEIF(IOS)
-            # FIXME(liuyiqun): support multiple architectures
-            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
-            SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
-            IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
-                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
-                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
-            ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+            IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+                SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
+                SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
                 SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
                 SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
+            ELSE()
+                MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
+                       "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
             ENDIF()
         ELSEIF(RPI)
             # use hardfp
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 8bd058222880b4df3b08da09c02f9fe7f1d0ee66..a8e1aca49c97df256b1269c286b0bce7732fa932 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
 INCLUDE(ExternalProject)
 
 SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 4593ae6180b6d7deb61d897eb634b17ac0bb1683..2b125cef6aa8d1021afe8a7a0d232d84d36be4bc 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -149,58 +149,3 @@ endforeach()
 foreach(flag ${GPU_COMMON_FLAGS})
     safe_set_nvflag(${flag})
 endforeach()
-
-
-set(CUDA_PROPAGATE_HOST_FLAGS OFF)
-
-# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
-# So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
-LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
-
-if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
-endif()
-
-function(specify_cuda_arch cuda_version cuda_arch)
-    if(${cuda_version} VERSION_GREATER "8.0")
-        foreach(capability 61 62)
-          if(${cuda_arch} STREQUAL ${capability})
-            list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-          endif()
-        endforeach()
-    elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53")
-        list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-    endif()
-endfunction()
-
-# Common gpu architectures: Kepler, Maxwell
-foreach(capability 30 35 50)
-      list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
-endforeach()
-
-if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
-      list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
-endif()
-
-# Modern gpu architectures: Pascal
-if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
-      list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
-      list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
-endif()
-
-# Custom gpu architecture
-set(CUDA_ARCH)
-
-if(CUDA_ARCH)
-  specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
-endif()
-
-set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
-
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 117ab7f49cdf4a568cd203b2b17767643d0b2d50..ad905ab55ba3537054fa5b30b5fca4d83c406702 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME)
         target_link_libraries(${TARGET_NAME} log)
     endif(ANDROID)
 
-    if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
-      target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+    if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
+      target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
     endif()
 
     add_dependencies(${TARGET_NAME} ${external_project_dependencies})
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 203506d7ab84e5a5be2232b077eac2d433a99766..b2b55ec419d2f8453e067f202f6c1b7da6c201de 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -335,6 +335,16 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
     :noindex:
 
+dot_prod
+---------
+.. autoclass:: paddle.v2.layer.dot_prod
+    :noindex:
+
+out_prod
+--------
+.. autoclass:: paddle.v2.layer.out_prod
+    :noindex:
+
 power
 -----
 ..  autoclass:: paddle.v2.layer.power
diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD
index 16236763a73770f3fe5eadf67645765d0456f875..ec6d4681836e189f46dbb9b915a237dc15cda7cf 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA.
 我们把集成方案大致分为了如下几个方面。
 
 ### CMake
-我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项，当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。
+我们会在`CMakeLists.txt`中给用户添加一个`WITH_MKL`的开关，它是负责`WITH_MKLML`和`WITH_MKLDNN`的总开关。
 
-同时，我们会引入`WITH_MKLML`选项，用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用，但是建议在开启MKL-DNN的同时也打开MKLML的开关，这样才能发挥最好的性能。
+当打开`WITH_MKL`时，会开启MKLML的功能，作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上，同时会开启MKL-DNN功能。
 
-所以，我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件，它们会在编译PaddlePaddle的时候下载对应的软件包，并放到PaddlePaddle的third party目录中。
+当关闭`WITH_MKL`时，MKLML和MKL-DNN功能会同时关闭。
 
-**备注**：当`WITH_MKLML=ON`的时候，会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库，所以会稍微改动`cmake/cblas.cmake`中的逻辑。
+所以，我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件，它们会在编译PaddlePaddle的时候下载对应的软件包，并放到PaddlePaddle的third party目录中。
 
 ### Layers
 所有MKL-DNN相关的C++ layers，都会按照PaddlePaddle的目录结构存放在
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
index 731a63f945c29ba78538b3d71289b234e569354d..61f3a223547b352cf7929615cf3682b29b9a738f 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -34,7 +34,7 @@ PaddlePaddle的文档构建有两种方式。
     cd TO_YOUR_PADDLE_CLONE_PATH
     mkdir -p build
     cd build
-    cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+    cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
     make gen_proto_py
     make paddle_docs paddle_docs_cn
 
diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
index 882066f23714f7ab3bba9199b5fa5ff2325ce849..424d7718c64438496cf0895397babd5408e1ca02 100644
--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -1,4 +1,4 @@
-# 构建Android平台上的PaddlePaddle库
+# Android平台编译指南
 
 用户可通过如下两种方式，交叉编译Android平台上适用的PaddlePaddle库：
 - 基于Docker容器的编译方式
diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md
index cda636a67de712e072f4cc7ad859dda75211eaa8..9da48e7f2119ce901fbb3abab73400df27be16d2 100644
--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -1,4 +1,4 @@
-# 构建iOS平台上的PaddlePaddle库
+# iOS平台编译指南
 交叉编译iOS平台上适用的PaddlePaddle库，需要在MacOS系统上进行。本文的将介绍在MacOS上，从源码交叉编译iOS平台上适用的PaddlePaddle库。
 
 ## 准备交叉编译环境
@@ -25,7 +25,7 @@ iOS平台可选配置参数：
 - `IOS_PLATFORM`，可设置为`OS/SIMULATOR`，默认值为`OS`。
   - `OS`，构建目标为`arm`架构的iPhone或者iPad等物理设备。
   - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
-- `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示：
+- `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示，默认编译所有架构：
 
     <table class="docutils">
     <colgroup>
@@ -41,11 +41,11 @@ iOS平台可选配置参数：
     <tbody valign="top">
       <tr class="row-even">
       <td>OS</td>
-      <td>armv7, armv7s, arm64 (默认)</td>
+      <td>armv7, armv7s, arm64 </td>
     </tr>
     <tr class="row-odd">
       <td>SIMULATOR</td>
-      <td>i386, x86_64 (默认)</td>
+      <td>i386, x86_64 </td>
     </tr>
     </tbody>
     </table>
@@ -66,7 +66,7 @@ iOS平台可选配置参数：
 ```bash
 cmake -DCMAKE_SYSTEM_NAME=iOS \
       -DIOS_PLATFORM=OS \
-      -DIOS_ARCH="arm64" \
+      -DIOS_ARCH="armv7;arm64" \
       -DIOS_ENABLE_BITCODE=ON \
       -DIOS_USE_VECLIB_FOR_BLAS=ON \
       -DCMAKE_INSTALL_PREFIX=your/path/to/install \
@@ -112,6 +112,6 @@ $ make install
 - `lib`目录，其中包含PaddlePaddle的C-API静态库
 - `third_party`目录，其中包含所依赖的所有第三方库
 
-注意，不同架构的PaddlePaddle库建议安装到不同的目录下，然后使用`lipo`工具将多个静态库合并成一个支持多个架构的fat库。
+注意，如果PaddlePaddle库需要同时支持真机和模拟器，则需要分别编译真机和模拟器版本，然后使用`lipo`工具合并fat库。
 
 自此，PaddlePaddle库已经安装完成，用户可将合成的fat库用于深度学习相关的iOS App中，调用方法见C-API文档。
diff --git a/doc/mobile/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md
index 6e983645faaed1f67edaeeb82ddbef9cef6bb85f..f8ef9dc8031613831437745995268f3abc392f5b 100644
--- a/doc/mobile/cross_compiling_for_raspberry_cn.md
+++ b/doc/mobile/cross_compiling_for_raspberry_cn.md
@@ -1,4 +1,4 @@
-# 构建Raspberry Pi平台上的PaddlePaddle库
+# Raspberry Pi平台编译指南
 
 通常有两个方法来构建基于 Rasspberry Pi 的版本：
 
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h
index ede2670882ee2b93f610a2261a4ecc1784bc2d0c..4ab8de80d1c7be0f8e3eb848955373dd5e21bc18 100644
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -25,7 +25,9 @@ limitations under the License. */
 #include "hl_matrix.h"
 #include "hl_sequence.h"
 #include "hl_sparse.h"
+#ifndef PADDLE_MOBILE_INFERENCE
 #include "hl_warpctc_wrap.h"
+#endif
 
 #ifdef HPPL_STUB_FUNC
 #include "stub/hl_aggregate_stub.h"
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index b3b9c45ded95ce2e735b8898d47760956dcacdce..00d9dd238ec5328be28f58f8118daad3a039e08c 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -270,6 +270,19 @@ static bool AllGradInSet(const std::vector<std::string>& names,
       return false;
     }
   }
+  if (VLOG_IS_ON(10)) {
+    std::ostringstream sout;
+    sout << "All input {";
+    for (auto& name : names) {
+      sout << name << ",";
+    }
+    sout << "} is in {";
+    for (auto& name : set) {
+      sout << name << ",";
+    }
+    sout << "}";
+    VLOG(10) << sout.str();
+  }
   return true;
 }
 
@@ -290,14 +303,12 @@ static void CreateGradVarInBlock(
   auto ops = block_desc->AllOps();
   for (size_t op_index = grad_op_start_index; op_index < ops.size();
        ++op_index) {
-    bool need_infer_shape = false;
     std::unordered_set<std::string> new_vars;
     ForEachVarName(ops[op_index]->Outputs(),
                    [&](const std::string& grad_var_name) {
                      if (block_desc->HasVar(grad_var_name)) {
                        return false;
                      }
-                     need_infer_shape = true;
                      auto var = block_desc->Var(grad_var_name);
                      new_vars.insert(var->Name());
                      auto it = param_name_map.find(grad_var_name);
@@ -311,23 +322,21 @@ static void CreateGradVarInBlock(
                      grad_record.op_idx_ = static_cast<int>(op_index);
                      return false; /* not break */
                    });
-    if (need_infer_shape) {
-      ops[op_index]->InferVarType(block_desc);
-      for (auto& arg : ops[op_index]->OutputArgumentNames()) {
-        if (new_vars.find(arg) == new_vars.end()) {
-          continue;
-        }
-        auto pname = FwdName(arg);
-        auto* param = block_desc->FindVarRecursive(pname);
-        auto* grad = block_desc->FindVar(arg);
-        if (param == nullptr) {
-          grad->SetDataType(DataType::FP32);
-        } else {
-          grad->SetDataType(param->GetDataType());
-        }
+    ops[op_index]->InferVarType(block_desc);
+    for (auto& arg : ops[op_index]->OutputArgumentNames()) {
+      if (new_vars.find(arg) == new_vars.end()) {
+        continue;
+      }
+      auto pname = FwdName(arg);
+      auto* param = block_desc->FindVarRecursive(pname);
+      auto* grad = block_desc->FindVar(arg);
+      if (param == nullptr) {
+        grad->SetDataType(DataType::FP32);
+      } else {
+        grad->SetDataType(param->GetDataType());
       }
-      ops[op_index]->InferShape(*block_desc);
     }
+    ops[op_index]->InferShape(*block_desc);
   }
 }
 
@@ -387,6 +396,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
     ProgramDescBind& program_desc, int block_idx,
     std::unordered_set<std::string>* no_grad_vars,
     std::unordered_map<std::string, std::string>* grad_to_var) {
+  VLOG(5) << "MakeBlockBackward";
   BlockDescBind* cur_block = program_desc.MutableBlock(block_idx);
   std::vector<OpDescBind*> op_descs = cur_block->AllOps();
   std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
@@ -394,9 +404,10 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
   std::vector<std::unique_ptr<OpDescBind>> backward_descs;
 
   for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
+    VLOG(5) << "Making backward " << (*it)->Type() << " op";
     std::vector<std::unique_ptr<OpDescBind>> op_grads;
 
-    if ((*it)->Type() == "recurrent") {
+    if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
       int step_block_idx = (*it)->GetBlockAttr("step_block");
       BlockDescBind* backward_block = CreateStepBlock(
           program_desc, no_grad_vars, grad_to_var, step_block_idx);
@@ -410,6 +421,15 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
     }
 
+    if (VLOG_IS_ON(10)) {
+      std::ostringstream sout;
+      sout << "Made ";
+      for (auto& op_grad : op_grads) {
+        sout << op_grad->Type() << " ";
+      }
+      VLOG(10) << sout.str();
+    }
+
     for (const auto& desc : op_grads) {
       for (const std::string& out_name : desc->OutputArgumentNames()) {
         if (out_name.find("@GRAD") == std::string::npos) {
@@ -425,6 +445,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
         op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
         [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
   }
+
+  VLOG(5) << "Appending Sums";
   // Check whether some variables are written more than once
   std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
   for (const auto& dup : dup_out_ops) {
@@ -432,16 +454,22 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
     const std::vector<size_t> dup_op = dup.second;
     if (out_name != kEmptyVarName && dup_op.size() > 1) {
       std::vector<std::string> sum_op_inputs;
+      std::string next_g_name = out_name;
       for (size_t i = 0; i < dup_op.size(); ++i) {
+        VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
+                 << " duplicated";
         std::string new_name = out_name + "@RENAME@" + std::to_string(i);
-        backward_descs[dup_op[i]]->Rename(out_name, new_name);
+        backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
+        backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
         sum_op_inputs.emplace_back(new_name);
+        next_g_name = sum_op_inputs.back();
       }
       std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
           "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
       pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
     }
   }
+
   pending_sum_ops.sort(
       [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
          const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
@@ -452,6 +480,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
                           std::move(p.second));
   }
 
+  VLOG(5) << "MakeBlockBackward Finished";
+
   return backward_descs;
 }
 
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index 3ec88d7a72c3339bf5e7d0ca3957a3f608f039b7..be144d8fc0104fccc08006532a85906ade25c2a1 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -29,6 +29,8 @@ inline DataType ToDataType(std::type_index type) {
     return DataType::INT32;
   } else if (typeid(int64_t).hash_code() == type.hash_code()) {
     return DataType::INT64;
+  } else if (typeid(bool).hash_code() == type.hash_code()) {
+    return DataType::BOOL;
   } else {
     PADDLE_THROW("Not supported");
   }
diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index 53b899a23997b71e723a298ec360a4e018d89878..8b6f42b82df14bfcd25f33ef16b5903fb965a8ba 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -60,8 +60,7 @@ void make_ddim(DDim& ddim, const int64_t* dims, int n) {
       ddim = make_dim<9>(dims);
       break;
     default:
-      throw std::invalid_argument(
-          "Dynamic dimensions must have between [1, 9] dimensions.");
+      PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
   }
 }
 
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 2fcf41d69f0011b0d9a3d89c97fcebacb0703e97..adedd8cb0e8504fd6fc924e62a2ede3c1c7ce698 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -120,6 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
 
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
+    VLOG(10) << op->DebugString();
     op->Run(*local_scope, *device);
   }
   if (create_local_scope) {
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 39c8def82e1ebb10a0e357a648af760099020c32..48cd131550dea5ad3f368b25c31d753efbe0dff9 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -235,6 +235,23 @@ void OpDescBind::Rename(const std::string &old_name,
   need_update_ = true;
 }
 
+// Substitute new_name for old_name in every output slot of this op.
+void OpDescBind::RenameOutput(const std::string &old_name,
+                              const std::string &new_name) {
+  for (auto &out : outputs_) {
+    std::replace(out.second.begin(), out.second.end(), old_name, new_name);
+  }
+  need_update_ = true;
+}
+
+void OpDescBind::RenameInput(const std::string &old_name,
+                             const std::string &new_name) {
+  for (auto &in : inputs_) {  // swap old_name for new_name in each input slot
+    std::replace(in.second.begin(), in.second.end(), old_name, new_name);
+  }
+  need_update_ = true;
+}
+
 struct SetAttrDescVisitor : public boost::static_visitor<void> {
   explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
   mutable OpDesc::Attr *attr_;
@@ -448,7 +465,12 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
 DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
   PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
-  return framework::make_ddim(var->Shape());
+  try {
+    return framework::make_ddim(var->Shape());
+  } catch (...) {  // log which variable failed, then propagate
+    VLOG(5) << "GetDim of variable " << name << " error";
+    throw;  // re-raise the in-flight exception itself
+  }
 }
 
 void CompileTimeInferShapeContext::SetDim(const std::string &name,
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index e3e96441bbf51729f2ba69c9257e6961b1de0d5c..da032319afa775571d3942bf6ae415db7d233735 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -73,6 +73,10 @@ class OpDescBind {
 
   void Rename(const std::string &old_name, const std::string &new_name);
 
+  void RenameOutput(const std::string &old_name, const std::string &new_name);
+
+  void RenameInput(const std::string &old_name, const std::string &new_name);
+
   // Only be used in C++
   const AttributeMap &GetAttrMap() const;
 
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 3276f8af396fe58450a8dc6713fe61e49d5ca708..93467ab8ac796277b47a861a427de2837fb2d3d4 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -403,19 +403,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
 
 void OperatorWithKernel::Run(const Scope& scope,
                              const platform::DeviceContext& dev_ctx) const {
-  if (VLOG_IS_ON(1)) {
-    auto inputs = this->InputVars();
-    auto outputs = this->OutputVars(true);
-    std::ostringstream sout;
-    sout << "Run operator " << this->Type() << " From [";
-    std::ostream_iterator<std::string> out_it(sout, ",");
-    std::copy(inputs.begin(), inputs.end(), out_it);
-    sout << "] to [";
-    std::copy(outputs.begin(), outputs.end(), out_it);
-    sout << "]";
-    VLOG(1) << sout.str();
-  }
-
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
   this->InferShape(&infer_shape_ctx);
 
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 9428b8a07ea0af005f6e960ddaa02da624ad9d97..9ad6272c99dd6a85520ae44c1331ac232bc6a9a2 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -38,11 +38,12 @@ Scope& Scope::NewScope() const {
 Variable* Scope::Var(const std::string& name) {
   auto iter = vars_.find(name);
   if (iter != vars_.end()) {
+    VLOG(3) << "Get existing variable " << name;
     return iter->second;
   }
   Variable* v = new Variable();
   vars_[name] = v;
-  VLOG(3) << "Create variable " << name << " on scope";
+  VLOG(3) << "Create variable " << name;
   v->name_ = &(vars_.find(name)->first);
   return v;
 }
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index 7d36ead2ca85328c7843b3b5d423cf8e921d1c93..05dc47f06ac81f0acb6d0317cbecb3009c7dd7f0 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -53,6 +53,10 @@ class InferShapeContext {
 
   virtual bool IsRuntime() const = 0;
 
+  // Note: In while op, we need this to be public
+  void SetDims(const std::vector<std::string> &names,
+               const std::vector<framework::DDim> &dims);
+
  protected:
   virtual framework::DDim GetDim(const std::string &name) const = 0;
   virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
@@ -60,9 +64,6 @@ class InferShapeContext {
   std::vector<framework::DDim> GetDims(
       const std::vector<std::string> &names) const;
 
-  void SetDims(const std::vector<std::string> &names,
-               const std::vector<framework::DDim> &dims);
-
   std::vector<VarDesc::VarType> GetVarTypes(
       const std::vector<std::string> &names) const;
 
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 91d732641a4a5eed050841b59fd10da397eb732f..41ead3c5ecef248830cfb0f8be360f21dcd58e7b 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -73,7 +73,6 @@ if(MOBILE_INFERENCE)
     list(REMOVE_ITEM GSERVER_SOURCES
          dataproviders/DataProvider.cpp
          dataproviders/MultiDataProvider.cpp
-         dataproviders/ProtoDataProvider.cpp
          dataproviders/PyDataProvider2.cpp
          dataproviders/PyDataProvider.cpp)
 
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index 0478256f9cd81f4a99eb0cbcbd1a5a21de5cf14b..106cf5b6228e636026ded558d0f591022f1ae586 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <unistd.h>
 #include <algorithm>
-#include "ProtoDataProvider.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
 #include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
 
@@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config,
 
 REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
 REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
-REGISTER_DATA_PROVIDER(proto, ProtoDataProvider);
-REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider);
 
 int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
   int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
deleted file mode 100644
index c6f5cab1915b7f41d505c37a7fef762a392bad7f..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ /dev/null
@@ -1,932 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ProtoDataProvider.h"
-#include <algorithm>
-#include <fstream>
-#include <istream>
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-#include "DataProviderGroup.h"
-#include "paddle/utils/Logging.h"
-
-DEFINE_double(memory_threshold_on_load_data,
-              1.0,
-              "stop loading data when memory is not sufficient");
-
-namespace paddle {
-
-REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup<ProtoDataProvider>);
-REGISTER_DATA_PROVIDER(proto_sequence_group,
-                       DataProviderGroup<ProtoSequenceDataProvider>);
-
-ProtoDataProvider::ProtoDataProvider(const DataConfig& config,
-                                     bool useGpu,
-                                     bool loadDataAll)
-    : DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) {
-  if (loadDataAll) {
-    loadData(config_.files());
-  }
-}
-
-void ProtoDataProvider::loadData(const std::vector<std::string>& fileList) {
-  for (auto& file : fileList) {
-    if (FLAGS_memory_threshold_on_load_data < 1.0) {
-      double memUsage = getMemoryUsage();
-      if (memUsage > FLAGS_memory_threshold_on_load_data) {
-        LOG(INFO) << "memUsage is " << memUsage << ", > "
-                  << FLAGS_memory_threshold_on_load_data
-                  << " therefore SKIP ALL REMAINING file.";
-        break;
-      }
-    }
-    LOG(INFO) << "load data file " << file;
-    loadDataFile(file);
-  }
-
-  if (sequenceStartPositions_.size() == sampleNums_) {
-    // This means that each sample is one sequence
-    shuffledSequenceIds_.swap(sequenceStartPositions_);
-  } else {
-    sequenceStartPositions_.push_back(sampleNums_);
-    shuffledSequenceIds_.reserve(sequenceStartPositions_.size() - 1);
-    for (size_t i = 0; i < sequenceStartPositions_.size() - 1; ++i) {
-      shuffledSequenceIds_.push_back(i);
-    }
-  }
-
-  LOG(INFO) << "read done, num of instance=" << sampleNums_;
-  showDataStats();
-}
-
-void ProtoDataProvider::loadData(const std::string& fileName) {
-  std::vector<std::string> fileList;
-  loadFileList(fileName, fileList);
-  loadData(fileList);
-}
-
-void ProtoDataProvider::checkDataHeader(const DataHeader& header) {
-  if (header_.slot_defs_size()) {
-    // header_ is already set. Need to check consistency.
-    CHECK_EQ(header_.slot_defs_size(), header.slot_defs_size())
-        << "Different header";
-    for (int i = 0; i < header.slot_defs_size(); ++i) {
-      CHECK_EQ(header_.slot_defs(i).type(), header.slot_defs(i).type());
-      CHECK_EQ(header_.slot_defs(i).dim(), header.slot_defs(i).dim());
-    }
-    return;
-  }
-
-  // header_ is not set before
-  CHECK(header.slot_defs_size()) << "Invalid header: no slot is defined";
-  int i;
-  for (i = 0; i < header.slot_defs_size(); ++i) {
-    if (header.slot_defs(i).type() == SlotDef::INDEX ||
-        header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX) {
-      break;
-    }
-    constexpr int kBufLen = 100;
-    char buf[kBufLen];
-    snprintf(buf, kBufLen, "slot%d_nnz", i);
-    nnzStats_.push_back(getStat(buf));
-  }
-  numVecSlots_ = i;
-
-  // Check that INDEX slots are after VECTOR slots
-  for (int i = numVecSlots_; i < header.slot_defs_size(); ++i) {
-    CHECK(header.slot_defs(i).type() == SlotDef::INDEX ||
-          header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX);
-  }
-
-  slots_.clear();
-  slots_.reserve(header.slot_defs_size());
-  for (int i = 0; i < header.slot_defs_size(); ++i) {
-    slots_.emplace_back();
-    slots_.back().type = header.slot_defs(i).type();
-    slots_.back().dim = header.slot_defs(i).dim();
-    if (SlotDef::VECTOR_SPARSE_NON_VALUE == header.slot_defs(i).type() ||
-        SlotDef::VECTOR_SPARSE_VALUE == header.slot_defs(i).type()) {
-      slots_.back().indices.push_back(0);
-    }
-  }
-
-  header_ = header;
-}
-
-void ProtoDataProvider::checkSample(const DataSample& sample) {
-  CHECK_EQ(numVecSlots_, sample.vector_slots_size());
-  CHECK(header_.slot_defs_size() == numVecSlots_ + sample.id_slots_size() ||
-        header_.slot_defs_size() == numVecSlots_ + sample.var_id_slots_size());
-  for (int i = 0; i < numVecSlots_; ++i) {
-    uint32_t dim = header_.slot_defs(i).dim();
-    switch (header_.slot_defs(i).type()) {
-      case SlotDef::VECTOR_DENSE: {
-        CHECK_EQ(static_cast<int>(dim), sample.vector_slots(i).values_size());
-        CHECK_EQ(0, sample.vector_slots(i).ids_size());
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        if (0 == sample.vector_slots(i).ids_size()) {
-          break;
-        }
-        CHECK_LT(0, sample.vector_slots(i).ids_size());
-        CHECK_EQ(0, sample.vector_slots(i).values_size());
-        auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
-                                       sample.vector_slots(i).ids().end());
-        CHECK_GT(dim, maxId);
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        if (0 == sample.vector_slots(i).ids_size()) {
-          CHECK_EQ(0, sample.vector_slots(i).values_size());
-          break;
-        }
-        CHECK_LT(0, sample.vector_slots(i).values_size());
-        CHECK_GE(static_cast<int>(dim), sample.vector_slots(i).values_size());
-        CHECK_EQ(sample.vector_slots(i).values_size(),
-                 sample.vector_slots(i).ids_size());
-        auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
-                                       sample.vector_slots(i).ids().end());
-        CHECK_GT(dim, maxId);
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE: {
-        if (static_cast<int>(dim) != 0) {
-          CHECK_EQ(static_cast<int>(dim), sample.vector_slots(i).values_size());
-          if (sample.vector_slots(i).dims_size() != 0) {
-            int totalDim = sample.vector_slots(i).dims(0);
-            for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
-              totalDim *= sample.vector_slots(i).dims(j);
-            }
-            CHECK_EQ(static_cast<int>(dim), totalDim);
-          }
-        } else {
-          CHECK_NE(sample.vector_slots(i).dims_size(), 0);
-          int totalDim = sample.vector_slots(i).dims(0);
-          for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
-            totalDim *= sample.vector_slots(i).dims(j);
-          }
-          CHECK_EQ(totalDim, sample.vector_slots(i).values_size());
-        }
-        break;
-      }
-      case SlotDef::STRING: {
-        CHECK_EQ(static_cast<int>(1), sample.vector_slots(i).strs_size());
-        CHECK_EQ(0, sample.vector_slots(i).ids_size());
-        CHECK_EQ(0, sample.vector_slots(i).values_size());
-        break;
-      }
-      default:
-        LOG(FATAL) << "BUG: Should not reach here";
-    }
-  }
-  for (int i = numVecSlots_; i < header_.slot_defs_size(); ++i) {
-    if (header_.slot_defs(i).type() != SlotDef::VAR_MDIM_INDEX) {
-      uint32_t id = sample.id_slots(i - numVecSlots_);
-      if (id == -1U) continue;
-      CHECK_LT(id, header_.slot_defs(i).dim());
-    } else {
-      for (int j = 0; j < sample.var_id_slots(i - numVecSlots_).ids_size();
-           ++j) {
-        uint32_t id = sample.var_id_slots(i - numVecSlots_).ids(j);
-        CHECK_LT(id, header_.slot_defs(i).dim());
-      }
-    }
-  }
-}
-
-void ProtoDataProvider::loadDataFile(const std::string& fileName) {
-  std::ifstream is(fileName);
-  CHECK(is) << "Fail to open " << fileName;
-  bool dataCompression = str::endsWith(fileName, ".gz");
-  std::unique_ptr<ProtoReader> reader(new ProtoReader(&is, dataCompression));
-  CHECK(reader) << "Fail to create proto data input stream";
-
-  DataHeader header;
-  CHECK(reader->read(&header));
-  checkDataHeader(header);
-
-  DataSample sample;
-  do {
-    if (!reader->read(&sample)) {
-      break;
-    }
-    checkSample(sample);
-    if (sample.is_beginning()) {
-      sequenceStartPositions_.push_back(sampleNums_);
-    }
-    fillSlots(sample);
-    ++sampleNums_;
-  } while (true);
-
-  CHECK(is.eof()) << "Fail to read file";
-  reader.reset(nullptr);
-  is.close();
-}
-
-// checkSample has done before, no check here
-void ProtoDataProvider::fillSlots(const DataSample& sample) {
-  for (size_t i = 0; i < slots_.size(); ++i) {
-    auto& slot = slots_[i];
-    int dim = slot.dim;
-    switch (slot.type) {
-      case SlotDef::VECTOR_DENSE: {
-        size_t oldSize = slot.denseData.size();
-        slot.denseData.resize(oldSize + dim);
-        const float* values = sample.vector_slots(i).values().data();
-#ifdef PADDLE_TYPE_DOUBLE
-        std::copy(values, values + dim, slot.denseData.begin() + oldSize);
-#else
-        memcpy(slot.denseData.data() + oldSize, values, sizeof(real) * dim);
-#endif
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        int slotSize = sample.vector_slots(i).ids_size();
-        int subSlotSize = 0;
-        int id = 0;  // the slot id
-        // find whether this vector_slots has subseq. If not has subseq,
-        // subSlotSize = 0.
-        for (id = 0; id < sample.subseq_slots_size(); id++) {
-          if (sample.subseq_slots(id).slot_id() == i) {
-            subSlotSize = sample.subseq_slots(id).lens_size();
-            break;
-          }
-        }
-        if (subSlotSize && slot.subIndices.size() == 0UL) {
-          // If has subSeq, the first element of subIndices = 0.
-          slot.subIndices.push_back(0);
-        }
-        if (slotSize == 0UL) {
-          // if has no id, new indices = old indices.
-          slot.indices.push_back(slot.indices.back());
-          // if has subSeq, new subIndices = old subIndices.
-          if (slot.subIndices.size()) {
-            slot.subIndices.push_back(slot.subIndices.back());
-          }
-          break;
-        }
-        slot.sparseNonValueData.resize(slot.indices.back() + slotSize);
-        const unsigned int* ids = sample.vector_slots(i).ids().data();
-        memcpy(slot.sparseNonValueData.data() + slot.indices.back(),
-               ids,
-               sizeof(*ids) * slotSize);
-        slot.indices.push_back(slot.indices.back() + slotSize);
-        if (subSlotSize) {
-          for (int ii = 0; ii < subSlotSize; ++ii) {
-            slot.subIndices.push_back(slot.subIndices.back() +
-                                      sample.subseq_slots(id).lens(ii));
-          }
-        }
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        if (0 == sample.vector_slots(i).ids_size()) {
-          slot.indices.push_back(slot.indices.back());
-          break;
-        }
-        int slotSize = sample.vector_slots(i).ids_size();
-        slot.sparseFloatValueData.resize(slot.indices.back() + slotSize);
-        const unsigned int* ids = sample.vector_slots(i).ids().data();
-        const float* values = sample.vector_slots(i).values().data();
-        for (int ii = 0; ii < slotSize; ++ii) {
-          slot.sparseFloatValueData[slot.indices.back() + ii].col = ids[ii];
-          slot.sparseFloatValueData[slot.indices.back() + ii].value =
-              values[ii];
-        }
-        slot.indices.push_back(slot.indices.back() + slotSize);
-        break;
-      }
-      case SlotDef::INDEX: {
-        slot.indexData.push_back(sample.id_slots(i - numVecSlots_));
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE: {
-        size_t oldSize = slot.varDenseData.size();
-        slot.varDenseData.resize(oldSize + 1);
-        size_t varDim = sample.vector_slots(i).values_size();
-        slot.varDenseData[oldSize].data.resize(varDim);
-        const float* values = sample.vector_slots(i).values().data();
-#ifdef PADDLE_TYPE_DOUBLE
-        std::copy(
-            values, values + varDim, slot.varDenseData[oldSize].data.data());
-#else
-        memcpy(slot.varDenseData[oldSize].data.data(),
-               values,
-               sizeof(real) * varDim);
-#endif
-        slot.varDenseData[oldSize].dims.resize(
-            sample.vector_slots(i).dims_size());
-        memcpy(slot.varDenseData[oldSize].dims.data(),
-               sample.vector_slots(i).dims().data(),
-               sizeof(uint32_t) * sample.vector_slots(i).dims_size());
-        break;
-      }
-      case SlotDef::VAR_MDIM_INDEX: {
-        size_t oldSize = slot.varIndices.size();
-        slot.varIndices.resize(oldSize + 1);
-        size_t varDim = sample.var_id_slots(i - numVecSlots_).ids_size();
-        slot.varIndices[oldSize].resize(varDim);
-        memcpy(slot.varIndices[oldSize].data(),
-               sample.var_id_slots(i - numVecSlots_).ids().data(),
-               sizeof(uint32_t) * varDim);
-        break;
-      }
-      case SlotDef::STRING: {
-        slot.strData.push_back(sample.vector_slots(i).strs(0));
-        break;
-      }
-    }
-  }
-}
-
-void ProtoDataProvider::showDataStats() {
-  std::ostringstream oss;
-  for (size_t i = 0; i < slots_.size(); ++i) {
-    auto& slot = slots_[i];
-    if (slot.type == SlotDef::VECTOR_SPARSE_NON_VALUE) {
-      size_t nnz = slot.sparseNonValueData.size();
-      oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
-    } else if (slot.type == SlotDef::VECTOR_SPARSE_VALUE) {
-      size_t nnz = slot.sparseFloatValueData.size();
-      oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
-    }
-  }
-  LOG(INFO) << oss.str();
-}
-
-void ProtoDataProvider::reset() {
-  currentSequenceIndex_ = 0;
-  if (!skipShuffle_) {
-    shuffle();
-  }
-
-  DataProvider::reset();
-}
-
-void ProtoDataProvider::shuffle() {
-  std::shuffle(shuffledSequenceIds_.begin(),
-               shuffledSequenceIds_.end(),
-               ThreadLocalRandomEngine::get());
-}
-
-/*
-  Loop through sequences starting from currentSequenceIndex_
-  for at most size samples. For each sequence ranging from [begin, end),
-  op(begin, end) will be called.
-
-  return the number of sequences scanned
-*/
-template <class Op>
-int64_t ProtoDataProvider::sequenceLoop(Op op, int64_t size) {
-  int64_t sz = 0;
-  size_t i;
-  size_t sequenceCount = shuffledSequenceIds_.size();
-  if (usageRatio_ < 1.0f) {
-    sequenceCount = static_cast<int64_t>(sequenceCount * usageRatio_);
-  }
-  for (i = currentSequenceIndex_; i < sequenceCount; ++i) {
-    size_t id = shuffledSequenceIds_[i];
-    int64_t begin = sequenceStartPositions_[id];
-    int64_t end = sequenceStartPositions_[id + 1];
-    int64_t len = end - begin;
-    if (sz + len > size && sz > 0) break;
-    sz += len;
-    op(begin, end);
-  }
-  return i - currentSequenceIndex_;
-}
-
-/*
-  Loop through sequences starting from currentSequenceIndex_
-  for at most size samples. For each sample of each sequence at position
-  pos, op(pos) will be called.
-
-  return the number of sequences scanned
-*/
-template <class Op>
-int64_t ProtoDataProvider::sampleLoop(Op op, int64_t size) {
-  if (iidData()) {
-    size = std::min<int64_t>(sampleNums_ - currentSequenceIndex_, size);
-    for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
-         ++i) {
-      size_t pos = shuffledSequenceIds_[i];
-      op(pos);
-    }
-    return size;
-  } else {
-    auto f = [op](int64_t begin, int64_t end) {
-      for (int64_t pos = begin; pos < end; ++pos) {
-        op(pos);
-      }
-    };
-    return sequenceLoop(f, size);
-  }
-}
-
-/*
-  Loop through sub-sequences starting from currentSequenceIndex_
-  for at most size samples. For each sample of each sub-sequence at position
-  pos, op(pos) will be called.
-
-  return the number of sub-sequences scanned
-*/
-template <class Op>
-int64_t ProtoDataProvider::subSampleLoop(Op op, int64_t size, int slot) {
-  CHECK(iidData()) << "subSampleLoop only accepts iid data";
-  size = std::min<int64_t>(sampleNums_ - currentSequenceIndex_, size);
-  int subSize = 0;
-  for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
-       ++i) {
-    size_t pos = shuffledSequenceIds_[i];
-    int64_t* indexs = slots_[slot].indices.data();
-    int64_t* subIndexs = slots_[slot].subIndices.data();
-    int64_t subSeqStart = 0;
-    int64_t subSeqEnd = 0;
-    for (int j = 0; j < (int)slots_[slot].subIndices.size(); j++) {
-      if (subIndexs[j] == indexs[pos]) {
-        subSeqStart = j;
-        if (subIndexs[pos] == subIndexs[pos + 1]) {
-          subSeqEnd = j + 1;
-          break;
-        }
-      } else if (subIndexs[j] == indexs[pos + 1]) {
-        subSeqEnd = j;
-        break;
-      }
-    }
-    for (int j = subSeqStart; j < subSeqEnd; j++) {
-      op(j);
-    }
-    subSize += subSeqEnd - subSeqStart;
-  }
-  return subSize;
-}
-
-int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
-                                                DataBatch* batch) {
-  int64_t numSequences = 0;  // actual number of sequences in the batch
-
-  // the number of sequences scanned, including those skipped because too long
-  int64_t numScannedSeqs = 0;
-  std::lock_guard<RWLock> guard(lock_);
-  if (iidData()) {
-    size = std::min<int64_t>(getSize() - currentSequenceIndex_, size);
-    numScannedSeqs = numSequences = size;
-  } else {
-    int64_t sz = 0;
-    auto op = [&sz, &numSequences](int64_t begin, int64_t end) {
-      ++numSequences;
-      sz += end - begin;
-    };
-    numScannedSeqs = sequenceLoop(op, size);
-    VLOG_IF(1, numScannedSeqs > numSequences)
-        << numScannedSeqs - numSequences
-        << " sequences are skipped because longer than " << size;
-    size = sz;
-  }
-  if (size <= 0) return 0;
-
-  DataBatch& cpuBatch = *cpuBatch_;
-  std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-  cpuBatch.setSize(size);
-  cpuArguments.resize(header_.slot_defs_size());
-
-  if (!iidData()) {
-    ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions,
-                                  numSequences + 1,
-                                  /* useGpu= */ false);
-    int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false);
-    int pos = 0;
-    int i = 0;
-    auto op = [buf, &pos, &i](int64_t begin, int64_t end) {
-      buf[i] = pos;
-      pos += end - begin;
-      ++i;
-    };
-    sequenceLoop(op, size);
-    buf[i] = size;
-    for (size_t slot = 1; slot < cpuArguments.size(); ++slot) {
-      cpuArguments[slot].sequenceStartPositions =
-          cpuArguments[0].sequenceStartPositions;
-    }
-  }
-
-  for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
-    size_t dim = header_.slot_defs(slot).dim();
-    SlotDef::SlotType slotType = header_.slot_defs(slot).type();
-
-    std::vector<int64_t> dataPos;
-    dataPos.reserve(size);
-    auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
-    sampleLoop(op, size);
-
-    switch (slotType) {
-      case SlotDef::VECTOR_DENSE: {
-        Matrix::resizeOrCreate(cpuArguments[slot].value,
-                               size,
-                               dim,
-                               false,   // trans = false
-                               false);  // useGpu = false
-        real* buf = cpuArguments[slot].value->getData();
-        for (int i = 0; i < size; ++i) {
-          memcpy(buf + i * dim,
-                 slots_[slot].denseData.data() + dataPos[i] * dim,
-                 sizeof(real) * dim);
-        }
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        if (!(cpuArguments[slot].value)) {
-          cpuArguments[slot].value =
-              Matrix::createSparseMatrix(size,
-                                         dim,
-                                         size /*DEFAULT_AVG_WIDTH = 1*/,
-                                         NO_VALUE,
-                                         SPARSE_CSR,
-                                         false,
-                                         useGpu_);
-        }
-        auto mat = cpuArguments[slot].value;
-        mat->resize(size, dim);
-        if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseNonValueData.data(),
-              HPPL_STREAM_1);
-        } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseNonValueData.data());
-        } else {
-          LOG(FATAL) << "Not Supported";
-        }
-        size_t numElements = 0;
-        for (auto pos : dataPos) {
-          numElements +=
-              slots_[slot].indices[pos + 1] - slots_[slot].indices[pos];
-        }
-        nnzStats_[slot]->addSample(numElements);
-
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        if (!(cpuArguments[slot].value)) {
-          cpuArguments[slot].value =
-              Matrix::createSparseMatrix(size,
-                                         dim,
-                                         size /*DEFAULT_AVG_WIDTH = 1*/,
-                                         FLOAT_VALUE,
-                                         SPARSE_CSR,
-                                         false,
-                                         useGpu_);
-        }
-        auto mat = cpuArguments[slot].value;
-        mat->resize(size, dim);
-        if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseFloatValueData.data(),
-              HPPL_STREAM_1);
-        } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseFloatValueData.data());
-        } else {
-          LOG(FATAL) << "Not Supported";
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                size,
-                                /*  useGpu= */ false);
-        int* buf = cpuArguments[slot].ids->getData();
-        for (int i = 0; i < size; ++i) {
-          buf[i] = slots_[slot].indexData[dataPos[i]];
-        }
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE: {
-        CHECK_EQ(size, 1);
-        auto mat = cpuArguments[slot].value;
-        size_t totalDim = slots_[slot].varDenseData[dataPos[0]].data.size();
-
-        CHECK_EQ(slots_[slot].varDenseData[dataPos[0]].dims.size(), size_t(3));
-        size_t height, width, depth, oldWidth;
-        /* dims[2] is depth, will be changed to dims[0] in future */
-        depth = slots_[slot].varDenseData[dataPos[0]].dims[2];
-        height = slots_[slot].varDenseData[dataPos[0]].dims[1];
-        width = slots_[slot].varDenseData[dataPos[0]].dims[0];
-        oldWidth = width;
-        /* process the undesirable sample */
-        if (oldWidth < height) {
-          width = height;
-        }
-        cpuArguments[slot].setFrameHeight(height);
-        cpuArguments[slot].setFrameWidth(width);
-
-        if (oldWidth < height) {
-          totalDim = width * height * depth;
-        }
-        Matrix::resizeOrCreate(cpuArguments[slot].value,
-                               size,
-                               totalDim,
-                               false,   // trans = false
-                               false);  // useGpu = false
-        real* buf = cpuArguments[slot].value->getData();
-        cpuArguments[slot].value->zeroMem();
-        if (oldWidth < height) {
-          real* srcBuf = slots_[slot].varDenseData[dataPos[0]].data.data();
-          for (size_t i = 0; i < depth; i++) {
-            for (size_t j = 0; j < height; j++) {
-              for (size_t k = 0; k < oldWidth; k++) {
-                buf[i * height * width + j * width + k] =
-                    srcBuf[i * height * oldWidth + j * oldWidth + k];
-              }
-            }
-          }
-        } else {
-          memcpy(buf,
-                 slots_[slot].varDenseData[dataPos[0]].data.data(),
-                 sizeof(real) * totalDim);
-        }
-        ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
-                                      size + 1, /* size == 1 currently */
-                                      /* useGpu= */ false);
-        int* bufStarts =
-            cpuArguments[slot].sequenceStartPositions->getMutableData(false);
-        bufStarts[0] = 0;
-        bufStarts[1] = 1;
-        break;
-      }
-      case SlotDef::VAR_MDIM_INDEX: {
-        CHECK_EQ(size, 1);
-        size_t totalDim = slots_[slot].varIndices[dataPos[0]].size();
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                totalDim,
-                                /*  useGpu= */ false);
-        int* buf = cpuArguments[slot].ids->getData();
-        memcpy(buf,
-               slots_[slot].varIndices[dataPos[0]].data(),
-               sizeof(int) * totalDim);
-
-        ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
-                                      size + 1, /* size == 1 currently */
-                                      /* useGpu= */ false);
-        int* bufStarts =
-            cpuArguments[slot].sequenceStartPositions->getMutableData(false);
-        bufStarts[0] = 0;
-        /* we expand the convolutinal feature map to a sequence data,
-         * so there should be a corresponding sequence labels */
-        bufStarts[1] = totalDim;
-        break;
-      }
-      case SlotDef::STRING: {
-        if (cpuArguments[slot].strs) {
-          cpuArguments[slot].strs->resize(size);
-        } else {
-          cpuArguments[slot].strs =
-              std::make_shared<std::vector<std::string>>(size);
-        }
-        for (int i = 0; i < size; ++i) {
-          (*cpuArguments[slot].strs)[i] = slots_[slot].strData[dataPos[i]];
-        }
-        break;
-      }
-    }
-  }
-
-  if (useGpu_) {
-    std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-    DataBatch& gpuBatch = *gpuBatch_;
-    std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-    gpuArguments.resize(cpuArguments.size());
-    gpuBatch.setSize(size);
-    for (int i = 0; i < header_.slot_defs_size(); ++i) {
-      SlotDef::SlotType slotType = header_.slot_defs(i).type();
-      if (SlotDef::VECTOR_SPARSE_VALUE == slotType ||
-          SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) {
-        gpuArguments[i] = cpuArguments[i];
-        gpuArguments[i].sequenceStartPositions =
-            cpuArguments[i].sequenceStartPositions;
-      } else {
-        gpuArguments[i].resizeAndCopyFrom(
-            cpuArguments[i], useGpu_, HPPL_STREAM_1);
-      }
-    }
-    hl_stream_synchronize(HPPL_STREAM_1);
-    *batch = gpuBatch;
-  } else {
-    *batch = cpuBatch;
-  }
-
-  currentSequenceIndex_ += numScannedSeqs;
-
-  return batch->getSize();
-}
-
-ProtoSequenceDataProvider::ProtoSequenceDataProvider(const DataConfig& config,
-                                                     bool useGpu,
-                                                     bool loadDataAll)
-    : ProtoDataProvider(config, useGpu, loadDataAll) {}
-
-int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
-                                                        DataBatch* batch) {
-  CHECK(iidData()) << "ProtoSequenceDataProvider only accepts iid data";
-  int64_t numSequences = 0;  // actual number of sequences in the batch
-
-  // the number of sequences scanned, including those skipped because too long
-  int64_t numScannedSeqs = 0;
-  std::lock_guard<RWLock> guard(lock_);
-  size = std::min<int64_t>(getSize() - currentSequenceIndex_, size);
-  numScannedSeqs = numSequences = size;
-  if (size <= 0) return 0;
-
-  DataBatch& cpuBatch = *cpuBatch_;
-  std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-  cpuBatch.setSize(size);
-  cpuArguments.resize(header_.slot_defs_size());
-
-  for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
-    SlotDef::SlotType slotType = header_.slot_defs(slot).type();
-
-    std::vector<int64_t> dataPos;
-    dataPos.reserve(size);
-    auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
-    sampleLoop(op, size);
-
-    // current slot: sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
-                                  size + 1,
-                                  /* useGpu= */ false);
-
-    switch (slotType) {
-      case SlotDef::VECTOR_SPARSE_VALUE:
-      case SlotDef::VAR_MDIM_DENSE:
-      case SlotDef::VAR_MDIM_INDEX: {
-        LOG(FATAL) << "ProtoSequenceDataProvider only support"
-                   << " VECTOR_DENSE, VECTOR_SPARSE_NON_VALUE and INDEX slots";
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        // copy to IDS, not value
-        // pointers used in current slot
-        sparse_non_value_t* data = slots_[slot].sparseNonValueData.data();
-        int64_t* indexs = slots_[slot].indices.data();
-        int64_t* seqs = dataPos.data();
-
-        // current slot: i need size instances. what is the total length?
-        int totalFeatureInCurrentSlot = 0;
-        for (int ins = 0; ins < size; ins++) {
-          int64_t currInsId = seqs[ins];
-          totalFeatureInCurrentSlot +=
-              indexs[currInsId + 1] - indexs[currInsId];
-          // special: if current instance has NO feature in current slot
-          if (indexs[currInsId + 1] == indexs[currInsId]) {
-            totalFeatureInCurrentSlot++;
-          }
-        }
-        // done
-
-        // current slot: ids
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                totalFeatureInCurrentSlot,
-                                /* useGpu= */ false);
-
-        // where to write
-        int* currPosOfArgumentId = cpuArguments[slot].ids->getData();
-        int* currPosOfArgumentSeqStart =
-            cpuArguments[slot].sequenceStartPositions->getMutableData(false);
-        int allSequenceLength = 0;
-        currPosOfArgumentSeqStart[0] = 0;
-        // for each instance, copy data and fill sequence positions
-        for (int instance = 0; instance < size; instance++) {
-          int64_t currInstanceId = seqs[instance];
-          int64_t currInstanceLength =
-              indexs[currInstanceId + 1] - indexs[currInstanceId];
-          sparse_non_value_t* currInstanceData = data + indexs[currInstanceId];
-          // write sequenceStartPositions
-          allSequenceLength += currInstanceLength;
-          currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
-          // copy features
-          for (int featCopier = 0; featCopier < currInstanceLength;
-               featCopier++) {
-            currPosOfArgumentId[featCopier] = currInstanceData[featCopier].col;
-          }
-          currPosOfArgumentId += currInstanceLength;
-          // special: if current instance has NO feature in current slot
-          if (currInstanceLength == 0) {
-            allSequenceLength++;
-            currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
-            currPosOfArgumentId[0] = -1;
-            currPosOfArgumentId++;
-          }
-          // done
-        }
-        if (slots_[slot].subIndices.size()) {
-          std::vector<int64_t> dataSubPos;
-          auto op = [this, &dataSubPos](int64_t pos) {
-            dataSubPos.push_back(pos);
-          };
-          int subSize = subSampleLoop(op, size, slot);
-          ICpuGpuVector::resizeOrCreate(
-              cpuArguments[slot].subSequenceStartPositions, subSize + 1, false);
-          int* currPosOfArgumentSubSeqStart =
-              cpuArguments[slot].subSequenceStartPositions->getMutableData(
-                  false);
-          int64_t* subSeqs = dataSubPos.data();
-          int64_t* subIndexs = slots_[slot].subIndices.data();
-          int allSubSequenceLength = 0;
-          currPosOfArgumentSubSeqStart[0] = 0;
-          // for each instance, compute sub-sequence number
-          for (int instance = 0; instance < subSize; instance++) {
-            int64_t currSubInstanceId = subSeqs[instance];
-            int64_t currSubInstanceLength =
-                subIndexs[currSubInstanceId + 1] - subIndexs[currSubInstanceId];
-            // write subSequenceStartPositions
-            allSubSequenceLength += currSubInstanceLength;
-            currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
-            // special: if current instance has NO feature in current slot
-            if (currSubInstanceLength == 0) {
-              allSubSequenceLength++;
-              currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
-            }
-          }
-          cpuArguments[slot].checkSubset();
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        // label slot
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                size,
-                                /* useGpu= */ false);
-        // fill labels
-        int* buf = cpuArguments[slot].ids->getData();
-        for (int i = 0; i < size; ++i) {
-          buf[i] = slots_[slot].indexData[dataPos[i]];
-        }
-        // label HAS sequence structure
-        cpuArguments[slot].sequenceStartPositions->fillSequence(false);
-        break;
-      }
-      case SlotDef::VECTOR_DENSE: {
-        // copy values
-        size_t dim = header_.slot_defs(slot).dim();
-        Matrix::resizeOrCreate(cpuArguments[slot].value,
-                               size,
-                               dim,
-                               false,   // trans = false
-                               false);  // useGpu = false
-        real* buf = cpuArguments[slot].value->getData();
-        for (int i = 0; i < size; ++i) {
-          memcpy(buf + i * dim,
-                 slots_[slot].denseData.data() + dataPos[i] * dim,
-                 sizeof(real) * dim);
-        }
-        // sequence structure
-        cpuArguments[slot].sequenceStartPositions->fillSequence(false);
-        break;
-      }
-      default: { LOG(FATAL) << "should not reach here"; }
-    }
-  }
-
-  if (useGpu_) {
-    std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-    DataBatch& gpuBatch = *gpuBatch_;
-    std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-    gpuArguments.resize(cpuArguments.size());
-    gpuBatch.setSize(size);
-    for (size_t i = 0; i < cpuArguments.size(); ++i) {
-      gpuArguments[i].resizeAndCopyFrom(
-          cpuArguments[i], useGpu_, HPPL_STREAM_1);
-    }
-    hl_stream_synchronize(HPPL_STREAM_1);
-    *batch = gpuBatch;
-  } else {
-    *batch = cpuBatch;
-  }
-
-  currentSequenceIndex_ += numScannedSeqs;
-  return batch->getSize();
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
deleted file mode 100644
index 7dd45e062248f20d24c633dd4e1c8b7eebcbfa1b..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-
-#include "DataFormat.pb.h"
-#include "paddle/utils/Stat.h"
-
-#include "DataProvider.h"
-#include "ProtoReader.h"
-
-namespace paddle {
-
-/**
- * @brief Provider data from protobuf data file with each sample
- * specified by proto message
- *
- * DataSample defined in DataFormat.proto.
- *
- * The file format is
- *
- *    header
- *
- *    sample1
- *
- *    sample2
- *
- *    ...
- *
- *    sampleN
- *
- * @note: In the data file, each message is prefixed with its length.
- * The read/write of the protbuf are implemented in ProtoReader.h
- */
-class ProtoDataProvider : public DataProvider {
-public:
-  ProtoDataProvider(const DataConfig& config,
-                    bool useGpu,
-                    bool loadDataAll = true);
-  virtual void reset();
-
-  /**
-   * @note this size includes the sequences which are skipped because they
-   * are longer than the batch size.
-   */
-  virtual int64_t getSize() {
-    int64_t size = sampleNums_;
-    if (usageRatio_ < 1.0f) {
-      size = static_cast<int64_t>(size * usageRatio_);
-    }
-    return size;
-  }
-  virtual void shuffle();
-
-  void loadData(const std::vector<std::string>& fileList);
-
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
-protected:
-  /**
-   * @brief load protobuf data from a list of file
-   * @param[in]  fileName  file name of a file which contains
-   * a list of file names
-   */
-  void loadData(const std::string& fileName);
-
-  /**
-   * @brief load protobuf data from file
-   * @param[in]  fileName   data file name
-   */
-  void loadDataFile(const std::string& fileName);
-  /** @brief check data header of each data sample
-   *  @param[in] header     data header read from protobuf data
-   */
-  void checkDataHeader(const DataHeader& header);
-  /**
-   * @brief fill protobuf data into slot_,
-   * slot_ is a vector of ProtoSlot in memory.
-   * @param[in]  sample     data sample read from protobuf data
-   */
-  void fillSlots(const DataSample& sample);
-
-  /**
-   * @brief return true if each sample is one sequence, i.e., independent
-   * of other samples.
-   */
-  inline bool iidData() const { return sequenceStartPositions_.empty(); }
-
-  /**
-   * @brief check that sample is consistent with header_
-   */
-  void checkSample(const DataSample& sample);
-
-  template <class Op>
-  int64_t sequenceLoop(Op op, int64_t size);
-
-  template <class Op>
-  int64_t sampleLoop(Op op, int64_t size);
-
-  template <class Op>
-  int64_t subSampleLoop(Op op, int64_t size, int slot);
-
-  void showDataStats();
-
-protected:
-  struct ProtoVarSlot {
-    std::vector<real> data;
-    std::vector<int> dims;
-  };
-
-  struct ProtoSlot {
-    SlotDef::SlotType type;
-    int dim;
-    std::vector<int> indexData;
-    std::vector<real> denseData;
-    std::vector<sparse_non_value_t> sparseNonValueData;
-    std::vector<sparse_float_value_t> sparseFloatValueData;
-    std::vector<int64_t> indices;
-    std::vector<int64_t> subIndices;
-
-    std::vector<ProtoVarSlot> varDenseData;
-    std::vector<std::vector<int>> varIndices;
-    std::vector<std::string> strData;
-  };
-  DataHeader header_;
-  int numVecSlots_;
-
-  std::vector<ProtoSlot> slots_;
-  size_t sampleNums_;
-
-  /**
-   * The starting position of each sequence in samples.
-   * The last element should be num of samples.
-   * If empty, each sample is one sequence.
-   */
-  std::vector<size_t> sequenceStartPositions_;
-
-  int64_t currentSequenceIndex_;
-
-  // The size should be the number of sequences.
-  std::vector<size_t> shuffledSequenceIds_;
-
-  ThreadLocalD<DataBatch> cpuBatch_;
-  ThreadLocalD<DataBatch> gpuBatch_;
-
-  RWLock lock_;
-  std::vector<StatPtr> nnzStats_;  // stats for number of none-zeros entries
-};
-
-/**
- * @brief Special use for Proto data: instances should contain sparse-non-value
- * slots
- * and label.
- *
- * @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
- */
-class ProtoSequenceDataProvider : public ProtoDataProvider {
-public:
-  ProtoSequenceDataProvider(const DataConfig& config,
-                            bool useGpu,
-                            bool loadDataAll = true);
-  ~ProtoSequenceDataProvider() {}
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/DotProdLayer.cpp b/paddle/gserver/layers/DotProdLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9e2dbe3c3c416f606d2938701f26288642b55267
--- /dev/null
+++ b/paddle/gserver/layers/DotProdLayer.cpp
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for computing the dot product of two inputs, one result per
+ * sample (row).
+ * Input1: matrix (batchSize * dim)
+ * Input2: matrix (batchSize * dim)
+ * Output: a matrix: (batchSize * 1)
+ */
+class DotProdLayer : public Layer {
+public:
+  explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~DotProdLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(dot_prod, DotProdLayer);
+
+bool DotProdLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  // Propagate a failed base-class init instead of silently ignoring it.
+  if (!Layer::init(layerMap, parameterMap)) return false;
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+  CHECK_EQ(1UL, getSize())
+      << "The output dimensionality of this layer should be fixed to 1.";
+  return true;
+}
+
+void DotProdLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  // Both inputs must agree in height (batch size) and in width.
+  const MatrixPtr lhs = getInputValue(0);
+  const MatrixPtr rhs = getInputValue(1);
+  const size_t numSamples = lhs->getHeight();
+  CHECK_EQ(rhs->getHeight(), numSamples);
+  CHECK_EQ(lhs->getWidth(), rhs->getWidth());
+
+  {  // size the output as numSamples x 1
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(numSamples, 1);
+  }
+
+  {  // fill the output value via sumOfProducts over the two inputs
+    REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
+    MatrixPtr result = getOutputValue();
+    result->sumOfProducts(*lhs, *rhs, 1, 0);
+  }
+}
+
+void DotProdLayer::backward(const UpdateCallback& callback) {
+  // Values and gradients of both operands, plus the gradient of the output.
+  const MatrixPtr lhsVal = getInputValue(0);
+  const MatrixPtr rhsVal = getInputValue(1);
+  const MatrixPtr outGrad = getOutputGrad();
+  const MatrixPtr lhsGrad = getInputGrad(0);
+  const MatrixPtr rhsGrad = getInputGrad(1);
+
+  REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
+
+  // Each operand's gradient is updated from the *other* operand's value and
+  // the output gradient; an input without a gradient buffer is skipped.
+  if (lhsGrad) {
+    lhsGrad->addRowScale(0, *rhsVal, *outGrad);
+  }
+  if (rhsGrad) {
+    rhsGrad->addRowScale(0, *lhsVal, *outGrad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c9099297cc5c741fbae0b42f21b988e6c561ef11
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
@@ -0,0 +1,202 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNConcatLayer.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
+
+bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  const bool baseReady = MKLDNNLayer::init(layerMap, parameterMap);
+  if (baseReady) {
+    CHECK_GT(inputLayers_.size(), 1UL);  // need at least two inputs
+    CHECK(!biasParameter_);              // this layer takes no bias
+  }
+  return baseReady;
+}
+
+void MKLDNNConcatLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+  ic = inputLayers_[0]->getSize() / ih / iw;
+  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
+  CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
+  CHECK_GT(inputLayers_.size(), 1UL);
+  channels_.resize(inputLayers_.size());
+  channels_[0] = ic;
+  // oc is passed by value, so the summed output channels are kept in oc_
+  // TODO(TJ): change API, use &oc
+  oc_ = ic;
+  for (size_t i = 1; i < inputLayers_.size(); ++i) {
+    int curBs, curH, curW;
+    reshapeInput(curBs, curH, curW, i);
+    CHECK_EQ(bs, curBs);
+    CHECK_EQ(ih, curH);
+    CHECK_EQ(iw, curW);
+    // channels of input i = its layer size / (height * width)
+    channels_[i] = inputLayers_[i]->getSize() / curH / curW;
+    CHECK_EQ((size_t)channels_[i] * curH * curW, inputLayers_[i]->getSize());
+    oc_ += channels_[i];
+  }
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc_ * oh * ow);
+}
+
+void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
+                                 MKLDNNMatrixPtr& in,
+                                 MKLDNNMatrixPtr& wgt,
+                                 MKLDNNMatrixPtr& bias,
+                                 MKLDNNMatrixPtr& out) {
+  // Rebuilds the forward pass: buffers, concat descriptor, then pipeline.
+  resetFwdBuffers(inVals_, out);
+  in = inVals_[0];  // only input 0 is reported through `in`
+  std::shared_ptr<concat::primitive_desc> fwdPD;
+  resetFwdPD(fwdPD, inVals_, out);
+  // wgt and bias are not assigned by this layer.
+  resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+}
+
+void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
+                                 MKLDNNMatrixPtr& in,
+                                 MKLDNNMatrixPtr& wgt,
+                                 MKLDNNMatrixPtr& bias,
+                                 MKLDNNMatrixPtr& out) {
+  resetBwdBuffers(inGrads_, out);
+  in = inGrads_[0];  // only the gradient of input 0 is reported through `in`
+  // One backward primitive per input is kept in bwds_ and queued in pipeline.
+  resetBwdPipeline(pipeline, bwds_, inGrads_, out);
+}
+
+void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                        MKLDNNMatrixPtr& out) {
+  // Resets the value buffer of every input, records which memory formats
+  // occur among them, and then resets the output value buffer to match.
+  inputs.resize(inputLayers_.size());
+  bool sawNC = false, saw8c = false, saw16c = false;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    // resetInValue reads ic_, so temporarily set it to this input's channels
+    // TODO(TJ): change ic_ as vector then can remove channels_
+    ic_ = channels_[i];
+    resetInValue(inputs[i], nullptr, i);
+    CHECK(inputs[i]);
+    const auto dm = inputs[i]->getDims();
+    // formats may differ between inputs, but the number of dims must not
+    CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
+    CHECK_EQ(bs_, dm[0]);
+    CHECK_EQ(channels_[i], dm[1]);
+    if (dm.size() > 2) {
+      CHECK_EQ(ih_, dm[2]);
+      CHECK_EQ(iw_, dm[3]);
+    }
+    const auto fmt = inputs[i]->getFormat();
+    sawNC = sawNC || fmt == format::nc;
+    saw8c = saw8c || fmt == format::nChw8c;
+    saw16c = saw16c || fmt == format::nChw16c;
+  }
+  // restore: ic_ always holds the channel count of input 0
+  ic_ = channels_[0];
+
+  // Prefer nChw16c, then nChw8c, when some input uses that format and oc_
+  // is divisible by 16 / 8; nc inputs give nc, everything else plain nchw.
+  format outFmt = format::nchw;
+  if (saw16c && oc_ % 16 == 0) {
+    outFmt = format::nChw16c;
+  } else if (saw8c && oc_ % 8 == 0) {
+    outFmt = format::nChw8c;
+  } else if (sawNC) {
+    CHECK(oh_ == 1 && ow_ == 1);
+    outFmt = format::nc;
+  }
+  // with nc inputs the output dims are {bs_, oc_}; otherwise all four
+  memory::dims outDims = {bs_, oc_};
+  if (!sawNC) {
+    outDims = memory::dims{bs_, oc_, oh_, ow_};
+  }
+  auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
+  resetOutValue(out, outPD);
+}
+
+void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr<concat::primitive_desc>& pd,
+                                   std::vector<MKLDNNMatrixPtr>& inputs,
+                                   MKLDNNMatrixPtr out) {
+  std::vector<memory::primitive_desc> srcPDs;  // one per input, in order
+  for (const auto& input : inputs) {
+    srcPDs.push_back(input->getPrimitiveDesc());
+  }
+  CHECK(out);
+  pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
+  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+}
+
+void MKLDNNConcatLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<concat::primitive_desc>& pd,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& out) {
+  std::vector<primitive::at> sources;  // the inputs, in their original order
+  for (const auto& input : inputs) {
+    sources.push_back(*input);
+  }
+  fwd_.reset(new concat(*pd, sources, *out));  // keep the primitive in fwd_
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                        MKLDNNMatrixPtr& out) {
+  CHECK(outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());  // same desc as outVal_
+  CHECK(out);
+  // Reset one input gradient per input layer, each using its value's desc.
+  inputs.resize(inputLayers_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    CHECK(inVals_[i]);
+    // resetInGrad will use inVal_, so point it at the current input first
+    // TODO(TJ): move inVals_ to MKLDNNLayer and remove inVal_
+    inVal_ = inVals_[i];
+    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
+    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
+  }
+  // change back: inVal_ always holds input 0
+  inVal_ = inVals_[0];
+}
+
+void MKLDNNConcatLayer::resetBwdPipeline(
+    std::vector<mkldnn::primitive>& pipeline,
+    std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& out) {
+  // Backward of concat: reorder a view of `out` into each input gradient.
+  memory::dims start = {0, 0, 0, 0};
+  prims.resize(inputs.size());
+  CHECK_EQ(inputs.size(), channels_.size());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    const MKLDNNMatrixPtr& inGrad = inputs[i];
+    auto slicePD =
+        view::primitive_desc(out->getPrimitiveDesc(), inGrad->getDims(), start);
+    auto copyPD = reorder::primitive_desc(slicePD.dst_primitive_desc(),
+                                          inGrad->getPrimitiveDesc());
+    prims[i].reset(new reorder(copyPD, *out, *inGrad));
+    pipeline.push_back(*prims[i]);
+    start[axis_] += channels_[i];  // next view begins after this input
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.h b/paddle/gserver/layers/MKLDNNConcatLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5749d327e4259b81541a234f48a4538ab035fe4
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+/**
+ * @brief MKLDNNLayer subclass that concatenates its inputs along axis_.
+ *
+ * The config file api is mkldnn_concat
+ */
+class MKLDNNConcatLayer : public MKLDNNLayer {
+protected:
+  std::vector<MKLDNNMatrixPtr> inVals_;   // value buffer of each input
+  std::vector<MKLDNNMatrixPtr> inGrads_;  // gradient buffer of each input
+  std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;  // one per input
+  // channel count of each input
+  std::vector<int> channels_;
+
+  // concat_dimension in MKLDNN
+  // if axis_ == 0, concat batchsize
+  // if axis_ == 1, concat channel (default)
+  int axis_;
+
+public:
+  explicit MKLDNNConcatLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), axis_(1) {}
+
+  ~MKLDNNConcatLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+
+  void printSizeInfo() override {  // VLOG the sizes of all inputs and output
+    CHECK_EQ(channels_.size(), inputLayers_.size());
+    for (size_t i = 0; i < channels_.size(); ++i) {
+      VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
+                         << ": " << bs_ << ", " << channels_[i] << ", " << ih_
+                         << ", " << iw_;
+    }
+    VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
+                       << ", " << ow_;
+  }
+
+  void printValueFormat() override {  // VLOG value formats, inputs first
+    for (size_t i = 0; i < inVals_.size(); ++i) {
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << inVals_[i]->getFormat() << " >>>";
+    }
+    if (outVal_) {
+      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
+    }
+    if (extOutVal_) {
+      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
+    }
+  }
+
+  void printGradFormat() override {  // VLOG gradient formats, output first
+    if (extOutGrad_) {
+      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
+    }
+    if (outGrad_) {
+      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
+    }
+    for (size_t i = 0; i < inGrads_.size(); ++i) {
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << inGrads_[i]->getFormat() << "<<<";
+    }
+  }
+
+protected:
+  /**
+   * Forward functions: reset buffers (inputs, output),
+   *                    reset primitive descriptor,
+   *                    reset pipeline.
+   */
+  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
+                  std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+
+  /**
+   * Backward functions: reset buffers (input gradients, output gradient),
+   *                     reset primitives and pipeline
+   */
+  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
index e75ac5ba4647a8267b7bc189893bd7adb5c3053f..cf42da0735282d667d6b87061c8c59bf2f96e0be 100644
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -21,8 +21,8 @@ namespace paddle {
 
 bool MKLDNNLayer::init(const LayerMap& layerMap,
                        const ParameterMap& parameterMap) {
-  CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
-                          << "Please set WITH_MKLDNN=ON "
+  CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn. "
+                          << "Please set WITH_MKL=ON "
                           << "and set use_mkldnn=True";
   CHECK(!useGpu_) << "Do not support GPU yet";
 
@@ -138,8 +138,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
   }
 }
 
-void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) {
-  const Argument& input = inputLayers_[0]->getOutput();
+void MKLDNNLayer::reshapeInput(int& batchsize,
+                               int& height,
+                               int& width,
+                               size_t inputIdx) {
+  const Argument& input = inputLayers_[inputIdx]->getOutput();
   batchsize = input.getBatchSize();
   int h = input.getFrameHeight();
   int w = input.getFrameWidth();
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 7479c34c92b5231b2521493bc631474d4efd4224..4c42df1bee75fa7b28c2001c30797cc0df7c5554 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -178,7 +178,10 @@ protected:
   /**
    * reshape the input image sizes and input batchsize
    */
-  void reshapeInput(int& batchsize, int& height, int& width);
+  void reshapeInput(int& batchsize,
+                    int& height,
+                    int& width,
+                    size_t inputIdx = 0);
 
   /**
    * reshape output image sizes
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 4bea348f637f39444e8aad89278e6366ecd73b1d..c295ea19c9ccb3d05c509a41925d2c36efdba8ef 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -29,7 +29,7 @@ gserver_test(test_KmaxSeqScore)
 gserver_test(test_Expand)
 gserver_test(test_MaxPoolingWithMaskOutput)
 
-########## test_Mkldnn layers and activations ##########
+########## test_MKLDNN layers and activations ##########
 if(WITH_MKLDNN)
     add_unittest_without_exec(test_MKLDNN
         test_MKLDNN.cpp
@@ -62,17 +62,6 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
 endif()
 
 if(NOT MOBILE_INFERENCE)
-################### test_ProtoDataProvider ############
-    add_unittest_without_exec(test_ProtoDataProvider
-        test_ProtoDataProvider.cpp)
-
-    # test_ProtoDataProvider will mkdir as same name,
-    # so if WORKING_DIRECTORY is default directory, then
-    # mkdir will get error.
-    add_test(NAME test_ProtoDataProvider
-        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-
 ################## test_Evaluator #######################
     add_unittest(test_Evaluator
         test_Evaluator.cpp)
@@ -110,3 +99,24 @@ add_test(NAME test_PyDataProvider2
    COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
         WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
 )
+
+################# test_CompareSparse ##################
+# Registered by hand (not on Travis) so the binary runs via wrapper scripts.
+add_unittest_without_exec(test_CompareSparse test_CompareSparse.cpp)
+if(NOT ON_TRAVIS)
+  add_test(NAME test_CompareSparse
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+            ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
+            ./.set_port.sh -p port -n 6
+            ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endif()
+
+################ test_CompareTwoNets ######################
+# Registered by hand so the binary runs via the .set_python_path.sh wrapper.
+add_unittest_without_exec(test_CompareTwoNets test_CompareTwoNets.cpp)
+add_test(NAME test_CompareTwoNets
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+          ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
+          ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
+  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index ca55a45bc77b4e171619ab788d7c7dfeefcd036a..9d61533c0b6f20c41130d7b7c15ad93392b2d24c 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -23,7 +23,7 @@ limitations under the License. */
 namespace paddle {
 
 /**
- * @brief test the functionality of Mkldnnlayers
+ * @brief test the functionality of MKLDNNLayers and MKLDNNActivations
  * refer to paddle original function
  */
 class MKLDNNTester {
diff --git a/paddle/gserver/tests/proto_files.txt b/paddle/gserver/tests/proto_files.txt
deleted file mode 100644
index 691b38c7940bd21360eb00384e060554aa4b3e22..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/proto_files.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-./test_ProtoDataProvider/data1.bin
-./test_ProtoDataProvider/data2.bin
diff --git a/paddle/gserver/tests/proto_files_compressed.txt b/paddle/gserver/tests/proto_files_compressed.txt
deleted file mode 100644
index 7413c81e185d02e0d03aefa06480b9722357c5eb..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/proto_files_compressed.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-./test_ProtoDataProvider/data1.bin.gz
-./test_ProtoDataProvider/data2.bin.gz
diff --git a/paddle/gserver/tests/sequence_lstm.conf b/paddle/gserver/tests/sequence_lstm.conf
new file mode 100644
index 0000000000000000000000000000000000000000..f49a827f22edce056eaf9903e99b732cab7f3784
--- /dev/null
+++ b/paddle/gserver/tests/sequence_lstm.conf
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+with open(dict_path, "r") as dict_fp:  # read once; handle is closed on exit
+    dict_lines = dict_fp.readlines()
+dict_file = {line.strip(): idx for idx, line in enumerate(dict_lines)}
+
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(dict_lines)  # number of lines in the dictionary file
+word_dim = 128
+hidden_dim = 256
+label_dim = 3
+sparse_update = get_config_arg("sparse_update", bool, False)
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data,
+    size=word_dim,
+    param_attr=ParamAttr(sparse_update=sparse_update))
+
+with mixed_layer(size=hidden_dim * 4) as lstm_input:
+    lstm_input += full_matrix_projection(input=emb)
+
+lstm = lstmemory(
+    input=lstm_input,
+    act=TanhActivation(),
+    gate_act=SigmoidActivation(),
+    state_act=TanhActivation())
+
+lstm_last = last_seq(input=lstm)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=lstm_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_recurrent.py b/paddle/gserver/tests/sequence_recurrent.py
new file mode 100644
index 0000000000000000000000000000000000000000..4895df186bfecc5cb5263676a9cd5bac5039d565
--- /dev/null
+++ b/paddle/gserver/tests/sequence_recurrent.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+with open(dict_path, "r") as dict_fp:  # read once; handle is closed on exit
+    dict_lines = dict_fp.readlines()
+dict_file = {line.strip(): idx for idx, line in enumerate(dict_lines)}
+
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(dict_lines)  # number of lines in the dictionary file
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
+
+# This config is designed to be equivalent with sequence_recurrent_group.py
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+
+recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
+
+recurrent_last = last_seq(input=recurrent)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=recurrent_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_recurrent_group.py b/paddle/gserver/tests/sequence_recurrent_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1d54542e3bc4e89f70d31d5e89c0f44953c9f90
--- /dev/null
+++ b/paddle/gserver/tests/sequence_recurrent_group.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+with open(dict_path, "r") as dict_fp:  # read once; handle is closed on exit
+    dict_lines = dict_fp.readlines()
+dict_file = {line.strip(): idx for idx, line in enumerate(dict_lines)}
+
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(dict_lines)  # number of lines in the dictionary file
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
+
+# This config is designed to be equivalent with sequence_recurrent.py
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+
+
+def step(y):
+    mem = memory(name="rnn_state", size=hidden_dim)
+    with mixed_layer(
+            name="rnn_state",
+            size=hidden_dim,
+            bias_attr=False,
+            act=SoftmaxActivation()) as out:
+        out += identity_projection(input=y)
+        out += full_matrix_projection(
+            input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
+    return out
+
+
+recurrent = recurrent_group(name="rnn", step=step, input=emb)
+
+recurrent_last = last_seq(input=recurrent)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=recurrent_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp
similarity index 98%
rename from paddle/trainer/tests/test_CompareSparse.cpp
rename to paddle/gserver/tests/test_CompareSparse.cpp
index 5f1834bd730375fc10762fc19788d0c693f8e752..c6e07650fc4805a25baf38b9059f6c996d00cafc 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/gserver/tests/test_CompareSparse.cpp
@@ -22,8 +22,7 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-static const string& configFile1 =
-    "trainer/tests/sample_trainer_config_compare_sparse.conf";
+static const string& configFile1 = "gserver/tests/sequence_lstm.conf";
 
 DECLARE_bool(use_gpu);
 DECLARE_string(config);
diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/gserver/tests/test_CompareTwoNets.cpp
similarity index 95%
rename from paddle/trainer/tests/test_CompareTwoNets.cpp
rename to paddle/gserver/tests/test_CompareTwoNets.cpp
index 94f65e545d116c802fb4877dc14f07aaaf83a4fb..801d9607565910b1f7f68a9c4532de5877e44f30 100644
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/gserver/tests/test_CompareTwoNets.cpp
@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu);
 DECLARE_string(config);
 DECLARE_string(nics);
 
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
 DEFINE_bool(need_high_accuracy,
             false,
             "whether need to run in double accuracy");
@@ -42,6 +40,10 @@ DEFINE_double(
 DECLARE_bool(thread_local_rand_use_global_seed);
 DECLARE_int32(seed);
 
+static const string& config_file_a = "gserver/tests/sequence_recurrent.py";
+static const string& config_file_b =
+    "gserver/tests/sequence_recurrent_group.py";
+
 struct ComData {
   vector<Argument> outArgs;
   vector<ParameterPtr> parameters;
@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) {
   DataBatch dataBatch;
   int32_t batchSize = trainer.getConfig().opt_config().batch_size();
 
+  trainer.getDataProvider()->reset();
   trainer.getDataProvider()->setSkipShuffle();
   trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
 
@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
 
 TEST(Trainer, create) {
   ComData dataA;
-  calcGradient(dataA, FLAGS_config_file_a);
+  calcGradient(dataA, config_file_a);
   LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
 
   ComData dataB;
-  calcGradient(dataB, FLAGS_config_file_b);
+  calcGradient(dataB, config_file_b);
   LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
 
   compareGradient(dataA, dataB);
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 3517d293e3c901caaa19952b04e56d1ef0d2b46e..fb4eea6f67da9078ef43268a3a1603dc6ccfa652 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1081,6 +1081,21 @@ TEST(Layer, InterpolationLayer) {
   }
 }
 
+TEST(Layer, DotProdLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("dot_prod");
+  config.layerConfig.set_size(1);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "dot_prod", 10, false, useGpu);
+  }
+}
+
 TEST(Layer, OuterProdLayer) {
   TestConfig config;
   config.layerConfig.set_type("out_prod");
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index a859e34c8996d81f14bf1edcb6e23d5a4f687e6b..42644e9601a82ea81c417adc6441edeb036998e2 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) {
   testAddtoLayer({4, 12, 1, 1}, 3);
 }
 
+// Fill cfg for an "mkldnn_concat" layer: inputs must agree on bs, ih and iw;
+// the output size is (sum of input ic) * ih * iw. Needs at least two inputs.
+static void getMKLDNNConcatConfig(TestConfig& cfg,
+                                  const std::vector<testImageDesc>& inputs) {
+  CHECK_GE(inputs.size(), 2UL) << "at least two inputs";
+  int oc = inputs[0].ic;
+  for (size_t i = 1; i < inputs.size(); ++i) {
+    CHECK_EQ(inputs[i].bs, inputs[0].bs);
+    CHECK_EQ(inputs[i].ih, inputs[0].ih);
+    CHECK_EQ(inputs[i].iw, inputs[0].iw);
+    oc += inputs[i].ic;
+  }
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("mkldnn_concat");
+  cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
+  cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    cfg.inputDefs.push_back(
+        {INPUT_DATA,
+         "layer_" + std::to_string(i),
+         (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
+         0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(inputs[i].ic);
+    img_conf->set_img_size_y(inputs[i].ih);
+    img_conf->set_img_size(inputs[i].iw);
+  }
+}
+
+void testConcatLayer(const std::vector<testImageDesc>& inputs) {
+  TestConfig dnnConfig;
+  getMKLDNNConcatConfig(dnnConfig, inputs);
+  RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
+}
+
+TEST(MKLDNNLayer, ConcatLayer) {
+  testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
+  testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
+}
+
 void testActivation(std::string actType, const testImageDesc& pm) {
   // TODO(TJ): remove me when paddle support elu activation
   if (actType == "mkldnn_elu") {
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
deleted file mode 100644
index af6472619d1840e82787974d265d601b4a406c09..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ /dev/null
@@ -1,732 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-
-#include <gtest/gtest.h>
-
-#include "paddle/gserver/dataproviders/ProtoDataProvider.h"
-#include "paddle/utils/Util.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace std;  // NOLINT
-
-std::vector<string> protoFiles{
-    "./test_ProtoDataProvider/data1.bin", "./test_ProtoDataProvider/data2.bin",
-};
-std::vector<string> protoFilesCompressed{
-    "./test_ProtoDataProvider/data1.bin.gz",
-    "./test_ProtoDataProvider/data2.bin.gz",
-};
-
-const char* kTestDir = "./test_ProtoDataProvider";
-const char kProtoFileList[] = "gserver/tests/proto_files.txt";
-const char kProtoFileListCompressed[] =
-    "gserver/tests/proto_files_compressed.txt";
-const int kSpraseMatrixDim = 1024;
-
-using namespace paddle;  // NOLINT
-
-void prepareData(DataBatch* batch,
-                 const int* numPerSlotType,
-                 bool iid,
-                 bool useGpu) {
-  batch->clear();
-  int64_t size = uniformRandom(100) + 10;
-  batch->setSize(size);
-
-  ICpuGpuVectorPtr sequenceStartPositions;
-  ICpuGpuVectorPtr subSequenceStartPositions;
-  if (!iid) {
-    int numSeqs = uniformRandom(10) + 1;
-    sequenceStartPositions =
-        ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
-    int* buf = sequenceStartPositions->getMutableData(false);
-    subSequenceStartPositions =
-        ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
-    int* subBuf = subSequenceStartPositions->getMutableData(false);
-    int64_t pos = 0;
-    int maxLen = 2 * size / numSeqs;
-    for (int i = 0; i < numSeqs; ++i) {
-      int len =
-          uniformRandom(min<int64_t>(maxLen, size - pos - numSeqs + i)) + 1;
-      buf[i] = pos;
-      subBuf[i] = pos;
-      pos += len;
-      VLOG(1) << " len=" << len;
-    }
-    buf[numSeqs] = size;
-    subBuf[numSeqs] = size;
-  }
-
-  vector<Argument>& arguments = batch->getStreams();
-  for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_DENSE]; ++i) {
-    int64_t dim = rand() % 10 + 4;  // NOLINT rand_r
-    MatrixPtr mat = Matrix::create(size, dim, /* trans= */ false, false);
-    mat->randomizeUniform();
-    Argument arg;
-    arg.value = mat;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE]; ++i) {
-    MatrixPtr mat =
-        makeRandomSparseMatrix(size, kSpraseMatrixDim, false, useGpu);
-    Argument arg;
-    arg.value = mat;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arg.subSequenceStartPositions = subSequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE]; ++i) {
-    MatrixPtr mat =
-        makeRandomSparseMatrix(size, kSpraseMatrixDim, true, useGpu);
-    Argument arg;
-    arg.value = mat;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::STRING]; ++i) {
-    int64_t dim = rand() % 10 + 4;  // NOLINT rand_r
-    SVectorPtr vec = std::make_shared<std::vector<std::string>>();
-    for (int j = 0; j < size; ++j) {
-      vec->push_back(randStr(dim));
-    }
-    Argument arg;
-    arg.strs = vec;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::INDEX]; ++i) {
-    int64_t dim = rand() % 10 + 4;  // NOLINT rand_r
-    IVectorPtr vec = IVector::create(size, /* useGpu= */ false);
-    int* buf = vec->getData();
-    for (int j = 0; j < size; ++j) {
-      buf[j] = uniformRandom(dim);
-    }
-    Argument arg;
-    arg.ids = vec;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-}
-
-inline int getSlotDim(const Argument& arg) {
-  if (arg.value) {
-    return arg.value->getWidth();
-  } else if (arg.ids) {
-    return arg.ids->getMax() + 1;
-  } else if (arg.strs) {
-    return 1;
-  }
-  LOG(FATAL) << "Invalid argument";
-  return 0;
-}
-
-inline SlotDef::SlotType getSlotType(const Argument& arg) {
-  if (arg.value) {
-    auto& m = *arg.value;
-    auto& type = typeid(m);
-    if (type == typeid(CpuMatrix) || type == typeid(GpuMatrix)) {
-      return SlotDef::VECTOR_DENSE;
-    }
-    if (type == typeid(CpuSparseMatrix)) {
-      auto valueType =
-          std::dynamic_pointer_cast<CpuSparseMatrix>(arg.value)->getValueType();
-      if (NO_VALUE == valueType) {
-        return SlotDef::VECTOR_SPARSE_NON_VALUE;
-      } else {
-        return SlotDef::VECTOR_SPARSE_VALUE;
-      }
-    }
-    if (type == typeid(GpuSparseMatrix)) {
-      auto valueType =
-          std::dynamic_pointer_cast<GpuSparseMatrix>(arg.value)->getValueType();
-      if (NO_VALUE == valueType) {
-        return SlotDef::VECTOR_SPARSE_NON_VALUE;
-      } else {
-        return SlotDef::VECTOR_SPARSE_VALUE;
-      }
-    }
-
-    LOG(FATAL) << "Unknown matrix type";
-  }
-  if (arg.ids) return SlotDef::INDEX;
-  if (arg.strs) return SlotDef::STRING;
-  LOG(FATAL) << "Invalid argument";
-  return SlotDef::VECTOR_DENSE;
-}
-
-void getColRow(const Argument& arg,
-               int64_t pos,
-               bool useGpu,
-               int* colNum,
-               const int** rowCols,
-               const real** rowValues) {
-  SlotDef::SlotType type = getSlotType(arg);
-  GpuSparseMatrixPtr matGpu;
-  CpuSparseMatrixPtr matCpu;
-  if (useGpu) {
-    matGpu = dynamic_pointer_cast<GpuSparseMatrix>(arg.value);
-    ASSERT_TRUE(matGpu != NULL);
-  } else {
-    matCpu = dynamic_pointer_cast<CpuSparseMatrix>(arg.value);
-    ASSERT_TRUE(matCpu != NULL);
-  }
-  *colNum = useGpu ? matGpu->getColNum(pos) : matCpu->getColNum(pos);
-  *rowCols = useGpu ? matGpu->getRowCols(pos) : matCpu->getRowCols(pos);
-  if (type == SlotDef::VECTOR_SPARSE_VALUE) {
-    *rowValues = useGpu ? matGpu->getRowValues(pos) : matCpu->getRowValues(pos);
-  } else {
-    *rowValues = NULL;
-  }
-}
-
-void makeSample(const vector<Argument>& arguments,
-                int64_t pos,
-                bool isBeginning,
-                DataSample* sample,
-                bool useGpu) {
-  sample->set_is_beginning(isBeginning);
-  int slotid = 0;
-  for (auto& arg : arguments) {
-    SlotDef::SlotType type = getSlotType(arg);
-    int64_t dim = getSlotDim(arg);
-    switch (type) {
-      case SlotDef::VECTOR_DENSE: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        auto values = vecSlot->mutable_values();
-        values->Reserve(dim);
-        for (int i = 0; i < dim; ++i) {
-          values->AddAlreadyReserved(
-              static_cast<float>(arg.value->getElement(pos, i)));
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        sample->add_id_slots(arg.ids->get(pos));
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        auto ids = vecSlot->mutable_ids();
-        int colNum;
-        const int* rowCols;
-        const real* rowValues;  // nullptr
-        getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
-        ids->Reserve(colNum);
-        for (int i = 0; i < colNum; ++i) {
-          ids->AddAlreadyReserved(rowCols[i]);
-        }
-        SubseqSlot* subseqSlot = sample->add_subseq_slots();  // subseq
-        subseqSlot->set_slot_id(slotid);
-        auto lens = subseqSlot->mutable_lens();
-        lens->Add(colNum);
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        auto values = vecSlot->mutable_values();
-        auto ids = vecSlot->mutable_ids();
-        int colNum;
-        const int* rowCols;
-        const real* rowValues;
-        getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
-        ids->Reserve(colNum);
-        values->Reserve(colNum);
-        for (int i = 0; i < colNum; ++i) {
-          ids->AddAlreadyReserved(rowCols[i]);
-          values->AddAlreadyReserved(rowValues[i]);
-        }
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE:
-      case SlotDef::VAR_MDIM_INDEX: {
-        LOG(FATAL) << "Not implemented";
-        break;
-      }
-      case SlotDef::STRING: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        vecSlot->add_strs((*arg.strs)[pos]);
-        break;
-      }
-    }
-    slotid++;
-  }
-}
-
-void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) {
-  DataHeader header;
-  const vector<Argument>& arguments = batch.getStreams();
-  for (auto& argument : arguments) {
-    SlotDef* slotDef = header.add_slot_defs();
-    slotDef->set_type(getSlotType(argument));
-    slotDef->set_dim(getSlotDim(argument));
-  }
-  VLOG(1) << "header=" << header.DebugString();
-
-  int64_t totalSeqs = batch.getNumSequences();
-  int64_t seq = 0;
-  ICpuGpuVectorPtr sequenceStartPositions = arguments[0].sequenceStartPositions;
-  int64_t numWritten = 0;
-  vector<string> curProtoFiles =
-      dataCompression ? protoFilesCompressed : protoFiles;
-  for (size_t i = 0; i < curProtoFiles.size(); ++i) {
-    int64_t numSeqs = totalSeqs * (i + 1) / curProtoFiles.size() -
-                      totalSeqs * i / curProtoFiles.size();
-    ofstream os(curProtoFiles[i]);
-    CHECK(os) << "Fail to open " << curProtoFiles[i];
-    unique_ptr<ProtoWriter> writer(new ProtoWriter(&os, dataCompression));
-    CHECK(writer->write(header));
-    for (int j = 0; j < numSeqs; ++j, ++seq) {
-      int64_t begin = seq;
-      int64_t end = seq + 1;
-      if (sequenceStartPositions) {
-        begin = sequenceStartPositions->getElement(seq);
-        end = sequenceStartPositions->getElement(seq + 1);
-      }
-      for (int pos = begin; pos < end; ++pos) {
-        DataSample sample;
-        makeSample(arguments, pos, pos == begin, &sample, useGpu);
-        CHECK(writer->write(sample));
-        ++numWritten;
-      }
-    }
-
-    writer.reset(nullptr);
-    os.close();
-  }
-  CHECK_EQ(arguments[0].getBatchSize(), numWritten);
-}
-
-// check that the sample at pos1 in args1 is same as the sample at pos2 in args2
-void checkSample(const vector<Argument>& args1,
-                 int64_t pos1,
-                 const vector<Argument>& args2,
-                 int64_t pos2,
-                 bool useGpu) {
-  EXPECT_EQ(args1.size(), args2.size());
-  VLOG(1) << " pos1=" << pos1 << " pos2=" << pos2;
-
-  for (size_t i = 0; i < args1.size(); ++i) {
-    auto type = getSlotType(args1[i]);
-    int dim = getSlotDim(args1[i]);
-    EXPECT_EQ(type, getSlotType(args2[i]));
-    if (type == SlotDef::INDEX) {
-      EXPECT_GE(dim, getSlotDim(args2[i]));
-    } else {
-      EXPECT_EQ(dim, getSlotDim(args2[i]));
-    }
-    switch (type) {
-      case SlotDef::VECTOR_DENSE: {
-        for (int j = 0; j < dim; ++j) {
-          EXPECT_EQ(static_cast<float>(args1[i].value->getElement(pos1, j)),
-                    static_cast<float>(args2[i].value->getElement(pos2, j)));
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        EXPECT_EQ(args1[i].ids->get(pos1), args2[i].ids->get(pos2));
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE:
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        int colNum1, colNum2;
-        const int *rowCols1, *rowCols2;
-        const real *rowValues1, *rowValues2;
-        getColRow(args1[i], pos1, useGpu, &colNum1, &rowCols1, &rowValues1);
-        getColRow(args2[i], pos2, useGpu, &colNum2, &rowCols2, &rowValues2);
-        EXPECT_EQ(colNum1, colNum2);
-        for (int j = 0; j < colNum1; ++j) {
-          EXPECT_EQ(rowCols1[j], rowCols2[j]);
-          if (type == SlotDef::VECTOR_SPARSE_VALUE) {
-            EXPECT_EQ(rowValues1[j], rowValues2[j]);
-          }
-        }
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE:
-      case SlotDef::VAR_MDIM_INDEX: {
-        LOG(FATAL) << "Not implemented";
-        break;
-      }
-      case SlotDef::STRING: {
-        EXPECT_EQ((*args1[i].strs)[pos1], (*args2[i].strs)[pos2]);
-        break;
-      }
-    }
-  }
-}
-
-void testProtoDataProvider(int* numPerSlotType,
-                           bool iid,
-                           bool async,
-                           bool useGpu,
-                           bool dataCompression,
-                           int numConstantSlots = 0) {
-  mkDir(kTestDir);
-  DataBatch data;
-
-  prepareData(&data, numPerSlotType, iid, useGpu);
-  writeData(data, useGpu, dataCompression);
-
-  DataConfig config;
-  config.set_type("proto");
-  config.set_files(dataCompression ? kProtoFileListCompressed : kProtoFileList);
-  config.set_async_load_data(async);
-
-  for (int i = 0; i < numConstantSlots; ++i) {
-    config.add_constant_slots(i + 11);
-    MatrixPtr w = Matrix::create(data.getSize(),
-                                 1,
-                                 /* trans= */ false,
-                                 /* useGpu= */ false);
-    w->assign(config.constant_slots(i));
-    data.appendData(w);
-  }
-
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  dataProvider->setSkipShuffle();
-
-  EXPECT_EQ(data.getSize(), dataProvider->getSize());
-
-  int64_t batchSize = 10;
-  DataBatch batch;
-
-  size_t seq1 = 0;
-  vector<Argument>& args1 = data.getStreams();
-  ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
-
-  dataProvider->reset();
-
-  while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
-    CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
-    vector<Argument>& args2 = batch.getStreams();
-    ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
-    for (auto& arg : args2) {
-      EXPECT_EQ(iid, !arg.sequenceStartPositions);
-    }
-    size_t numSeqs = batch.getNumSequences();
-    VLOG(1) << "numSeqs=" << numSeqs;
-    for (size_t seq2 = 0; seq2 < numSeqs; ++seq1, ++seq2) {
-      int64_t begin1 = seq1;
-      int64_t end1 = seq1 + 1;
-      if (sequenceStartPositions1) {
-        begin1 = sequenceStartPositions1->getElement(seq1);
-        end1 = sequenceStartPositions1->getElement(seq1 + 1);
-        EXPECT_LT(seq1, sequenceStartPositions1->getSize() - 1);
-      }
-
-      int64_t begin2 = seq2;
-      int64_t end2 = seq2 + 1;
-      if (sequenceStartPositions2) {
-        begin2 = sequenceStartPositions2->getElement(seq2);
-        end2 = sequenceStartPositions2->getElement(seq2 + 1);
-      }
-      VLOG(1) << " begin1=" << begin1 << " end1=" << end1
-              << " begin2=" << begin2 << " end2=" << end2;
-      EXPECT_EQ(end1 - begin1, end2 - begin2);
-      for (int i = 0; i < end1 - begin1; ++i) {
-        checkSample(args1, begin1 + i, args2, begin2 + i, useGpu);
-      }
-    }
-  }
-
-  EXPECT_EQ(seq1, (size_t)data.getNumSequences());
-  rmDir(kTestDir);
-}
-
-TEST(ProtoDataProvider, test) {
-  int numSlotsArray[] = {0, 3};
-  int numTwoArray[] = {0, 1};
-  int numSlotsArraySize = sizeof(numSlotsArray) / sizeof(numSlotsArray[0]);
-  const int numSlot = 5;
-  int combination[numSlot] = {0};
-  int k = numSlot - 1;
-  while (k >= 0) {
-    int numDenseVecSlots = numSlotsArray[combination[0]];
-    int numSparseNonValueVecSlots = numSlotsArray[combination[1]];
-    int numSparseValueVectorSlots = numSlotsArray[combination[2]];
-    int numStrSlots = numSlotsArray[combination[3]];
-    int numIdSlots = numSlotsArray[combination[4]];
-    // while loop : traverse all cases
-    k = numSlot - 1;
-    while (k >= 0) {
-      if (combination[k] < (numSlotsArraySize - 1)) {
-        ++combination[k];
-        break;
-      } else {
-        combination[k] = 0;
-        --k;
-      }
-    }
-    if (numDenseVecSlots + numSparseNonValueVecSlots +
-            numSparseValueVectorSlots + numStrSlots + numIdSlots <
-        1)
-      continue;
-    for (int iid : numTwoArray) {
-      for (int async : numTwoArray) {
-        for (int useGpu : numTwoArray) {
-          for (int dataCompression : numTwoArray) {
-            if (async && useGpu) {
-              // Currently in async mode, useGpu is not supported
-              continue;
-            }
-#ifndef PADDLE_WITH_CUDA
-            if (useGpu) {
-              continue;
-            }
-#endif
-            LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
-                      << " numSparseNonValueVecSlots="
-                      << numSparseNonValueVecSlots
-                      << " numSparseValueVectorSlots="
-                      << numSparseValueVectorSlots
-                      << " numStrSlots=" << numStrSlots
-                      << " numIdSlots=" << numIdSlots << " iid=" << iid
-                      << " async=" << async << " useGpu=" << useGpu
-                      << " dataCompression=" << dataCompression;
-            int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
-            numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
-                numSparseNonValueVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] =
-                numSparseValueVectorSlots;
-            numPerSlotType[SlotDef::INDEX] = numIdSlots;
-            numPerSlotType[SlotDef::STRING] = numStrSlots;
-            testProtoDataProvider(
-                numPerSlotType, iid, async, useGpu, dataCompression);
-          }  // end for (int dataCompression : numTwoArray)
-        }    // end for (int useGpu : numTwoArray)
-      }      // end for (int async : numTwoArray)
-    }        // end for (int iid : numTwoArray)
-  }          // end for (while, traverse all slots)
-}
-
-TEST(ProtoDataProvider, constant_slots) {
-  int numSlotsArray[] = {0, 3};
-  int numTwoArray[] = {0, 1};
-  for (int numDenseVecSlots : numSlotsArray) {
-    for (int numSparseNonValueVecSlots : numSlotsArray) {
-      if (numDenseVecSlots + numSparseNonValueVecSlots < 1) continue;
-      for (int numConstantSlots : {1, 2}) {
-        for (int useGpu : numTwoArray) {
-          for (int dataCompression : numTwoArray) {
-#ifndef PADDLE_WITH_CUDA
-            if (useGpu) {
-              continue;
-            }
-#endif
-            LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
-                      << " numSparseNonValueVecSlots="
-                      << numSparseNonValueVecSlots
-                      << " numConstantSlogs=" << numConstantSlots
-                      << " useGpu=" << useGpu
-                      << " dataCompression=" << dataCompression;
-            int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
-            numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
-                numSparseNonValueVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] = 1;
-            numPerSlotType[SlotDef::INDEX] = 1;
-            testProtoDataProvider(numPerSlotType,
-                                  /* iid= */ true,
-                                  /* async= */ false,
-                                  useGpu,
-                                  dataCompression,
-                                  numConstantSlots);
-          }  // end for (int dataCompression : numTwoArray)
-        }    // end for (int useGpu : numTwoArray)
-      }      // end for (int numConstantSlots : {1, 2})
-    }        // end for (int numSparseNonValueVecSlots : numSlotsArray)
-  }          // end for (int numDenseVecSlots : numSlotsArray)
-}
-
-void checkSampleSequence(const vector<Argument>& args1,
-                         const vector<Argument>& args2,
-                         int64_t offset,
-                         int64_t numSeqs,
-                         bool useGpu) {
-  // check slot num are equal
-  EXPECT_EQ(args1.size(), args2.size());
-  for (size_t i = 0; i < args1.size(); i++) {
-    auto type = getSlotType(args1[i]);
-    // check for args2: sequenceStartPositions vs numSeqs
-    // (1) size
-    EXPECT_EQ(args2[i].sequenceStartPositions->getSize(), (size_t)numSeqs + 1);
-    // (2) content
-    auto checkArgContent = [&](const Argument& args, int numSeqs) {
-      for (int j = 0; j <= numSeqs; j++) {
-        int start_pos = args.sequenceStartPositions->getElement(j);
-        EXPECT_EQ(start_pos, j);
-      }
-    };
-    switch (type) {
-      case SlotDef::INDEX: {
-        // args1: for label
-        checkArgContent(args2[i], numSeqs);
-        // check for args2: ids are equal to args1[offset]
-        // (1) size
-        EXPECT_EQ(args2[i].ids->getSize(), (size_t)numSeqs);
-        // (2) content
-        for (int j = 0; j < numSeqs; j++) {
-          EXPECT_EQ(args2[i].ids->get(j), args1[i].ids->get(offset + j));
-        }
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        // args1: for sparse_non_value
-        // args2 should put sparse indexes in ids
-        int colNum1;
-        const int* rowCols1;
-        const real* rowValues1;  // nullptr
-        int totalLength = 0;
-        for (int j = 0; j < numSeqs; j++) {
-          getColRow(
-              args1[i], offset + j, useGpu, &colNum1, &rowCols1, &rowValues1);
-          // (1) lengths
-          EXPECT_EQ(totalLength,
-                    args2[i].sequenceStartPositions->getElement(j));
-          EXPECT_EQ(totalLength,
-                    args2[i].subSequenceStartPositions->getElement(j));
-          // (2) content
-          for (int k = 0; k < colNum1; k++) {
-            EXPECT_EQ(rowCols1[k], args2[i].ids->get(totalLength + k));
-          }
-          totalLength += colNum1;
-          if (colNum1 == 0) {
-            // special case here: we will put a "-1" into ids when column num is
-            // zero. see ProtoSequenceDataProvider::getNextBatchInternal.
-            EXPECT_EQ(-1, args2[i].ids->get(totalLength));
-            totalLength++;
-          }
-        }
-        EXPECT_EQ(totalLength,
-                  args2[i].sequenceStartPositions->getElement(numSeqs));
-        EXPECT_EQ(totalLength,
-                  args2[i].subSequenceStartPositions->getElement(numSeqs));
-        break;
-      }
-      case SlotDef::VECTOR_DENSE: {
-        // args1: for dense vector
-        checkArgContent(args2[i], numSeqs);
-        // check for args2: values are equal to args1[offset]
-        // (1) size
-        EXPECT_EQ(args2[i].value->getHeight(), (size_t)numSeqs);
-        EXPECT_EQ(args2[i].value->getWidth(), (size_t)getSlotDim(args1[i]));
-        // (2) content
-        for (int j = 0; j < numSeqs; j++) {
-          for (size_t k = 0; k < args2[i].value->getWidth(); k++) {
-            EXPECT_EQ(
-                static_cast<float>(args1[i].value->getElement(j + offset, k)),
-                static_cast<float>(args2[i].value->getElement(j, k)));
-          }
-        }
-        break;
-      }
-      default: { EXPECT_EQ(true, false) << "should not reach here"; }
-    }
-  }
-}
-
-void testProtoSequenceDataProvider(int* numPerSlotType,
-                                   bool async,
-                                   bool useGpu) {
-  mkDir(kTestDir);
-  DataBatch data;
-
-  prepareData(&data,
-              numPerSlotType,
-              /* iid */ true,
-              useGpu);
-  writeData(data, useGpu, /* dataCompression */ false);
-
-  DataConfig config;
-  config.set_type("proto_sequence");
-  config.set_files(kProtoFileList);
-  config.set_async_load_data(async);
-
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  dataProvider->setSkipShuffle();
-
-  EXPECT_EQ(data.getSize(), dataProvider->getSize());
-
-  int64_t batchSize = 10;
-  DataBatch batch;
-
-  vector<Argument>& args1 = data.getStreams();
-  ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
-
-  dataProvider->reset();
-
-  size_t args1Offset = 0;
-  while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
-    CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
-    vector<Argument>& args2 = batch.getStreams();
-    ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
-    for (auto& arg : args1) {
-      // args1 should not has sequence
-      EXPECT_EQ(true, !arg.sequenceStartPositions);
-    }
-    for (auto& arg : args2) {
-      // args2 should has sequence
-      EXPECT_NE(true, !arg.sequenceStartPositions);
-    }
-    size_t numSeqs = batch.getNumSequences();
-    checkSampleSequence(args1, args2, args1Offset, numSeqs, useGpu);
-    args1Offset += numSeqs;
-  }
-
-  EXPECT_EQ(args1Offset, (size_t)data.getNumSequences());
-  rmDir(kTestDir);
-}
-
-TEST(ProtoSequenceDataProvider, test) {
-  int numSlotsArray[] = {0, 3};
-  int numTwoArray[] = {0, 1};
-  for (int numSparseNonValueVecSlots : numSlotsArray) {
-    for (int numIdSlots : numSlotsArray) {
-      for (int numDenseVecSlots : numSlotsArray) {
-        if (numDenseVecSlots + numSparseNonValueVecSlots + numIdSlots < 1)
-          continue;
-        for (int async : numTwoArray) {
-          for (int useGpu : numTwoArray) {
-            if (async && useGpu) {
-              // Currently in async mode, useGpu is not supported
-              continue;
-            }
-#ifndef PADDLE_WITH_CUDA
-            if (useGpu) {
-              continue;
-            }
-#endif
-            LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
-                      << " numSparseNonValueVecSlots="
-                      << numSparseNonValueVecSlots
-                      << " numIdSlots=" << numIdSlots << " async=" << async
-                      << " useGpu=" << useGpu;
-            int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
-            numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
-                numSparseNonValueVecSlots;
-            numPerSlotType[SlotDef::INDEX] = numIdSlots;
-            testProtoSequenceDataProvider(numPerSlotType, async, useGpu);
-          }  // end for (int useGpu : numTwoArray)
-        }    // end for (int async : numTwoArray)
-      }      // end for (int numDenseVecSlots : numSlotsArray)
-    }        // end for (int numIdSlots : numSlotsArray)
-  }          // end for (int numSparseNonValueVecSlots : numSlotsArray)
-}
diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp
index 4adaaef9838f0d178468af3af142031325bfc11d..a2ef731ecbcd18ca4bd0b2381de04650a2686c2d 100644
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -17,9 +17,13 @@ limitations under the License. */
 #include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
 
+#ifndef PADDLE_MOBILE_INFERENCE
 DEFINE_int32(pool_limit_size,
              536870912,
              "maximum memory size managed by a memory pool, default is 512M");
+#else
+DEFINE_int32(pool_limit_size, 0, "default is 0");
+#endif
 
 namespace paddle {
 
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index a719da2560291dbc7e98aadfae41d4692d8afcad..46c2833030c936119e98adcdd338245bbdaddce7 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -61,6 +61,18 @@ function(op_library TARGET)
         set(pybind_flag 1)
     endif()
 
+    if ("${TARGET}" STREQUAL "compare_op")
+        set(pybind_flag 1)
+        file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
+    endif()
+
+    # conv_op contains several operators
+    if ("${TARGET}" STREQUAL "conv_op")
+        set(pybind_flag 1)
+        # It's enough to just add one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
+    endif()
+
     # pool_op contains several operators
     if ("${TARGET}" STREQUAL "pool_op")
         set(pybind_flag 1)
@@ -68,9 +80,11 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
     endif()
 
-    if ("${TARGET}" STREQUAL "compare_op")
+    # pool_cudnn_op contains several operators
+    if ("${TARGET}" STREQUAL "pool_cudnn_op")
         set(pybind_flag 1)
-        file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
+        # It's enough to just add one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
     endif()
 
     # pool_with_index_op contains several operators
@@ -80,25 +94,18 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
     endif()
 
-    # conv_op contains several operators
-    if ("${TARGET}" STREQUAL "conv_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
-    endif()
-
     # conv_transpose_op contains several operators
     if ("${TARGET}" STREQUAL "conv_transpose_op")
         set(pybind_flag 1)
         # It's enough to just adding one operator to pybind
         file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
     endif()
-    
-    # pool_cudnn_op contains several operators
-    if ("${TARGET}" STREQUAL "pool_cudnn_op")
+
+    # conv_transpose_cudnn_op contains two operators
+    if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
         set(pybind_flag 1)
         # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
+        file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
     endif()
 
     # save_restore_op contains several operators
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
index 666043e824f885e9c0e79e319d0a38ba108c209a..233a81198e336d3190565fb18556f96979cec0ce 100644
--- a/paddle/operators/array_operator.h
+++ b/paddle/operators/array_operator.h
@@ -42,6 +42,7 @@ class ArrayOp : public framework::OperatorBase {
     } else {
       offset = static_cast<size_t>(*i_tensor.data<int64_t>());
     }
+    VLOG(10) << " Offset = " << offset;
     return offset;
   }
 };
diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h
index ffa4f43a327418498c1f110504127e7d2878409d..1113a4c6f357edb4f6b14b73c6eec9c6cca24ce5 100644
--- a/paddle/operators/bilinear_tensor_product_op.h
+++ b/paddle/operators/bilinear_tensor_product_op.h
@@ -174,7 +174,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
     // Caculate the gradient of Input(Bias).
     if (d_bias) {
       d_bias->mutable_data<T>(ctx.GetPlace());
-      auto d_bias_mat = EigenMatrix<T>::From(*d_bias);
+      auto d_bias_mat = framework::EigenVector<T>::Flatten(*d_bias);
       d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes<int, 1>(0));
     }
   }
diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc
index 2aec4a2760260623c4c7054c590afa8e1c6c3fea..4900f7b086c869b496c492743c71ab7047c5f672 100644
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -226,9 +226,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
+      // Because beta is zero, it is unnecessary to reset input_grad.
+
       for (int i = 0; i < groups; i++) {
         PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
             handle, &alpha, cudnn_filter_desc,
@@ -241,9 +240,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*filter_grad);
-      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
+      // Because beta is zero, it is unnecessary to reset filter_grad.
+
       for (int i = 0; i < groups; i++) {
         PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
             handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index 687d741cb22a081eab18c61752200b9fd48f68a7..7a36a9b21aa6a1b415ac5a232e65eda8051c87f8 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -225,11 +225,15 @@ REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
             ops::ConvOpGrad);
 
 REGISTER_OP_CPU_KERNEL(conv2d,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
 
 REGISTER_OP_CPU_KERNEL(conv3d,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
index 8e6f9da455b7291049aee57189dae15b8bcc2150..546451234a1ed1a4d3119cb175c6d37ae3f0aac1 100644
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -17,11 +17,15 @@
 namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(conv2d,
-                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
-    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
 
 REGISTER_OP_GPU_KERNEL(conv3d,
-                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
-    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc
similarity index 61%
rename from paddle/operators/conv2d_transpose_cudnn_op.cc
rename to paddle/operators/conv_transpose_cudnn_op.cc
index fce1357ce5af5f11ccc5941690431393301e6725..dbd1bc3c3bc2d026f13ddcf62919db6cf7d87bc5 100644
--- a/paddle/operators/conv2d_transpose_cudnn_op.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cc
@@ -23,7 +23,24 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
                               framework::OpAttrChecker* op_checker)
       : Conv2DTransposeOpMaker(proto, op_checker) {
     AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
-        .SetDefault(std::vector<int>{1, 1});
+        .SetDefault({1, 1});
+    AddAttr<int>("workspace_size_MB",
+                 "workspace size for cudnn, in MB, "
+                 "workspace is a section of GPU memory which will be "
+                 "allocated/freed each time the operator runs, larger "
+                 "workspace size can increase performance but also requires "
+                 "better hardward. This size should be carefully setted.")
+        .SetDefault(4096);
+  }
+};
+
+class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker {
+ public:
+  CudnnConv3DTransposeOpMaker(framework::OpProto* proto,
+                              framework::OpAttrChecker* op_checker)
+      : Conv3DTransposeOpMaker(proto, op_checker) {
+    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
+        .SetDefault({1, 1, 1});
     AddAttr<int>("workspace_size_MB",
                  "workspace size for cudnn, in MB, "
                  "workspace is a section of GPU memory which will be "
@@ -48,3 +65,14 @@ REGISTER_OP_CPU_KERNEL(
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose_cudnn_grad,
     ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+
+REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp,
+            ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad,
+            ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose_cudnn,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose_cudnn_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc
similarity index 92%
rename from paddle/operators/conv2d_transpose_cudnn_op.cu.cc
rename to paddle/operators/conv_transpose_cudnn_op.cu.cc
index eff058afc6cc5dacf2a054a33f352824865c1924..e2ba77086e737a07471f14e483cbd32ab1d4ee12 100644
--- a/paddle/operators/conv2d_transpose_cudnn_op.cu.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc
@@ -54,15 +54,21 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
     ScopedTensorDescriptor output_desc;
     ScopedFilterDescriptor filter_desc;
     ScopedConvolutionDescriptor conv_desc;
-    DataLayout layout = DataLayout::kNCHW;
+    DataLayout layout;
+
+    if (strides.size() == 2U) {
+      layout = DataLayout::kNCHW;
+    } else {
+      layout = DataLayout::kNCDHW;
+    }
 
-    // N, M, H, W
+    // (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()));
-    // N, C, O_h, O_w
+    // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
         layout, framework::vectorize2int(output->dims()));
-    // M, C, K_h, K_w
+    // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()));
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
@@ -136,13 +142,13 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
 
-    // Input: (N, M, H, W)
+    // Input: (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()));
-    // Output: (N, C, O_H, O_W)
+    // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
         layout, framework::vectorize2int(output_grad->dims()));
-    // Filter (M, C, K_H, K_W)
+    // Filter (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()));
 
@@ -200,8 +206,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      math::set_constant(ctx.device_context(), input_grad, 0);
-
+      // Because beta is zero, it is unnecessary to reset input_grad.
       PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
           handle, &alpha, cudnn_output_desc, output_grad_data,
           cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
@@ -212,8 +217,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      math::set_constant(ctx.device_context(), filter_grad, 0);
-
+      // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
       PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
           handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
@@ -234,3 +238,8 @@ REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn,
                        ops::CudnnConvTransposeOpKernel<float>);
 REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad,
                        ops::CudnnConvTransposeGradOpKernel<float>);
+
+REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn,
+                       ops::CudnnConvTransposeOpKernel<float>);
+REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad,
+                       ops::CudnnConvTransposeGradOpKernel<float>);
diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc
index 13ac0cd54cbeb8f68c2246f7e1d02f032266a72e..3e55ef036a7fb976117054574d1347fa943acd55 100644
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
@@ -30,11 +30,6 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
 
-  for (size_t i = 0; i < paddings.size(); ++i) {
-    PADDLE_ENFORCE_EQ(paddings[i], 0,
-                      "No Padding allowed in conv transpose op.");
-  }
-
   PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
                  "ConvTransposeOp intput should be 4-D or 5-D tensor.");
   PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
@@ -52,7 +47,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] +
+    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
                            filter_dims[i + 2]);
   }
   ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
@@ -190,17 +185,21 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
 
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
 
 REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
             conv3d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     conv3d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
     conv3d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/operators/conv_transpose_op.cu.cc
index 401cddb379ced134b800d2a078fe130a2850fbb2..4165eb0c7b048b83bbd94c57b971530043b66545 100644
--- a/paddle/operators/conv_transpose_op.cu.cc
+++ b/paddle/operators/conv_transpose_op.cu.cc
@@ -18,14 +18,18 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
     conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
     conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
 
 REGISTER_OP_GPU_KERNEL(
     conv3d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
     conv3d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h
index 4b2bd60437da8f58054d8cdd5e6ba1fdac05f0d5..ab336ad23ce1c180b68d04e4c85b299e301d5376 100644
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
@@ -62,7 +62,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     Tensor* output = context.Output<Tensor>("Output");
 
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    // Actually, no paddings and groups allowed in conv transpose.
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     // TODO(Zhuoyuan): Paddings can be added in future.
     // groups will alway be disabled in conv2dtranspose.
@@ -148,8 +147,8 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
       } else if (filter_shape_vec.size() == 3) {
         // col2vol: col_matrix -> dy
         // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
-        col2vol(context.device_context(), col, dilations, strides,
-                std::vector<int>{0, 0, 0}, &output_batch);
+        col2vol(context.device_context(), col, dilations, strides, paddings,
+                &output_batch);
       }
     }
   }
@@ -173,7 +172,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     if ((!input_grad) && (!filter_grad)) return;
 
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    // Actually, no paddings and groups allowed in conv transpose.
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
 
     const int batch_size = static_cast<int>(input->dims()[0]);
diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h
index 68c56f531f941e1b8f66ac7ba6bf318881642c4f..62a4e484eceeabc4cc26e68ac54a50be1ac95df7 100644
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -132,7 +132,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
       // compute dy
       if (out_grad_y) {
         out_grad_y->mutable_data<T>(context.GetPlace());
-        auto dy = EigenMatrix<T>::Reshape(*out_grad_y, 1);
+        auto dy = EigenVector<T>::Flatten(*out_grad_y);
         auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast;
         dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({{0}}));
       }
diff --git a/paddle/operators/detail/safe_ref.h b/paddle/operators/detail/safe_ref.h
new file mode 100644
index 0000000000000000000000000000000000000000..b71af17309f9f46b5c87f0f479d4e03443fa7f93
--- /dev/null
+++ b/paddle/operators/detail/safe_ref.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace operators {
+namespace detail {
+/**
+ * Dereference `ptr` after enforcing that it is not null. `args` holds an
+ * optional printf-format error message that is passed to PADDLE_ENFORCE.
+ */
+template <typename T, typename... ARGS>
+inline T &Ref(T *ptr, ARGS &&... args) {
+  PADDLE_ENFORCE(ptr != nullptr, args...);
+  return *ptr;
+}
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc
index 85871ebbfcd8ee38ef5e8078d1d6cb6bdda46a7b..985b5d1e865e513d833bff72dcd20a8f20851d8c 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -101,4 +101,7 @@ REGISTER_OPERATOR(fill_constant_batch_size_like,
 REGISTER_OP_CPU_KERNEL(
     fill_constant_batch_size_like,
     ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, double>);
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, double>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, int>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace,
+                                           int64_t>);
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
index 87e3697e2832e7c60a4293fe7126ae4c9c053e4d..9e7a1eeab863c962ca72908e561e12a04d5021c5 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cu.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
@@ -19,4 +19,7 @@ namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
     fill_constant_batch_size_like,
     ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, double>);
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, double>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, int>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace,
+                                           int64_t>);
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc
index 8ab39d4fb012b8fa3883f33e4d15be7918500354..95fb5932b8b555e1357adc9fdfb7b6e6db7da71d 100644
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -54,5 +54,8 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp,
                              ops::FillZerosLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    fill_zeros_like,
-    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, float>);
+    fill_zeros_like, ops::FillZerosLikeKernel<paddle::platform::CPUPlace, int>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, int64_t>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, float>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, double>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, bool>);
diff --git a/paddle/operators/fill_zeros_like_op.cu.cc b/paddle/operators/fill_zeros_like_op.cu.cc
index 2adb40cf90b42a5ba608302f7985346c949ff6ed..1501a17441072223ba0e8cf5b6c8cdd5e903a467 100644
--- a/paddle/operators/fill_zeros_like_op.cu.cc
+++ b/paddle/operators/fill_zeros_like_op.cu.cc
@@ -17,5 +17,8 @@
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    fill_zeros_like,
-    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
+    fill_zeros_like, ops::FillZerosLikeKernel<paddle::platform::GPUPlace, int>,
+    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, int64_t>,
+    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, float>,
+    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, double>,
+    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, bool>);
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
index 55e9cc4a98bd6d36ce5d6bb4116039d0ec18b485..1b18368e0e16365682520b62a7f6adab0cbb527f 100644
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -24,8 +24,17 @@
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+template <typename Place, typename T>
+inline void ReorderInitState(const platform::DeviceContext& ctx,
+                             const framework::Tensor& src, const size_t* index,
+                             framework::Tensor* dst, bool indexed_src) {
+  math::CopyMatrixRowsFunctor<Place, T> row_shuffle;
+  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+  row_shuffle(ctx, src, index, *dst, indexed_src);
+}
 
 template <typename Place, typename T>
 class GRUKernel : public framework::OpKernel<T> {
@@ -33,7 +42,6 @@ class GRUKernel : public framework::OpKernel<T> {
   void BatchCompute(const framework::ExecutionContext& context) const {
     auto* input = context.Input<LoDTensor>("Input");
     auto* h0 = context.Input<Tensor>("H0");
-    const T* h0_data = h0 ? h0->data<T>() : nullptr;
     auto* weight = context.Input<Tensor>("Weight");
     const T* weight_data = weight->data<T>();
     auto* bias = context.Input<Tensor>("Bias");
@@ -66,7 +74,18 @@ class GRUKernel : public framework::OpKernel<T> {
     gru_value.gateWeight = const_cast<T*>(weight_data);
     gru_value.stateWeight =
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
-    gru_value.prevOutValue = const_cast<T*>(h0_data);
+    Tensor ordered_h0;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (h0) {
+      // The batch computation for GRU reorders the input sequences
+      // according to their length, so the initial hidden state (H0)
+      // must be reordered in the same way.
+      ReorderInitState<Place, T>(context.device_context(), *h0, order,
+                                 &ordered_h0, true);
+      gru_value.prevOutValue = ordered_h0.data<T>();
+    } else {
+      gru_value.prevOutValue = nullptr;
+    }
     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
     for (size_t n = 0; n < num_batch; n++) {
@@ -102,7 +121,6 @@ class GRUGradKernel : public framework::OpKernel<T> {
  public:
   void BatchCompute(const framework::ExecutionContext& context) const {
     auto* h0 = context.Input<Tensor>("H0");
-    const T* h0_data = h0 ? h0->data<T>() : nullptr;
     auto* weight = context.Input<Tensor>("Weight");
     const T* weight_data = weight->data<T>();
     auto* batch_gate = context.Input<LoDTensor>("BatchGate");
@@ -135,6 +153,17 @@ class GRUGradKernel : public framework::OpKernel<T> {
     zero(dev_ctx, &batch_gate_grad, static_cast<T>(0.0));
     zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
 
+    Tensor ordered_h0, ordered_h0_grad;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (h0) {
+      ReorderInitState<Place, T>(context.device_context(), *h0, order,
+                                 &ordered_h0, true);
+    }
+    if (h0_grad) {
+      ordered_h0_grad.mutable_data<T>(h0_grad->dims(), context.GetPlace());
+      zero(context.device_context(), &ordered_h0_grad, static_cast<T>(0.0));
+    }
+
     bool is_reverse = context.Attr<bool>("is_reverse");
     batch_hidden_grad.set_lod(batch_hidden->lod());
     to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
@@ -176,14 +205,9 @@ class GRUGradKernel : public framework::OpKernel<T> {
           batch_reset_hidden_prev_grad.Slice(bstart, bend);
       gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
       if (n == 0) {
-        gru_value.prevOutValue = const_cast<T*>(h0_data);
-        if (h0_grad) {
-          T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace());
-          zero(dev_ctx, h0_grad, static_cast<T>(0.0));
-          gru_grad.prevOutGrad = h0_grad_data;
-        } else {
-          gru_grad.prevOutGrad = nullptr;
-        }
+        gru_value.prevOutValue = h0 ? ordered_h0.data<T>() : nullptr;
+        gru_grad.prevOutGrad =
+            h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr;
       } else {
         int bstart_pre = static_cast<int>(batch_starts[n - 1]);
         Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
@@ -208,6 +232,10 @@ class GRUGradKernel : public framework::OpKernel<T> {
       math::ColwiseSum<Place, T> col_sum;
       col_sum(dev_ctx, batch_gate_grad, bias_grad);
     }
+    if (h0 && h0_grad) {
+      ReorderInitState<Place, T>(context.device_context(), ordered_h0_grad,
+                                 order, h0_grad, false);
+    }
   }
 
   void Compute(const framework::ExecutionContext& context) const override {
diff --git a/paddle/operators/is_empty_op.cc b/paddle/operators/is_empty_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54fecf44e881b5c283c81580fd161da9808d253e
--- /dev/null
+++ b/paddle/operators/is_empty_op.cc
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+constexpr char kInput[] = "X";
+constexpr char kOutput[] = "Out";
+
+class IsEmptyOp : public framework::OperatorBase {
+ public:
+  IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs,
+            const framework::VariableNameMap &outputs,
+            const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    // get input
+    auto *var = scope.FindVar(Input(kInput));
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto &tensor = var->Get<framework::LoDTensor>();
+    // get output
+    auto *out = scope.FindVar(Output(kOutput));
+    PADDLE_ENFORCE_NOT_NULL(out);
+    auto *out_tensor = out->GetMutable<framework::LoDTensor>();
+
+    out_tensor->Resize({1});
+    out_tensor->mutable_data<bool>(platform::CPUPlace())[0] =
+        framework::product(tensor.dims()) == 0;
+  }
+};
+
+class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IsEmptyOpProtoMaker(framework::OpProto *proto,
+                      framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kInput, "(Tensor) Tensor which is to be checked.");
+    AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not.");
+    AddComment(R"DOC(
+IsEmpty Operator which checks whether a tensor is empty.
+
+It will just return product(tensor.dims()) == 0;
+              )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp,
+                             paddle::operators::IsEmptyOpProtoMaker);
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index b9417f1d7fdc663fff751328d18239af3dbb1216..002b68fecf4f1e294387357f0346d9926a2b2b5a 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_subdirectory(detail)
 
 if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context)
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto)
     nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
     nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function)
     nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
@@ -15,7 +15,7 @@ if(WITH_GPU)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
     nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
 else()
-    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context)
+    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
     cc_library(softmax SRCS softmax.cc DEPS device_context)
     cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu
index 347df7a0ffdec163c0479a71ec775a813930ba5f..bf7894243919571c2ab15d53690b1ef05bfcc6ee 100644
--- a/paddle/operators/math/im2col.cu
+++ b/paddle/operators/math/im2col.cu
@@ -119,8 +119,8 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width,
 
   if (index < n) {
     T val = 0;
-    int w = index % im_width;
-    int h = (index / im_width) % im_height;
+    int w = index % im_width + padding_width;
+    int h = (index / im_width) % im_height + padding_height;
     int c = index / (im_width * im_height);
 
     // compute the start and end of the output
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index 5ee091788687133f6eaef7229d9f95e2025a2daf..2e333a8cde721f8e65dbf2cf5e3aac6272172cc0 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -250,6 +250,8 @@ void axpy<platform::CPUPlace, double>(const platform::DeviceContext& context,
 template struct SetConstant<platform::CPUPlace, float>;
 template struct SetConstant<platform::CPUPlace, double>;
 template struct SetConstant<platform::CPUPlace, int>;
+template struct SetConstant<platform::CPUPlace, int64_t>;
+template struct SetConstant<platform::CPUPlace, bool>;
 
 #define DEFINE_CPU_TRANS(RANK)                                \
   template struct Transpose<platform::CPUPlace, float, RANK>; \
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 38c04b97f9d07b9cca938b09f46ea81328a35322..58356a4b7783241ca0292829bf05dc1a8ed80c6c 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -256,6 +256,8 @@ void axpy<platform::GPUPlace, double>(const platform::DeviceContext& context,
 template struct SetConstant<platform::GPUPlace, float>;
 template struct SetConstant<platform::GPUPlace, double>;
 template struct SetConstant<platform::GPUPlace, int>;
+template struct SetConstant<platform::GPUPlace, int64_t>;
+template struct SetConstant<platform::GPUPlace, bool>;
 
 #define DEFINE_GPU_TRANS(RANK)                                \
   template struct Transpose<platform::GPUPlace, float, RANK>; \
diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc
index 8711567b95fea355396173b5312d26d31f9ffb12..f9d8af3e1c5db49873979fdfeb17a32d16341a1a 100644
--- a/paddle/operators/pool_cudnn_op.cu.cc
+++ b/paddle/operators/pool_cudnn_op.cu.cc
@@ -135,8 +135,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
 
     if (input_grad) {
       T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      math::SetConstant<paddle::platform::GPUPlace, T> set_zero;
-      set_zero(ctx.device_context(), input_grad, static_cast<T>(0));
+      // Because beta is zero, it is unnecessary to reset input_grad.
 
       PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward(
           handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc
index 990053b8af9f6bfb982eaf994374daa45ce6cbfb..cbe0b4233160dd1f3ebdf6db8b5f6df392efdfe7 100755
--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
@@ -42,7 +42,8 @@ class SequenceSliceOp : public framework::OperatorWithKernel {
         length_dim.size(), 2UL,
         "Only support one level sequence now, The rank of Length must be 2.");
 
-    // Initialize the output's dims to maximum
+    // Initialize the output's dims to the maximum (the input's dims);
+    // the kernel resets them to the real dims from Offset and Length.
     ctx->SetOutputDim("Out", input_dims);
     }
 
diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h
index c7d7ef4916756397c3d5911d7ec25305efc815c0..2ef2c8f0c4f3061c0082916edc18564698821bfc 100755
--- a/paddle/operators/sequence_slice_op.h
+++ b/paddle/operators/sequence_slice_op.h
@@ -143,6 +143,7 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
 
     if (x_grad) {
       x_grad->mutable_data<T>(ctx.GetPlace());
+      x_grad->set_lod(in->lod());
       math::SetConstant<Place, T> set_zero;
       set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
 
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index 9837f325e30f68ba927a540d395cc7d7e093a607..c2b7632b2865a3ef66051d815d7722a08c6a8cbd 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include "paddle/operators/sum_op.h"
 #include <vector>
 #include "paddle/framework/var_type_inference.h"
+#include "paddle/operators/detail/safe_ref.h"
 
 namespace paddle {
 namespace operators {
@@ -59,13 +60,16 @@ class SumOp : public framework::OperatorWithKernel {
               x_vars[0]->Get<framework::SelectedRows>().value().type()),
           ctx.device_context());
     } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
-      auto& array = x_vars[0]->Get<framework::LoDTensorArray>();
-      for (auto& each : array) {
-        if (each.numel() != 0) {
-          return framework::OpKernelType(framework::ToDataType(each.type()),
-                                         ctx.device_context());
+      for (auto& x_var : x_vars) {
+        auto& array = x_var->Get<framework::LoDTensorArray>();
+        for (auto& each : array) {
+          if (each.numel() != 0) {
+            return framework::OpKernelType(framework::ToDataType(each.type()),
+                                           ctx.device_context());
+          }
         }
       }
+      PADDLE_THROW("Cannot find the input data type by all input data");
     }
     PADDLE_THROW("Unexpected branch. Input type is %s",
                  x_vars[0]->Type().name());
@@ -96,6 +100,11 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
     auto& inputs = op_desc.Input("X");
     auto var_type = framework::VarDesc::SELECTED_ROWS;
 
+    for (auto& name : op_desc.Input("X")) {
+      VLOG(10) << name << " "
+               << block->FindRecursiveOrCreateVar(name)->GetType();
+    }
+
     bool any_input_is_lod_tensor = std::any_of(
         inputs.begin(), inputs.end(), [block](const std::string& name) {
           return block->FindRecursiveOrCreateVar(name)->GetType() ==
@@ -103,7 +112,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
         });
 
     auto is_tensor_array = [block](const std::string& name) {
-      return block->FindRecursiveOrCreateVar(name)->GetType() ==
+      return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() ==
              framework::VarDesc::LOD_TENSOR_ARRAY;
     };
 
@@ -113,14 +122,26 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
         std::all_of(inputs.begin(), inputs.end(), is_tensor_array);
 
     if (any_input_is_tensor_array) {
-      PADDLE_ENFORCE(all_inputs_are_tensor_array);
+      if (!all_inputs_are_tensor_array) {
+        std::ostringstream os;
+        for (auto& each : inputs) {
+          os << "    " << each << " type is "
+             << detail::Ref(block->FindRecursiveOrCreateVar(each)).GetType()
+             << "\n";
+        }
+        PADDLE_ENFORCE(all_inputs_are_tensor_array,
+                       "Not all inputs are tensor array:\n%s", os.str());
+      }
       var_type = framework::VarDesc::LOD_TENSOR_ARRAY;
     } else if (any_input_is_lod_tensor) {
       var_type = framework::VarDesc::LOD_TENSOR;
     }
 
     auto out_var_name = op_desc.Output("Out").front();
-    block->FindRecursiveOrCreateVar(out_var_name)->SetType(var_type);
+    auto& out_var = detail::Ref(block->FindRecursiveOrCreateVar(out_var_name));
+    out_var.SetType(var_type);
+    auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front()));
+    out_var.SetDataType(in_var.GetDataType());
   }
 };
 
diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc
index 62e15604c47f25c458abc69ecd1cabf964de39bb..ae1b48d7a8e3d573a5134a822a2ed5ef70511077 100644
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -12,7 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 #include "paddle/operators/array_operator.h"
-
+#include "paddle/operators/detail/safe_ref.h"
 namespace paddle {
 namespace operators {
 
@@ -33,6 +33,8 @@ class WriteToArrayOp : public ArrayOp {
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
     if (offset >= out->size()) {
+      VLOG(10) << "Resize " << Output("Out") << " from " << out->size()
+               << " to " << offset + 1;
       out->resize(offset + 1);
     }
     auto *out_tensor = &out->at(offset);
@@ -85,11 +87,15 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDescBind &op_desc,
                   framework::BlockDescBind *block) const override {
-    for (auto &out_var : op_desc.OutputArgumentNames()) {
-      VLOG(10) << "Set Variable " << out_var << " as LOD_TENSOR_ARRAY";
-      block->FindRecursiveOrCreateVar(out_var)->SetType(
-          framework::VarDesc::LOD_TENSOR_ARRAY);
-    }
+    auto x_name = op_desc.Input("X")[0];
+    auto out_name = op_desc.Output("Out")[0];
+    VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
+    auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name),
+                            "Cannot found %s", out_name);
+    out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
+    auto &x =
+        detail::Ref(block->FindVarRecursive(x_name), "Cannot found %s", x_name);
+    out.SetDataType(x.GetDataType());
   }
 };
 
@@ -107,11 +113,11 @@ class ReadFromArrayOp : public ArrayOp {
     auto &x_array = x->Get<framework::LoDTensorArray>();
     auto *out = scope.FindVar(Output("Out"));
     PADDLE_ENFORCE(out != nullptr, "Out must be set");
-    auto *out_tesnor = out->GetMutable<framework::LoDTensor>();
+    auto *out_tensor = out->GetMutable<framework::LoDTensor>();
     size_t offset = GetOffset(scope, dev_ctx);
     PADDLE_ENFORCE_LT(offset, x_array.size());
-    out_tesnor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx);
-    out_tesnor->set_lod(x_array[offset].lod());
+    out_tensor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx);
+    out_tensor->set_lod(x_array[offset].lod());
   }
 };
 
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index 4ca6c8507a48507fd29a9c9acae2bdf36ed936ee..dcc59f5ff2ae3a8ca999d72a20cfd5c759987d89 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -14,8 +14,10 @@
 
 #include <vector>
 #include "paddle/framework/executor.h"
+#include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
+#include "paddle/operators/detail/safe_ref.h"
 
 namespace paddle {
 namespace operators {
@@ -26,8 +28,9 @@ using LoDTensor = framework::LoDTensor;
 constexpr char kStepBlock[] = "step_block";
 constexpr char kCondition[] = "Condition";
 constexpr char kStepScopes[] = "StepScopes";
-constexpr char kParamGrads[] = "X@Grad";
 constexpr char kParameters[] = "X";
+constexpr char kParamGrads[] = "X@GRAD";
+constexpr char kOutputs[] = "Out";
 
 class WhileOp : public framework::OperatorBase {
  public:
@@ -71,9 +74,9 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
         kCondition,
         "(Bool) An scalar. When it's False, the While Op will be terminated.")
         .AsDuplicable();
-    AddOutput("Out",
+    AddOutput(kOutputs,
               "A set of variables, which will be assigned with values "
-              "generated by perators inside the block of While Op.")
+              "generated by the operators inside the block of While Op.")
         .AsDuplicable();
     AddOutput(kStepScopes,
               "(StepScopeVar) A vector of local scope, which size equals the "
@@ -104,17 +107,64 @@ class WhileGradOp : public framework::OperatorBase {
     auto *step_scopes =
         scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>();
 
+    auto outside_og_names = Inputs(framework::GradVarName(kOutputs));
+    auto inside_og_names =
+        Attr<std::vector<std::string>>("original_output_grad");
+
+    PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size());
+
     for (auto cur_scope_iter = step_scopes->rbegin();
          cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) {
+      VLOG(3) << "Start backward at time_step "
+              << cur_scope_iter - step_scopes->rbegin();
+      framework::Scope &cur_scope = **cur_scope_iter;
+      // Link output gradients (OG) from the outer scope into the step scope
+      for (size_t i = 0; i < outside_og_names.size(); ++i) {
+        auto outside_og_name = outside_og_names[i];
+        auto inside_og_name = inside_og_names[i];
+        VLOG(10) << "Linking outside " << outside_og_name << " --> inside "
+                 << inside_og_name;
+        auto &og_outside = detail::Ref(scope.FindVar(outside_og_name));
+        auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name));
+        if (og_outside.Type().hash_code() ==
+            typeid(framework::LoDTensor).hash_code()) {
+          auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
+          auto &inside_tensor =
+              detail::Ref(og_inside.GetMutable<framework::LoDTensor>());
+          inside_tensor.set_lod(outside_tensor.lod());
+          inside_tensor.ShareDataWith(outside_tensor);
+        } else if (og_outside.Type().hash_code() ==
+                   typeid(framework::LoDTensorArray).hash_code()) {
+          auto &outside_array = og_outside.Get<framework::LoDTensorArray>();
+          auto &inside_array =
+              detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
+          VLOG(10) << outside_og_name << " size = " << outside_array.size();
+          inside_array.resize(outside_array.size());
+
+          for (size_t j = 0; j < inside_array.size(); ++j) {
+            VLOG(10) << j << " " << outside_array[j].numel();
+            if (outside_array[j].numel() != 0) {
+              inside_array[j].set_lod(outside_array[j].lod());
+              inside_array[j].ShareDataWith(outside_array[j]);
+            } else {
+              PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0);
+            }
+          }
+        }
+      }
+
       executor.Run(*program, *cur_scope_iter, block->ID(), false);
 
       auto &pg_names = Outputs(kParamGrads);
       auto &p_names = Inputs(kParameters);
       PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
-      for (size_t prog_id = 0; prog_id < pg_names.size(); ++prog_id) {
-        auto inside_grad_name = framework::GradVarName(p_names[prog_id]);
+      for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
+        if (pg_names[param_id] == framework::kEmptyVarName) {
+          continue;  // iterator doesn't have gradient
+        }
+        auto inside_grad_name = framework::GradVarName(p_names[param_id]);
 
-        //  // TODO(tonyyang-savil: Not sure we need the following
+        //  // TODO(tonyyang-svail): Not sure we need the following
         //  // If does not compute gradient of that variable inside rnn,
         //  just
         //  // continue
@@ -126,7 +176,7 @@ class WhileGradOp : public framework::OperatorBase {
         // zero gradient variable in step 0
         if (cur_scope_iter == step_scopes->rbegin()) {
           auto *var = (*cur_scope_iter)->FindVar(inside_grad_name);
-          PADDLE_ENFORCE_NOT_NULL(var);
+          PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name);
           if (var->IsType<LoDTensor>()) {
             auto &inside_tensor = var->Get<framework::LoDTensor>();
             framework::AttributeMap attrs;
@@ -135,27 +185,18 @@ class WhileGradOp : public framework::OperatorBase {
             attrs["value"] = 0.0f;
 
             auto zero_op = framework::OpRegistry::CreateOp(
-                "fill_constant", {}, {{"Out", {pg_names[prog_id]}}}, attrs);
+                "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs);
             zero_op->Run(scope, dev_ctx);
           }
         }
 
         // sum gradient
-        auto *outside_var = scope.FindVar(pg_names[prog_id]);
-        PADDLE_ENFORCE_NOT_NULL(outside_var);
-        auto &outside_tensor = *outside_var->GetMutable<framework::LoDTensor>();
-
-        std::string result_var_name;
-        auto *local_result_var = (*cur_scope_iter)->Var(&result_var_name);
-        auto &local_result_tensor =
-            *local_result_var->GetMutable<framework::LoDTensor>();
-
-        local_result_tensor.ShareDataWith(outside_tensor);
-
+        auto new_inside_name = cur_scope.Rename(inside_grad_name);
         auto sum_op = framework::OpRegistry::CreateOp(
-            "sum", {{"X", {result_var_name, inside_grad_name}}},
-            {{"Out", {result_var_name}}}, {});
-        sum_op->Run(**cur_scope_iter, dev_ctx);
+            "sum", {{"X", {pg_names[param_id], new_inside_name}}},
+            {{"Out", {pg_names[param_id]}}}, {});
+        sum_op->Run(cur_scope, dev_ctx);
+        cur_scope.Rename(new_inside_name, inside_grad_name);
       }
     }
   }
@@ -169,29 +210,110 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
   virtual std::unique_ptr<framework::OpDescBind> Apply() const {
     auto *grad = new framework::OpDescBind();
     grad->SetType("while_grad");
-    for (auto &input_param : this->InputNames()) {
-      grad->SetInput(input_param, this->Input(input_param));
-      grad->SetOutput(framework::GradVarName(input_param),
-                      this->InputGrad(input_param));
+    grad->SetInput(kParameters, Input(kParameters));
+    grad->SetOutput(
+        framework::GradVarName(kParameters),
+        InputGrad(kParameters, /*do not drop empty gradient*/ false));
+    grad->SetInput(kOutputs, Output(kOutputs));
+
+    // Output gradients (OG) are re-derived from the step block, since many
+    // outputs of the while op do not need gradients.
+    std::unordered_set<std::string> block_ins;
+    {
+      for (auto &p : Input(kParameters)) {
+        block_ins.insert(p);
+      }
+      for (auto &o : Output(kOutputs)) {
+        block_ins.insert(o);
+      }
     }
+    std::unordered_set<std::string> extra_inputs;
+    for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
+      for (auto &input_name : grad_block_[0]->Op(i)->InputArgumentNames()) {
+        if (block_ins.find(input_name) != block_ins.end()) {
+          continue;
+        }
+        extra_inputs.insert(input_name);
+      }
 
-    for (auto &output_param : this->OutputNames()) {
-      grad->SetInput(output_param, this->Output(output_param));
-      if (output_param != kStepScopes) {
-        grad->SetInput(framework::GradVarName(output_param),
-                       this->OutputGrad(output_param));
+      for (auto &output_name : grad_block_[0]->Op(i)->OutputArgumentNames()) {
+        block_ins.insert(output_name);
       }
     }
+
+    // Flatten the set into a list: SetInput and SetAttr below take a vector
+    // of names.
+    std::vector<std::string> extra_inputs_list(extra_inputs.begin(),
+                                               extra_inputs.end());
+    grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list);
+    grad->SetInput(kStepScopes, Output(kStepScopes));
     grad->SetAttrMap(this->Attrs());
     grad->SetBlockAttr(kStepBlock, *grad_block_[0]);
+    // record the original output gradient names, since the gradient name of
+    // while operator could be renamed.
+    grad->SetAttr("original_output_grad", extra_inputs_list);
 
     return std::unique_ptr<framework::OpDescBind>(grad);
   }
 };
 
+// Gives each parameter-gradient var the type and data type of its parameter.
+class WhileGradOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDescBind &op_desc,
+                  framework::BlockDescBind *block) const override {
+    auto p_names = op_desc.Input(kParameters);
+    auto pg_names = op_desc.Output(framework::GradVarName(kParameters));
+    PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
+    for (size_t i = 0; i < p_names.size(); ++i) {
+      auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
+      auto *g_var = block->FindVarRecursive(pg_names[i]);
+      if (g_var == nullptr) continue;  // Gradient could be @EMPTY@
+      VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i]
+              << " type: " << p_var.GetType();
+      g_var->SetType(p_var.GetType());
+      g_var->SetDataType(p_var.GetDataType());
+    }
+  }
+};
+
+// Gives each non-empty parameter gradient the dims of its parameter.
+class WhileGradOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    // NOTE(review): these return values are discarded, so nothing is
+    // actually validated here. Confirm the intent before enforcing them.
+    ctx->HasInputs(kParameters);
+    ctx->HasOutputs(framework::GradVarName(kParameters));
+    ctx->HasInputs(kOutputs);
+    ctx->HasInputs(framework::GradVarName(kOutputs));
+    auto p_names = ctx->Inputs(kParameters);
+    auto pg_names = ctx->Outputs(kParamGrads);
+    auto dims = ctx->GetInputsDim(kParameters);
+    auto var_types = ctx->GetInputsVarType(kParameters);
+    PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
+    std::vector<std::string> names_to_set;
+    std::vector<framework::DDim> dims_to_set;
+    for (size_t i = 0; i < p_names.size(); ++i) {
+      if (pg_names[i] == framework::kEmptyVarName) continue;  // no gradient
+      // LOD_TENSOR_ARRAY reuses the parameter's dims as well (the correct
+      // dims for an array are an open question); other types are skipped.
+      if (var_types[i] == framework::VarDesc::LOD_TENSOR ||
+          var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) {
+        names_to_set.push_back(pg_names[i]);
+        dims_to_set.push_back(dims[i]);
+      }
+    }
+    ctx->SetDims(names_to_set, dims_to_set);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 REGISTER_OPERATOR(while, paddle::operators::WhileOp,
                   paddle::operators::WhileOpMaker,
                   paddle::operators::WhileGradOpDescMaker);
+REGISTER_OPERATOR(while_grad, paddle::operators::WhileGradOp,
+                  paddle::operators::WhileGradOpShapeInference,
+                  paddle::operators::WhileGradOpVarTypeInference);
diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp
index 8b3be062b654a52e667626199be8c8bb4a2a96d7..1898598e49652a2829e57329bab6017304cec662 100644
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
@@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate,
                   const real* grad,
                   real* momentumVec) {
   decayRate *= learningRate;
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_USE_MKLML
 #pragma omp parallel for
 #endif
   for (size_t i = 0; i < size; ++i) {
diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h
index ce3421a3cb840e4c1e872eea12dedc1150c85962..dd48605b9ed688e4656d4cd1ddf1f298d0a50a9e 100644
--- a/paddle/platform/cudnn_helper.h
+++ b/paddle/platform/cudnn_helper.h
@@ -63,9 +63,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
     }                                                             \
   } while (false)
 
-enum class DataLayout {
+enum class DataLayout {  // Not used
   kNHWC,
   kNCHW,
+  kNCDHW,
   kNCHW_VECT_C,
 };
 
@@ -107,12 +108,15 @@ class CudnnDataType<double> {
   }
 };
 
-inline cudnnTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) {
+inline cudnnTensorFormat_t GetCudnnTensorFormat(
+    const DataLayout& order) {  // Not used
   switch (order) {
     case DataLayout::kNHWC:
       return CUDNN_TENSOR_NHWC;
     case DataLayout::kNCHW:
       return CUDNN_TENSOR_NCHW;
+    case DataLayout::kNCDHW:
+      return CUDNN_TENSOR_NCHW;  // TODO(chengduoZH) : add CUDNN_TENSOR_NCDHW
     default:
       PADDLE_THROW("Unknown cudnn equivalent for order");
   }
@@ -139,7 +143,7 @@ class ScopedTensorDescriptor {
       strides[i] = dims[i + 1] * strides[i + 1];
     }
     // Update tensor descriptor dims setting if groups > 1
-    // FIXME(typhoonzero): Assume using NCHW order
+    // FIXME(typhoonzero): Assume using NCHW or NCDHW order
     std::vector<int> dims_with_group(dims.begin(), dims.end());  // copy
     if (groups > 1) {
       dims_with_group[1] = dims_with_group[1] / groups;
@@ -176,9 +180,10 @@ class ScopedFilterDescriptor {
                                             const cudnnDataType_t type,
                                             const std::vector<int>& kernel,
                                             const int groups = 1) {
-    // filter layout: MCHW, where M is the number of
+    // filter layout: MCHW(MCDHW), where M is the number of
     // output image channels, C is the number of input image channels,
-    // H and W is height and width of filter.
+    // D is the depth of the filter, H is the height of the filter, and W is the
+    // width of the filter.
     std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
     if (groups > 1) {
       // M /= groups
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
index b5fd68839ddb62e76f2fd930248d546bc093a892..f3a6f1dba7588c6b29c1dcae26ec134c1a7f937d 100644
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -57,8 +57,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 | `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
 | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
 | `WITH_TESTING` | ON | Build unit tests binaries. |
-| `WITH_MKLDNN` | ON | Build with [Intel® MKL DNN](https://github.com/01org/mkl-dnn) support. |
-| `WITH_MKLML` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) support. |
+| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
 | `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
 | `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
 | `WITH_C_API` | OFF | Build capi libraries for inference. |
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index e9c89eee1af1fcc4a7f168af5ec8b16912616687..595d25fd4830b6e69b9a1080803771b0464741db 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -34,8 +34,7 @@ function cmake_gen() {
         ${PYTHON_FLAGS}
         -DWITH_DOC=OFF
         -DWITH_GPU=${WITH_GPU:-OFF}
-        -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
-        -DWITH_MKLML=${WITH_MKLML:-ON}
+        -DWITH_MKL=${WITH_MKL:-ON}
         -DWITH_AVX=${WITH_AVX:-OFF}
         -DWITH_GOLANG=${WITH_GOLANG:-ON}
         -DWITH_SWIG_PY=ON
@@ -56,8 +55,7 @@ EOF
         ${PYTHON_FLAGS} \
         -DWITH_DOC=OFF \
         -DWITH_GPU=${WITH_GPU:-OFF} \
-        -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
-        -DWITH_MKLML=${WITH_MKLML:-ON} \
+        -DWITH_MKL=${WITH_MKL:-ON} \
         -DWITH_AVX=${WITH_AVX:-OFF} \
         -DWITH_GOLANG=${WITH_GOLANG:-ON} \
         -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index b9a49526a7e02131767a4e9b26cd0b53278176d0..d71cb84df3785008ea5793519fc26a174e1b95f7 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -18,8 +18,8 @@ function version(){
         echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
         echo "    with_avx: @WITH_AVX@"
         echo "    with_gpu: @WITH_GPU@"
+        echo "    with_mkl: @WITH_MKL@"
         echo "    with_mkldnn: @WITH_MKLDNN@"
-        echo "    with_mklml: @WITH_MKLML@"
         echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
         echo "    with_rdma: @WITH_RDMA@"
@@ -45,8 +45,8 @@ function ver2num() {
 
 function cpu_config() {
   # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status
-  # only when MKLDNN or MKLML enabled
-  if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF"]; then
+  # only when MKL is enabled
+  if [ "@WITH_MKL@" == "OFF" ]; then
     return 0
   fi
   ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
@@ -70,8 +70,8 @@ function cpu_config() {
 function threads_config() {
   # auto set OMP_NUM_THREADS and MKL_NUM_THREADS
   # according to trainer_count and total processors
-  # only when MKLDNN or MKLML enabled
-  if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF"]; then
+  # only when MKL is enabled
+  if [ "@WITH_MKL@" == "OFF" ]; then
     return 0
   fi
   processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index 973b2736e5ce2b733d52df4f5a270b296bca2cac..28d82343ed32273740d0c52d0451681e43b3675e 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -6,7 +6,7 @@ mkdir -p $TRAVIS_BUILD_DIR/build
 cd $TRAVIS_BUILD_DIR/build
 
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 make -j `nproc` gen_proto_py
 make -j `nproc` paddle_docs paddle_docs_cn
 
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index b68e29cd5ea223272151e7a8b52d998832f47103..88e684849df6fbfe4042b92bdb76ef98159eecea 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -137,6 +137,10 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
     }
   }
 
+  if (FLAGS_use_mkldnn) {
+    CHECK_EQ(FLAGS_trainer_count, 1UL) << "MKLDNN only need 1 trainer";
+  }
+
   if (testing) {
     LOG(INFO) << "trainer: in testing mode";
     if (config_->getOptConfig().use_sparse_remote_updater() ||
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index f01ad4142d4fe7c7f7d7aac60d967ea114b93e56..80665551ec51214d90b866f0c7b2abb2fdee5f39 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -28,35 +28,7 @@ if(WITH_PYTHON)
           ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
       WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
-################ test_CompareTwoNets ######################
-add_unittest_without_exec(test_CompareTwoNets
-    test_CompareTwoNets.cpp)
-add_test(NAME test_CompareTwoNets
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
-            --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 
-############### test_CompareTwoOpts ###################
-add_unittest_without_exec(test_CompareTwoOpts
-    test_CompareTwoOpts.cpp)
-add_test(NAME test_CompareTwoOpts
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts
-            --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf
-            --num_passes=1 --need_high_accuracy=0
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-
-################# test_CompareSparse ##################
-add_unittest_without_exec(test_CompareSparse
-    test_CompareSparse.cpp)
-if(NOT ON_TRAVIS)
-  add_test(NAME test_CompareSparse
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-              ./.set_port.sh -p port -n 6
-                  ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-endif()
 ################# test_recurrent_machine_generation ###############
 add_unittest_without_exec(test_recurrent_machine_generation
     test_recurrent_machine_generation.cpp)
diff --git a/paddle/trainer/tests/mnist.list b/paddle/trainer/tests/mnist.list
deleted file mode 100644
index 703e87753d5a4f507aad11a6d875cea44787667b..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/mnist.list
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/mnist_bin_part
diff --git a/paddle/trainer/tests/mnist_bin_part b/paddle/trainer/tests/mnist_bin_part
deleted file mode 100644
index 08b93a0ebb5698bdafbc36c3c757918a50bab621..0000000000000000000000000000000000000000
Binary files a/paddle/trainer/tests/mnist_bin_part and /dev/null differ
diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
deleted file mode 100644
index f189b21e86a50d70d317b5e43aa2d6e05af5e774..0000000000000000000000000000000000000000
Binary files a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data and /dev/null differ
diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist
deleted file mode 100644
index 6b406dff0ba91b5f310d7eafa111c0d21d6542c3..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist
+++ /dev/null
@@ -1 +0,0 @@
-./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
diff --git a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf b/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf
deleted file mode 100644
index 92f32a18c0068ab4672034a270aa8c52f2716d59..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf
+++ /dev/null
@@ -1,154 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-# Note: when making change to this file, please make sure
-# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-
-default_initial_std(0.1)
-default_device(0)
-
-word_dim = 999
-l1 = 0
-l2 = 0
-
-model_type("nn")
-
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-TrainData(ProtoData(        
-            type = "proto_sequence",
-            files = ('trainer/tests/train_sparse.list'), 
-            ))
-
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-
-
-wordvec_dim = 32
-layer2_dim = 16
-layer3_dim = 16
-hidden_dim = 32
-
-slot_names = ["qb", "qw", "tb", "tw"]
-
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        Layer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            type = "recurrent",
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            inputs = Input(slot_names[i] + "_embedding_" + network_name,
-                           parameter_name = "rnn1.w0")
-        )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-
-
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
diff --git a/paddle/trainer/tests/sample_trainer_config_opt_a.conf b/paddle/trainer/tests/sample_trainer_config_opt_a.conf
deleted file mode 100644
index b1744db8d604c88ec47e7104f79b38bb9d0e4442..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config_opt_a.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-################################### Data Configuration ###################################
-TrainData(ProtoData(files = "trainer/tests/mnist.list"))
-################################### Algorithm Configuration ###################################
-settings(batch_size = 1000,
-         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
-################################### Network Configuration ###################################
-data = data_layer(name ="input", size=784)
-
-fc1 = fc_layer(input=data, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=fc1, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-output = fc_layer(input=[fc1, fc2], size=10,
-                  bias_attr=True,
-                  act=SoftmaxActivation())
-
-lbl = data_layer(name ="label", size=1)
-
-cost = classification_cost(input=output, label=lbl)
-outputs(cost)
diff --git a/paddle/trainer/tests/sample_trainer_config_opt_b.conf b/paddle/trainer/tests/sample_trainer_config_opt_b.conf
deleted file mode 100644
index b1744db8d604c88ec47e7104f79b38bb9d0e4442..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config_opt_b.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-################################### Data Configuration ###################################
-TrainData(ProtoData(files = "trainer/tests/mnist.list"))
-################################### Algorithm Configuration ###################################
-settings(batch_size = 1000,
-         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
-################################### Network Configuration ###################################
-data = data_layer(name ="input", size=784)
-
-fc1 = fc_layer(input=data, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=fc1, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-output = fc_layer(input=[fc1, fc2], size=10,
-                  bias_attr=True,
-                  act=SoftmaxActivation())
-
-lbl = data_layer(name ="label", size=1)
-
-cost = classification_cost(input=output, label=lbl)
-outputs(cost)
diff --git a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf b/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
deleted file mode 100644
index d19222360c2f424ddb306b155dfef07921098a6b..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
+++ /dev/null
@@ -1,154 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-# Note: when making change to this file, please make sure
-# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-
-default_initial_std(0.1)
-default_device(0)
-
-word_dim = 1451594
-l1 = 0
-l2 = 0
-
-model_type("nn")
-
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-TrainData(ProtoData(        
-            type = "proto_sequence",
-            files = ('trainer/tests/train.list'), 
-            ))
-
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-
-
-wordvec_dim = 128
-layer2_dim = 96
-layer3_dim = 96
-hidden_dim = 128
-
-slot_names = ["qb", "qw", "tb", "tw"]
-
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        Layer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            type = "recurrent",
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            inputs = Input(slot_names[i] + "_embedding_" + network_name,
-                           parameter_name = "rnn1.w0")
-        )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-
-
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
diff --git a/paddle/trainer/tests/sample_trainer_config_rnn.conf b/paddle/trainer/tests/sample_trainer_config_rnn.conf
deleted file mode 100644
index b720d4d5a6ca59e207832a8c5410c2cb6074c439..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config_rnn.conf
+++ /dev/null
@@ -1,180 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-# Note: when making change to this file, please make sure
-# sample_trainer_config_qb_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-
-default_initial_std(0.1)
-default_device(0)
-
-word_dim = 1451594
-l1 = 0
-l2 = 0
-
-model_type("recurrent_nn")
-
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-TrainData(ProtoData(
-            type = "proto_sequence",
-            files = ('trainer/tests/train.list'), 
-            ))
-
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-
-
-wordvec_dim = 128
-layer2_dim = 96
-layer3_dim = 96
-hidden_dim = 128
-
-slot_names = ["qb", "qw", "tb", "tw"]
-
-def SimpleRecurrentLayer(name, 
-                         size, 
-                         active_type, 
-                         bias, 
-                         input_layer_name, 
-                         parameter_name,
-                         seq_reversed = False):
-    RecurrentLayerGroupBegin(name + "_layer_group", 
-                             in_links=[input_layer_name], 
-                             out_links=[name],
-                             seq_reversed=seq_reversed)
-    memory_name = Memory(name=name, size=size)
-    Layer(
-        name = name,
-        type = "mixed",
-        size = size,
-        active_type = active_type,
-        bias = bias,
-        inputs = [IdentityProjection(input_layer_name),
-                  FullMatrixProjection(memory_name,
-                                       parameter_name = parameter_name,
-                                       ),
-                  ]
-        )
-    RecurrentLayerGroupEnd(name + "_layer_group")
-
-
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        SimpleRecurrentLayer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            size = hidden_dim,
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            input_layer_name = slot_names[i] + "_embedding_" + network_name,
-            parameter_name = "rnn1.w0",
-            )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-
-
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
diff --git a/paddle/trainer/tests/testPyDataWrapper.py b/paddle/trainer/tests/testPyDataWrapper.py
index 2c29a274339747b78fbd6c27ae4070f0abbd4028..a76eeeacb91cdba305d2f71c6292f79e4b98dd73 100644
--- a/paddle/trainer/tests/testPyDataWrapper.py
+++ b/paddle/trainer/tests/testPyDataWrapper.py
@@ -20,28 +20,6 @@ import random
 import json
 import string
 
-
-@provider(slots=[
-    SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1),
-    IndexSlot(3)
-])
-def processNonSequenceData(obj, filename):
-    with open(filename, "rb") as f:
-        for line in f:
-            slots_str = line.split(';')
-            index = int(slots_str[0])
-            non_values = map(int, slots_str[1].split()[1:])
-            dense = map(float, slots_str[2].split()[1:])
-            strs = slots_str[4].strip().split(' ', 1)[1]
-
-            def __values_mapper__(s):
-                s = s.split(":")
-                return int(s[0]), float(s[1])
-
-            values = map(__values_mapper__, slots_str[3].split()[1:])
-            yield [non_values, dense, values, strs, index]
-
-
 SPARSE_ID_LIMIT = 1000
 SPARSE_ID_COUNT = 100
 SEQUENCE_LIMIT = 50
@@ -146,8 +124,6 @@ def processSubSeqAndGenerateData(obj, name):
 
 
 if __name__ == "__main__":
-    pvd = processNonSequenceData("test.txt")
-    print pvd.getNextBatch(100)
     pvd = processSeqAndGenerateData("_")
     print pvd.getNextBatch(100)
     pvd = processSubSeqAndGenerateData("_")
diff --git a/paddle/trainer/tests/test_CompareTwoOpts.cpp b/paddle/trainer/tests/test_CompareTwoOpts.cpp
deleted file mode 100644
index 383505f8131264844069d6f0fa13f4e0ac1f97af..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/test_CompareTwoOpts.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-
-#include "paddle/trainer/Trainer.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-
-DECLARE_bool(local);
-DECLARE_bool(use_gpu);
-
-DECLARE_string(config);
-DECLARE_string(nics);
-
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
-DEFINE_bool(need_high_accuracy,
-            true,
-            "whether need to run in double accuracy (recommended)");
-DEFINE_double(
-    max_diff_ratio,
-    0.0f,
-    "max diff ratio allowed for outputs and parameters (value/gradient)");
-
-struct ComData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-void calcGradient(ComData& data, const string configFile) {
-  FLAGS_config = configFile;
-
-  FLAGS_local = true;
-  FLAGS_use_gpu = false;
-
-  FLAGS_nics = "";
-
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig(), false);
-
-  data.parameters = trainer.getGradientMachine()->getParameters();
-  trainer.getDataProvider()->setSkipShuffle();
-  trainer.train();
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 size_t width = 1) {
-  int nNum = 0;
-  for (size_t i = 0; i < len; ++i) {
-    real diff = fabs(A[i] - B[i]);
-    if (diff > 0.0f &&
-        diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_max_diff_ratio) {
-      nNum++;
-      LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i]
-                << "    " << desB << " : " << B[i];
-    }
-  }
-  EXPECT_EQ(0, nNum);
-  LOG(INFO) << "\n\n";
-}
-
-void compareGradient(ComData& comDataA, ComData& comDataB) {
-  vector<Argument> outArgsA = comDataA.outArgs;
-  vector<Argument> outArgsB = comDataB.outArgs;
-
-  for (size_t i = 0; i < outArgsA.size(); ++i) {
-    CpuMatrix matA(outArgsA[i].value->getHeight(),
-                   outArgsA[i].value->getWidth());
-    CpuMatrix matB(outArgsB[i].value->getHeight(),
-                   outArgsB[i].value->getWidth());
-
-    matA.copyFrom(*outArgsA[i].value);
-    matB.copyFrom(*outArgsB[i].value);
-
-    LOG(INFO) << "\n--------------------------------"
-              << " Check Network Output_" << i << ":"
-              << " -------------------------------------\n";
-    checkBuffer(matA.getData(),
-                "network A output",
-                matB.getData(),
-                "network B output",
-                matA.getElementCnt(),
-                matA.getWidth());
-  }
-
-  vector<ParameterPtr>& parametersA = comDataA.parameters;
-  vector<ParameterPtr>& parametersB = comDataB.parameters;
-
-  LOG(INFO) << "\n\n--------------------------------"
-            << " Check Gradient Machine Parameters:"
-            << " -------------------------------------\n";
-  for (size_t i = 0; i < parametersA.size(); ++i) {
-    ParameterPtr parameterA, parameterB;
-    parameterA = parametersA[i];
-    parameterB = parametersB[i];
-
-    CpuVector paraA(parameterA->getSize());
-    CpuVector paraB(parameterB->getSize());
-    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
-    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
-              << " ; size : " << paraA.getSize() << " ------------";
-    checkBuffer(paraA.getData(),
-                "Network A",
-                paraB.getData(),
-                "Network B",
-                paraA.getSize());
-
-    CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT));
-    CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName()
-              << " ; size : " << gradA.getSize() << " -----------";
-    checkBuffer(gradA.getData(),
-                "Network A",
-                gradB.getData(),
-                "Network B",
-                gradA.getSize());
-  }
-}
-
-TEST(Trainer, create) {
-  ComData dataA;
-  calcGradient(dataA, FLAGS_config_file_a);
-  LOG(INFO) << "\n\ntraining of Network A is finished\n\n";
-
-  ComData dataB;
-  calcGradient(dataB, FLAGS_config_file_b);
-  LOG(INFO) << "\n\ntraining of the Network B is finished\n\n";
-
-  compareGradient(dataA, dataB);
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  if (FLAGS_need_high_accuracy) {
-    LOG(INFO) << "skip test due to it's need high accuracy";
-    return 0;
-  }
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 2e-4;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in low accuracy mode";
-  }
-#else
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 2e-7;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in high accuracy mode";
-  }
-#endif
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
index 66ec65e340a435a7260028611828fb28845e0728..92dc8aa9ec5ce281d1950d84260c1b9555e686a7 100644
--- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
+++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
@@ -25,45 +25,9 @@ limitations under the License. */
 #include <unordered_set>
 #include "picojson.h"
 
-void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual);
 void checkValue(std::vector<paddle::Argument>& arguments, picojson::array& arr);
 const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/";
 
-TEST(PyDataProviderWrapper, NoSequenceData) {
-  paddle::DataConfig conf;
-  conf.set_type("py");
-  conf.set_load_data_module(std::string("testPyDataWrapper"));
-  conf.set_load_data_object(std::string("processNonSequenceData"));
-  conf.set_async_load_data(false);
-  conf.clear_files();
-  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
-  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromPy;
-  provider->getNextBatch(100, &batchFromPy);
-
-  paddle::DataConfig conf2;
-  conf2.set_type("proto");
-  conf2.set_async_load_data(false);
-  conf2.clear_files();
-  conf2.set_files(kDir + "test_pydata_provider_wrapper.protolist");
-
-  provider.reset(paddle::DataProvider::create(conf2, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromProto;
-  provider->getNextBatch(100, &batchFromProto);
-
-  std::vector<paddle::Argument>& pyArguments = batchFromPy.getStreams();
-  std::vector<paddle::Argument>& protoArguments = batchFromProto.getStreams();
-  EXPECT_EQ(pyArguments.size(), protoArguments.size());
-
-  for (size_t i = 0; i < pyArguments.size(); ++i) {
-    checkEqual(protoArguments[i], pyArguments[i]);
-  }
-}
-
 TEST(PyDataProviderWrapper, SequenceData) {
   paddle::DataConfig conf;
   conf.set_type("py");
@@ -148,66 +112,6 @@ int main(int argc, char** argv) {
   return RUN_ALL_TESTS();
 }
 
-void checkEqual(const paddle::Argument& expect,
-                const paddle::Argument& actual) {
-  if (expect.value) {
-    EXPECT_TRUE(actual.value != nullptr);
-    paddle::Matrix* e = expect.value.get();
-    paddle::Matrix* a = actual.value.get();
-    EXPECT_EQ(e->getWidth(), a->getWidth());
-    EXPECT_EQ(e->getHeight(), a->getHeight());
-    if (dynamic_cast<paddle::CpuSparseMatrix*>(e)) {
-      paddle::CpuSparseMatrix* se = dynamic_cast<paddle::CpuSparseMatrix*>(e);
-      paddle::CpuSparseMatrix* sa = dynamic_cast<paddle::CpuSparseMatrix*>(a);
-      EXPECT_EQ(se->getFormat(), sa->getFormat());
-      EXPECT_EQ(se->getElementCnt(), sa->getElementCnt());
-      size_t rowSize = se->getFormat() == paddle::SPARSE_CSC
-                           ? se->getElementCnt()
-                           : se->getHeight() + 1;
-      size_t colSize = se->getFormat() == paddle::SPARSE_CSC
-                           ? se->getWidth() + 1
-                           : se->getElementCnt();
-      for (size_t i = 0; i < rowSize; ++i) {
-        EXPECT_EQ(se->getRows()[i], sa->getRows()[i]);
-      }
-      for (size_t i = 0; i < colSize; ++i) {
-        EXPECT_EQ(se->getCols()[i], sa->getCols()[i]);
-      }
-      if (se->getValueType() == paddle::FLOAT_VALUE) {
-        EXPECT_EQ(paddle::FLOAT_VALUE, sa->getValueType());
-        for (size_t i = 0; i < se->getElementCnt(); ++i) {
-          EXPECT_EQ(se->getValue()[i], sa->getValue()[i]);
-        }
-      }
-    } else if (dynamic_cast<paddle::CpuMatrix*>(e)) {
-      EXPECT_EQ(e->getElementCnt(), a->getElementCnt());
-      for (size_t i = 0; i < e->getElementCnt(); ++i) {
-        EXPECT_EQ(e->getData()[i], a->getData()[i]);
-      }
-    }
-  }
-
-  if (expect.ids) {
-    EXPECT_TRUE(actual.ids != nullptr);
-    paddle::VectorT<int>* e = expect.ids.get();
-    paddle::VectorT<int>* a = actual.ids.get();
-    EXPECT_EQ(e->getSize(), a->getSize());
-    for (size_t i = 0; i < e->getSize(); ++i) {
-      EXPECT_EQ(e->getData()[i], a->getData()[i]);
-    }
-  }
-
-  if (expect.strs) {
-    EXPECT_TRUE(actual.strs != nullptr);
-    std::vector<std::string>* e = expect.strs.get();
-    std::vector<std::string>* a = actual.strs.get();
-    EXPECT_EQ(e->size(), a->size());
-    for (size_t i = 0; i < e->size(); ++i) {
-      EXPECT_EQ((*e)[i], (*a)[i]);
-    }
-  }
-}
-
 void checkValue(std::vector<paddle::Argument>& arguments,
                 picojson::array& arr) {
   // CHECK SLOT 0, Sparse Value.
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 5bd68e211ac1c8e05f40dc3ca37eef99f32af47f..d6128dd7692a2faebf453d239744c4893d84e369 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1826,7 +1826,7 @@ class FCLayer(LayerBase):
             self.layer_type = 'mkldnn_fc'
             config_assert(
                 len(inputs) == 1,
-                "MkldnnFCLayer support one and only one input!")
+                "MKLDNNFCLayer support one and only one input!")
         super(FCLayer, self).__init__(
             name, self.layer_type, size, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
@@ -1837,7 +1837,7 @@ class FCLayer(LayerBase):
             sparse = format == "csr" or format == "csc"
             if use_mkldnn:
                 config_assert(not sparse,
-                              "MkldnnFCLayer do not support sparse format yet")
+                              "MKLDNNFCLayer do not support sparse format yet")
                 if use_mkldnn_wgt:
                     dims = [self.config.size, input_layer.size]
             if sparse:
@@ -1853,7 +1853,7 @@ class FCLayer(LayerBase):
 
 
 @config_layer('mkldnn_fc')
-class MkldnnFcLayer(FCLayer):
+class MKLDNNFcLayer(FCLayer):
     layer_type = 'mkldnn_fc'
 
 
@@ -3209,6 +3209,18 @@ class SubNestedSequenceLayer(LayerBase):
         self.set_layer_size(size)
 
 
+@config_layer('dot_prod')
+class DotProdLayer(LayerBase):
+    def __init__(self, name, inputs, device=None):
+        super(DotProdLayer, self).__init__(
+            name, 'dot_prod', 0, inputs, device=device)
+        config_assert(len(inputs) == 2, 'DotProdLayer must have 2 inputs.')
+        config_assert(
+            self.get_input_layer(0).size == self.get_input_layer(1).size,
+            "Two inputs should have the same size.")
+        self.set_layer_size(1)
+
+
 @config_layer('out_prod')
 class OuterProdLayer(LayerBase):
     def __init__(self, name, inputs, device=None):
@@ -3506,11 +3518,17 @@ def ExpressionLayer(name, inputs, **xargs):
 
 @config_layer('concat')
 class ConcatenateLayer(LayerBase):
+    layer_type = 'concat'
+
     def __init__(self, name, inputs, bias=False, **xargs):
         config_assert(inputs, 'inputs cannot be empty')
         config_assert(not bias, 'ConcatenateLayer cannot support bias.')
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        if self.layer_type == "mkldnn_concat":
+            config_assert(use_mkldnn, "mkldnn_concat only support MKLDNN")
+        self.layer_type = 'mkldnn_concat' if use_mkldnn else 'concat'
         super(ConcatenateLayer, self).__init__(
-            name, 'concat', 0, inputs=inputs, **xargs)
+            name, self.layer_type, 0, inputs=inputs, **xargs)
         size = 0
         for input_index in xrange(len(self.inputs)):
             assert self.get_input_layer(0).height == self.get_input_layer(
@@ -3530,6 +3548,11 @@ class ConcatenateLayer(LayerBase):
         self.set_layer_size(size)
 
 
+@config_layer('mkldnn_concat')
+class MKLDNNConcatLayer(ConcatenateLayer):
+    layer_type = 'mkldnn_concat'
+
+
 # like concat layer, but each input layer was processed by a Projection.
 @config_layer('concat2')
 class ConcatenateLayer2(LayerBase):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index a02eba007ddf929ff92df995df253f5a386bac7b..388535d53a9d1d6747ac89cb698f3a1f496b5f7c 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -115,6 +115,7 @@ __all__ = [
     'huber_classification_cost',
     'block_expand_layer',
     'maxout_layer',
+    'dot_prod_layer',
     'out_prod_layer',
     'printer_layer',
     'print_layer',
@@ -197,6 +198,7 @@ class LayerType(object):
     SCALING_LAYER = 'scaling'
     TRANS_LAYER = 'trans'
     ROTATE_LAYER = 'rotate'
+    DOT_PROD_LAYER = 'dot_prod'
     OUT_PROD_LAYER = 'out_prod'
     FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
 
@@ -4140,6 +4142,45 @@ def maxid_layer(input, name=None, layer_attr=None):
         size=l.config.size)
 
 
+@wrap_name_default()
+def dot_prod_layer(input1, input2, name=None, layer_attr=None):
+    """
+    A layer for computing the dot product of two vectors.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        dot_prod = dot_prod_layer(input1=vec1, input2=vec2)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input1: The first input layer.
+    :type input1: LayerOutput
+    :param input2: The second input layer.
+    :type input2: LayerOutput
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute.
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input1, LayerOutput)
+    assert isinstance(input2, LayerOutput)
+    assert input1.size == input2.size, ("Two inputs should have the same size.")
+
+    l = Layer(
+        name=name,
+        type=LayerType.DOT_PROD_LAYER,
+        inputs=[input1.name, input2.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.DOT_PROD_LAYER,
+        parents=[input1, input2],
+        size=l.config.size)
+
+
 @wrap_name_default()
 def out_prod_layer(input1, input2, name=None, layer_attr=None):
     """
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 1c7451e0abf5dc1b99671f292e2ffc2d2282abe9..0b269a1ff76530774b4d23b0867350fd95e081a3 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -10,6 +10,7 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
 test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
 test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
-test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer)
+test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer
+test_dot_prod_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..f1530c382c3d81a82592af2c43c06eb4278e2b4a
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
@@ -0,0 +1,38 @@
+type: "nn"
+layers {
+  name: "vector1"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "vector2"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "__dot_prod_layer_0__"
+  type: "dot_prod"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "vector1"
+  }
+  inputs {
+    input_layer_name: "vector2"
+  }
+}
+input_layer_names: "vector1"
+input_layer_names: "vector2"
+output_layer_names: "__dot_prod_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "vector1"
+  layer_names: "vector2"
+  layer_names: "__dot_prod_layer_0__"
+  input_layer_names: "vector1"
+  input_layer_names: "vector2"
+  output_layer_names: "__dot_prod_layer_0__"
+  is_recurrent_layer_group: false
+}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e52d48dde0084aacd3f7874cc384d59287a0c7d5
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
@@ -0,0 +1,7 @@
+from paddle.trainer_config_helpers import *
+
+vec1 = data_layer(name='vector1', size=10)
+vec2 = data_layer(name='vector2', size=10)
+dot_product = dot_prod_layer(input1=vec1, input2=vec2)
+
+outputs(dot_product)
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index f20567243ae67baecbdbac13f879f4cf2f66d298..acca6ba35ced8674d4eec7dc57e41673c90cf8f8 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -4,7 +4,10 @@ import collections
 import numpy as np
 import copy
 
-__all__ = ['Block', 'Variable', 'Program', 'Operator', 'default_startup_program', 'default_main_program']
+__all__ = [
+    'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
+    'default_main_program'
+]
 
 
 def unique_name(prefix):
@@ -12,9 +15,9 @@ def unique_name(prefix):
     return "_".join([prefix, str(uid)])
 
 
-def _debug_string_(proto):
+def _debug_string_(proto, throw_on_error=True):
     error_fields = list()
-    if not proto.IsInitialized(error_fields):
+    if not proto.IsInitialized(error_fields) and throw_on_error:
         raise ValueError("{0} are not initialized\nThe message is {1}".format(
             error_fields, proto))
     return proto.__str__()
@@ -101,9 +104,12 @@ class Variable(object):
         self.stop_gradient = stop_gradient
 
     def __str__(self):
+        return self.to_string(True)
+
+    def to_string(self, throw_on_error):
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.VarDesc.FromString(str(protostr))
-        return _debug_string_(proto)
+        return _debug_string_(proto, throw_on_error)
 
     __repr__ = __str__
 
@@ -229,17 +235,17 @@ class Operator(object):
                     in_proto.name)
 
                 if found:
-                    in_argus = inputs[in_proto.name]
-                    if not isinstance(in_argus, list):
-                        in_argus = [in_argus]
-                    if not in_proto.duplicable and len(in_argus) > 1:
+                    in_args = inputs[in_proto.name]
+                    if not isinstance(in_args, list):
+                        in_args = [in_args]
+                    if not in_proto.duplicable and len(in_args) > 1:
                         raise ValueError(
                             "Input %s expects only one input, but %d are given."
-                            % (in_proto.name, len(in_argus)))
-                    in_argu_names = []
-                    for argu in in_argus:
-                        in_argu_names.append(argu.name)
-                    self.desc.set_input(in_proto.name, in_argu_names)
+                            % (in_proto.name, len(in_args)))
+                    in_arg_names = []
+                    for arg in in_args:
+                        in_arg_names.append(arg.name)
+                    self.desc.set_input(in_proto.name, in_arg_names)
                 else:
                     self.desc.set_input(in_proto.name, [])
 
@@ -257,18 +263,18 @@ class Operator(object):
                         str(e) for e in given)))
 
             for out_proto in proto.outputs:
-                out_argus = outputs[out_proto.name]
-                if not isinstance(out_argus, list):
-                    out_argus = [out_argus]
-                if not out_proto.duplicable and len(out_argus) > 1:
+                out_args = outputs[out_proto.name]
+                if not isinstance(out_args, list):
+                    out_args = [out_args]
+                if not out_proto.duplicable and len(out_args) > 1:
                     raise ValueError(
                         "Output %s expects only one output, but %d are given." %
-                        (out_proto.name, len(out_argus)))
-                out_argu_names = []
-                for argu in out_argus:
-                    out_argu_names.append(argu.name)
-                    argu.op = self
-                self.desc.set_output(out_proto.name, out_argu_names)
+                        (out_proto.name, len(out_args)))
+                out_arg_names = []
+                for arg in out_args:
+                    out_arg_names.append(arg.name)
+                    arg.op = self
+                self.desc.set_output(out_proto.name, out_arg_names)
 
         if attrs is not None:
             if not isinstance(attrs, dict):
@@ -291,10 +297,13 @@ class Operator(object):
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
 
-    def __str__(self):
+    def to_string(self, throw_on_error):
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.OpDesc.FromString(str(protostr))
-        return _debug_string_(proto)
+        return _debug_string_(proto, throw_on_error)
+
+    def __str__(self):
+        return self.to_string(True)
 
     __repr__ = __str__
 
@@ -349,9 +358,12 @@ class Block(object):
         self.program = program
 
     def __str__(self):
+        return self.to_string(True)
+
+    def to_string(self, throw_on_error):
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.BlockDesc.FromString(str(protostr))
-        return _debug_string_(proto)
+        return _debug_string_(proto, throw_on_error)
 
     __repr__ = __str__
 
@@ -454,9 +466,12 @@ class Program(object):
         self.current_block_idx = 0
 
     def __str__(self):
+        return self.to_string(True)
+
+    def to_string(self, throw_on_error):
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.ProgramDesc.FromString(str(protostr))
-        return _debug_string_(proto)
+        return _debug_string_(proto, throw_on_error)
 
     def clone(self):
         p = Program()
@@ -512,7 +527,14 @@ class Program(object):
         assert isinstance(target, Variable)
         if no_grad_set is None:
             no_grad_set = set()
-        param_to_grad_info = self.desc.append_backward(target.desc, no_grad_set)
+        try:
+            param_to_grad_info = self.desc.append_backward(target.desc,
+                                                           no_grad_set)
+        except Exception as e:
+            raise core.EnforceNotMet(
+                str(e) + "\nCurrent protobuf is\n{0}".format(
+                    self.to_string(False)))
+
         self.sync_with_cpp()
         return param_to_grad_info
 
@@ -563,8 +585,10 @@ class Parameter(Variable):
 g_main_program = Program()
 g_startup_program = Program()
 
+
 def default_startup_program():
     return g_startup_program
 
+
 def default_main_program():
     return g_main_program
diff --git a/python/paddle/v2/fluid/net_drawer.py b/python/paddle/v2/fluid/net_drawer.py
index 17ad547c2bb5b79ef8225dd1a8f1ef49a6572508..94fdd5e38970b309580de6fc934b158a3c46e464 100644
--- a/python/paddle/v2/fluid/net_drawer.py
+++ b/python/paddle/v2/fluid/net_drawer.py
@@ -66,10 +66,13 @@ def parse_graph(program, graph, var_dict, **kwargs):
             if not var_dict.has_key(var):
                 var_dict[var] = "Feed"
 
+    temp_id = 0
     proto = framework_pb2.ProgramDesc.FromString(
         program.desc.serialize_to_string())
     for block in proto.blocks:
         for op in block.ops:
+            op.type = op.type + "_" + str(temp_id)
+            temp_id += 1
             graph.node(**draw_node(op))
             for o in op.outputs:
                 for arg in o.arguments:
@@ -78,6 +81,7 @@ def parse_graph(program, graph, var_dict, **kwargs):
                 for arg in e.arguments:
                     if var_dict.has_key(arg):
                         graph.edge(**draw_edge(var_dict, op, e, arg))
+        break  # only plot the first block
 
 
 def draw_graph(startup_program, main_program, **kwargs):
diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
index ee677a2c5670a092c509b9ce1c555223bf22957f..a7f3bfc0caf76302674a00c80c2bd9ebf834f872 100644
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -1,33 +1,22 @@
+import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.framework as framework
-from paddle.v2.fluid.io import save_persistables, load_persistables
+import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.io import save_persistables, load_persistables
+from paddle.v2.fluid.optimizer import SGDOptimizer
 
-import numpy as np
-
-x = layers.data(
-    name='x',
-    shape=[13],
-    data_type='float32')
+x = layers.data(name='x', shape=[13], data_type='float32')
 
-y_predict = layers.fc(input=x,
-                      size=1,
-                      act=None)
+y_predict = layers.fc(input=x, size=1, act=None)
 
-y = layers.data(
-    name='y',
-    shape=[1],
-    data_type='float32')
+y = layers.data(name='y', shape=[1], data_type='float32')
 
-cost = layers.square_error_cost(
-    input=y_predict,
-    label=y)
+cost = layers.square_error_cost(input=y_predict, label=y)
 avg_cost = layers.mean(x=cost)
 
-sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+sgd_optimizer = SGDOptimizer(learning_rate=0.001)
 opts = sgd_optimizer.minimize(avg_cost)
 
 BATCH_SIZE = 20
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
index f4be835b3ad57d5b0076e8a816c2c3def46e0663..b8506125501b6e533c4594b37943ec36ca8e7d30 100644
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -1,21 +1,16 @@
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
-import paddle.v2.fluid.optimizer as optimizer
 from paddle.v2.fluid.executor import Executor
-import paddle.v2.fluid.framework as framework
 from paddle.v2.fluid.initializer import XavierInitializer
+from paddle.v2.fluid.optimizer import AdamOptimizer
 
 
 def resnet_cifar10(input, depth=32):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu'):
+    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
         tmp = layers.conv2d(
             input=input,
             filter_size=filter_size,
@@ -24,9 +19,7 @@ def resnet_cifar10(input, depth=32):
             padding=padding,
             act=None,
             bias_attr=False)
-        return layers.batch_norm(
-            input=tmp,
-            act=act)
+        return layers.batch_norm(input=tmp, act=act)
 
     def shortcut(input, ch_in, ch_out, stride, program, init_program):
         if ch_in != ch_out:
@@ -35,28 +28,11 @@ def resnet_cifar10(input, depth=32):
         else:
             return input
 
-    def basicblock(input,
-                   ch_in,
-                   ch_out,
-                   stride):
-        tmp = conv_bn_layer(
-            input,
-            ch_out,
-            3,
-            stride,
-            1)
-        tmp = conv_bn_layer(
-            tmp,
-            ch_out,
-            3,
-            1,
-            1,
-            act=None)
+    def basicblock(input, ch_in, ch_out, stride):
+        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
         short = shortcut(input, ch_in, ch_out, stride)
-        return layers.elementwise_add(
-            x=tmp,
-            y=short,
-            act='relu')
+        return layers.elementwise_add(x=tmp, y=short, act='relu')
 
     def layer_warp(block_func, input, ch_in, ch_out, count, stride):
         tmp = block_func(input, ch_in, ch_out, stride)
@@ -67,45 +43,17 @@ def resnet_cifar10(input, depth=32):
     assert (depth - 2) % 6 == 0
     n = (depth - 2) / 6
     conv1 = conv_bn_layer(
-        input=input,
-        ch_out=16,
-        filter_size=3,
-        stride=1,
-        padding=1)
-    res1 = layer_warp(
-        basicblock,
-        conv1,
-        16,
-        16,
-        n,
-        1)
-    res2 = layer_warp(
-        basicblock,
-        res1,
-        16,
-        32,
-        n,
-        2)
-    res3 = layer_warp(
-        basicblock,
-        res2,
-        32,
-        64,
-        n,
-        2)
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
     pool = layers.pool2d(
-        input=res3,
-        pool_size=8,
-        pool_type='avg',
-        pool_stride=1)
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
     return pool
 
 
 def vgg16_bn_drop(input):
-    def conv_block(input,
-                   num_filter,
-                   groups,
-                   dropouts):
+    def conv_block(input, num_filter, groups, dropouts):
         return nets.img_conv_group(
             input=input,
             pool_size=2,
@@ -123,22 +71,14 @@ def vgg16_bn_drop(input):
     conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
     conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
 
-    drop = layers.dropout(
-        x=conv5,
-        dropout_prob=0.5)
+    drop = layers.dropout(x=conv5, dropout_prob=0.5)
     fc1 = layers.fc(input=drop,
                     size=512,
                     act=None,
                     param_attr={"initializer": XavierInitializer()})
-    reshape1 = layers.reshape(
-        x=fc1,
-        shape=list(fc1.shape + (1, 1)))
-    bn = layers.batch_norm(
-        input=reshape1,
-        act='relu')
-    drop2 = layers.dropout(
-        x=bn,
-        dropout_prob=0.5)
+    reshape1 = layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1)))
+    bn = layers.batch_norm(input=reshape1, act='relu')
+    drop2 = layers.dropout(x=bn, dropout_prob=0.5)
     fc2 = layers.fc(input=drop2,
                     size=512,
                     act=None,
@@ -165,8 +105,8 @@ cost = layers.cross_entropy(input=predict, label=label)
 avg_cost = layers.mean(x=cost)
 accuracy = layers.accuracy(input=predict, label=label)
 
-# optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-optimizer = optimizer.AdamOptimizer(learning_rate=0.001)
+# optimizer = SGDOptimizer(learning_rate=0.001)
+optimizer = AdamOptimizer(learning_rate=0.001)
 opts = optimizer.minimize(avg_cost)
 
 BATCH_SIZE = 128
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
index f330ff58137068e429008bc7aa07bbc8d2e35ac4..75fbaf83e8f3e62eb0d0abef9cfa267b65e72973 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
@@ -1,22 +1,15 @@
+import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.nets as nets
 import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.evaluator as evaluator
 import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.optimizer import AdamOptimizer
 
-import numpy as np
-
-images = layers.data(
-    name='pixel',
-    shape=[1, 28, 28],
-    data_type='float32')
-label = layers.data(
-    name='label',
-    shape=[1],
-    data_type='int64')
+images = layers.data(name='pixel', shape=[1, 28, 28], data_type='float32')
+label = layers.data(name='label', shape=[1], data_type='int64')
 conv_pool_1 = nets.simple_img_conv_pool(
     input=images,
     filter_size=5,
@@ -32,17 +25,13 @@ conv_pool_2 = nets.simple_img_conv_pool(
     pool_stride=2,
     act="relu")
 
-predict = layers.fc(input=conv_pool_2,
-                    size=10,
-                    act="softmax")
+predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
 cost = layers.cross_entropy(input=predict, label=label)
 avg_cost = layers.mean(x=cost)
-optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
+optimizer = AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
 opts = optimizer.minimize(avg_cost)
 
-accuracy, acc_out = evaluator.accuracy(
-    input=predict,
-    label=label)
+accuracy, acc_out = evaluator.accuracy(input=predict, label=label)
 
 BATCH_SIZE = 50
 PASS_NUM = 3
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
index b0164e3e3659c19edf2af45e706fb48ac1fe2b1c..cf10b1942e6a8243b18b0ae4586fdd7ec1a665fb 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
@@ -1,19 +1,15 @@
+import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.regularizer import L2DecayRegularizer
 from paddle.v2.fluid.initializer import UniformInitializer
-
-import numpy as np
+from paddle.v2.fluid.optimizer import MomentumOptimizer
+from paddle.v2.fluid.regularizer import L2DecayRegularizer
 
 BATCH_SIZE = 128
-image = layers.data(
-    name='x',
-    shape=[784],
-    data_type='float32')
+image = layers.data(name='x', shape=[784], data_type='float32')
 
 param_attr = {
     'name': None,
@@ -22,32 +18,21 @@ param_attr = {
     'regularization': L2DecayRegularizer(0.0005 * BATCH_SIZE)
 }
 
-hidden1 = layers.fc(input=image,
-                    size=128,
-                    act='relu',
-                    param_attr=param_attr)
-hidden2 = layers.fc(input=hidden1,
-                    size=64,
-                    act='relu',
-                    param_attr=param_attr)
+hidden1 = layers.fc(input=image, size=128, act='relu', param_attr=param_attr)
+hidden2 = layers.fc(input=hidden1, size=64, act='relu', param_attr=param_attr)
 
 predict = layers.fc(input=hidden2,
                     size=10,
                     act='softmax',
                     param_attr=param_attr)
 
-label = layers.data(
-    name='y',
-    shape=[1],
-    data_type='int64')
+label = layers.data(name='y', shape=[1], data_type='int64')
 
 cost = layers.cross_entropy(input=predict, label=label)
 avg_cost = layers.mean(x=cost)
-accuracy = layers.accuracy(
-    input=predict,
-    label=label)
+accuracy = layers.accuracy(input=predict, label=label)
 
-optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
 opts = optimizer.minimize(avg_cost)
 
 train_reader = paddle.batch(
diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
index eefcb55bebff41eb9c67d9f0c8e83a5f1d4599bd..55ded3aed3a23c8cd7795f915dc1cbd512c6d945 100644
--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -1,12 +1,11 @@
+import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.nets as nets
 import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.executor import Executor
-
-import numpy as np
+from paddle.v2.fluid.optimizer import SGDOptimizer
 
 IS_SPARSE = True
 USE_GPU = False
@@ -19,10 +18,7 @@ def get_usr_combined_features():
 
     USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
 
-    uid = layers.data(
-        name='user_id',
-        shape=[1],
-        data_type='int64')
+    uid = layers.data(name='user_id', shape=[1], data_type='int64')
 
     usr_emb = layers.embedding(
         input=uid,
@@ -31,15 +27,11 @@ def get_usr_combined_features():
         param_attr={'name': 'user_table'},
         is_sparse=IS_SPARSE)
 
-    usr_fc = layers.fc(input=usr_emb,
-                       size=32)
+    usr_fc = layers.fc(input=usr_emb, size=32)
 
     USR_GENDER_DICT_SIZE = 2
 
-    usr_gender_id = layers.data(
-        name='gender_id',
-        shape=[1],
-        data_type='int64')
+    usr_gender_id = layers.data(name='gender_id', shape=[1], data_type='int64')
 
     usr_gender_emb = layers.embedding(
         input=usr_gender_id,
@@ -47,14 +39,10 @@ def get_usr_combined_features():
         param_attr={'name': 'gender_table'},
         is_sparse=IS_SPARSE)
 
-    usr_gender_fc = layers.fc(input=usr_gender_emb,
-                              size=16)
+    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
 
     USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
-    usr_age_id = layers.data(
-        name='age_id',
-        shape=[1],
-        data_type="int64")
+    usr_age_id = layers.data(name='age_id', shape=[1], data_type="int64")
 
     usr_age_emb = layers.embedding(
         input=usr_age_id,
@@ -62,14 +50,10 @@ def get_usr_combined_features():
         is_sparse=IS_SPARSE,
         param_attr={'name': 'age_table'})
 
-    usr_age_fc = layers.fc(input=usr_age_emb,
-                           size=16)
+    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
 
     USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
-    usr_job_id = layers.data(
-        name='job_id',
-        shape=[1],
-        data_type="int64")
+    usr_job_id = layers.data(name='job_id', shape=[1], data_type="int64")
 
     usr_job_emb = layers.embedding(
         input=usr_job_id,
@@ -77,16 +61,12 @@ def get_usr_combined_features():
         param_attr={'name': 'job_table'},
         is_sparse=IS_SPARSE)
 
-    usr_job_fc = layers.fc(input=usr_job_emb,
-                           size=16)
+    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
 
     concat_embed = layers.concat(
-        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc],
-        axis=1)
+        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
 
-    usr_combined_features = layers.fc(input=concat_embed,
-                                      size=200,
-                                      act="tanh")
+    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
 
     return usr_combined_features
 
@@ -95,10 +75,7 @@ def get_mov_combined_features():
 
     MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
 
-    mov_id = layers.data(
-        name='movie_id',
-        shape=[1],
-        data_type='int64')
+    mov_id = layers.data(name='movie_id', shape=[1], data_type='int64')
 
     mov_emb = layers.embedding(
         input=mov_id,
@@ -107,36 +84,24 @@ def get_mov_combined_features():
         param_attr={'name': 'movie_table'},
         is_sparse=IS_SPARSE)
 
-    mov_fc = layers.fc(input=mov_emb,
-                       size=32)
+    mov_fc = layers.fc(input=mov_emb, size=32)
 
     CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
 
-    category_id = layers.data(
-        name='category_id',
-        shape=[1],
-        data_type='int64')
+    category_id = layers.data(name='category_id', shape=[1], data_type='int64')
 
     mov_categories_emb = layers.embedding(
-        input=category_id,
-        size=[CATEGORY_DICT_SIZE, 32],
-        is_sparse=IS_SPARSE)
+        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
 
     mov_categories_hidden = layers.sequence_pool(
-        input=mov_categories_emb,
-        pool_type="sum")
+        input=mov_categories_emb, pool_type="sum")
 
     MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
 
-    mov_title_id = layers.data(
-        name='movie_title',
-        shape=[1],
-        data_type='int64')
+    mov_title_id = layers.data(name='movie_title', shape=[1], data_type='int64')
 
     mov_title_emb = layers.embedding(
-        input=mov_title_id,
-        size=[MOV_TITLE_DICT_SIZE, 32],
-        is_sparse=IS_SPARSE)
+        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
 
     mov_title_conv = nets.sequence_conv_pool(
         input=mov_title_emb,
@@ -146,13 +111,10 @@ def get_mov_combined_features():
         pool_type="sum")
 
     concat_embed = layers.concat(
-        input=[mov_fc, mov_categories_hidden, mov_title_conv],
-        axis=1)
+        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
 
     # FIXME(dzh) : need tanh operator
-    mov_combined_features = layers.fc(input=concat_embed,
-                                      size=200,
-                                      act="tanh")
+    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
 
     return mov_combined_features
 
@@ -162,18 +124,11 @@ def model():
     mov_combined_features = get_mov_combined_features()
 
     # need cos sim
-    inference = layers.cos_sim(
-        X=usr_combined_features,
-        Y=mov_combined_features)
+    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
 
-    label = layers.data(
-        name='score',
-        shape=[1],
-        data_type='float32')
+    label = layers.data(name='score', shape=[1], data_type='float32')
 
-    square_cost = layers.square_error_cost(
-        input=inference,
-        label=label)
+    square_cost = layers.square_error_cost(input=inference, label=label)
 
     avg_cost = layers.mean(x=square_cost)
 
@@ -182,7 +137,7 @@ def model():
 
 def main():
     cost = model()
-    sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.2)
+    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
     opts = sgd_optimizer.minimize(cost)
 
     if USE_GPU:
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
index 91fc79a9870a31205098d8a40de6c033d5bf60b9..e69b915a9cfaf9e06075991975563a1fc1196661 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
@@ -1,12 +1,11 @@
+import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.nets as nets
 import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.executor import Executor
-
-import numpy as np
+from paddle.v2.fluid.optimizer import AdamOptimizer
 
 
 def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32):
@@ -31,7 +30,7 @@ def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32):
                            act="softmax")
     cost = layers.cross_entropy(input=prediction, label=label)
     avg_cost = layers.mean(x=cost)
-    adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
+    adam_optimizer = AdamOptimizer(learning_rate=0.002)
     opts = adam_optimizer.minimize(avg_cost)
     acc = layers.accuracy(input=prediction, label=label)
     return avg_cost, acc
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
index 8c3d4488354eb363cd1d378ebd4cb8069e7c1b1d..65d44542501e6531fc1912cbc726a1d903b9c031 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
@@ -1,12 +1,10 @@
+import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.nets as nets
 import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
-
-import numpy as np
+from paddle.v2.fluid.optimizer import AdamOptimizer
 
 
 def stacked_lstm_net(input_dim,
@@ -41,7 +39,7 @@ def stacked_lstm_net(input_dim,
                            act='softmax')
     cost = layers.cross_entropy(input=prediction, label=label)
     avg_cost = layers.mean(x=cost)
-    adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
+    adam_optimizer = AdamOptimizer(learning_rate=0.002)
     opts = adam_optimizer.minimize(avg_cost)
     acc = layers.accuracy(input=prediction, label=label)
     return avg_cost, acc
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
index a7d791c1f38d4843f084127e879d613b21ae8daf..280f6e902c34512735a27586221c2be68963ef2b 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
@@ -1,11 +1,10 @@
+import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
-
-import numpy as np
+from paddle.v2.fluid.optimizer import AdamOptimizer
 
 
 def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
@@ -33,7 +32,7 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
     cost = layers.cross_entropy(input=prediction, label=label)
 
     avg_cost = layers.mean(x=cost)
-    adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
+    adam_optimizer = AdamOptimizer(learning_rate=0.002)
     opts = adam_optimizer.minimize(avg_cost)
     acc = layers.accuracy(input=prediction, label=label)
 
diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py
index 9dcb6f2fea06ea8cd061be4f148854408779f990..afa7b285198e0349317e123e4bd98e8336217afa 100644
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -1,11 +1,10 @@
+import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
-
-import numpy as np
+from paddle.v2.fluid.optimizer import SGDOptimizer
 
 PASS_NUM = 100
 EMBED_SIZE = 32
@@ -17,26 +16,11 @@ IS_SPARSE = True
 word_dict = paddle.dataset.imikolov.build_dict()
 dict_size = len(word_dict)
 
-first_word = layers.data(
-    name='firstw',
-    shape=[1],
-    data_type='int64')
-second_word = layers.data(
-    name='secondw',
-    shape=[1],
-    data_type='int64')
-third_word = layers.data(
-    name='thirdw',
-    shape=[1],
-    data_type='int64')
-forth_word = layers.data(
-    name='forthw',
-    shape=[1],
-    data_type='int64')
-next_word = layers.data(
-    name='nextw',
-    shape=[1],
-    data_type='int64')
+first_word = layers.data(name='firstw', shape=[1], data_type='int64')
+second_word = layers.data(name='secondw', shape=[1], data_type='int64')
+third_word = layers.data(name='thirdw', shape=[1], data_type='int64')
+forth_word = layers.data(name='forthw', shape=[1], data_type='int64')
+next_word = layers.data(name='nextw', shape=[1], data_type='int64')
 
 embed_first = layers.embedding(
     input=first_word,
@@ -64,19 +48,12 @@ embed_forth = layers.embedding(
     param_attr={'name': 'shared_w'})
 
 concat_embed = layers.concat(
-    input=[embed_first, embed_second, embed_third, embed_forth],
-    axis=1)
-hidden1 = layers.fc(input=concat_embed,
-                    size=HIDDEN_SIZE,
-                    act='sigmoid')
-predict_word = layers.fc(input=hidden1,
-                         size=dict_size,
-                         act='softmax')
-cost = layers.cross_entropy(
-    input=predict_word,
-    label=next_word)
+    input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
+hidden1 = layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
+predict_word = layers.fc(input=hidden1, size=dict_size, act='softmax')
+cost = layers.cross_entropy(input=predict_word, label=next_word)
 avg_cost = layers.mean(x=cost)
-sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+sgd_optimizer = SGDOptimizer(learning_rate=0.001)
 opts = sgd_optimizer.minimize(avg_cost)
 
 train_reader = paddle.batch(
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py
index 907b52c405d9e5c02c70f611e4c777ba21948c40..2240dc73cdd31f320fed174dd811e93c6640137f 100644
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
@@ -110,13 +110,30 @@ class TestConv2dOp(OpTest):
         self.op_type = "conv2d"
 
 
+class TestWithPad(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+
+class TestWithStride(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+
 class TestWithGroup(TestConv2dOp):
     def init_group(self):
         self.groups = 3
 
-    def init_op_type(self):
-        self.op_type = "conv2d"
-
 
 class TestWith1x1(TestConv2dOp):
     def init_test_case(self):
@@ -127,15 +144,9 @@ class TestWith1x1(TestConv2dOp):
         f_c = self.input_size[1] / self.groups
         self.filter_size = [6, f_c, 1, 1]
 
-    def init_dilation(self):
-        self.dilations = [1, 1]
-
     def init_group(self):
         self.groups = 3
 
-    def init_op_type(self):
-        self.op_type = "conv2d"
-
 
 class TestWithDilation(TestConv2dOp):
     def init_test_case(self):
@@ -152,14 +163,19 @@ class TestWithDilation(TestConv2dOp):
     def init_group(self):
         self.groups = 3
 
+
+#----------------Conv2dCudnn----------------
+class TestCudnn(TestConv2dOp):
     def init_op_type(self):
-        self.op_type = "conv2d"
+        self.op_type = "conv_cudnn"
 
 
-#----------------Conv2dCudnn----------------
+class TestCudnnWithPad(TestWithPad):
+    def init_op_type(self):
+        self.op_type = "conv_cudnn"
 
 
-class TestCudnn(TestConv2dOp):
+class TestCudnnWithStride(TestWithStride):
     def init_op_type(self):
         self.op_type = "conv_cudnn"
 
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
index 54349c018c4a53b8767d6cd4f94d99c719dc0237..d7b1f2f2a3abf6335998742dbbef8e17794170fa 100644
--- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
@@ -4,9 +4,7 @@ from op_test import OpTest
 
 
 def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
-    # [2, 3, 5, 5]
     in_n, in_c, in_h, in_w = input_.shape
-    # [3, 6, 3, 3]
     f_c, out_c, f_h, f_w = filter_.shape
     assert in_c == f_c
 
@@ -29,6 +27,7 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
                     j1, j2 = j * stride[0], j * stride[0] + f_w
                     out[n, k, i1:i2, j1:j2] += tmp_out
 
+    out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
     return out
 
 
@@ -36,8 +35,6 @@ class TestConv2dTransposeOp(OpTest):
     def setUp(self):
         # init as conv transpose
         self.init_op_type()
-
-        # [2, 3, 5, 5] -> kernel [3, 6, 3, 3] -> output [2, 6, 7, 7]
         self.init_test_case()
 
         conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad}
@@ -55,7 +52,6 @@ class TestConv2dTransposeOp(OpTest):
         self.outputs = {'Output': output}
 
     def test_check_output(self):
-        print 'check output here for', self.op_type
         self.check_output()
 
     def test_check_grad_no_input(self):
@@ -88,6 +84,26 @@ class TestConv2dTransposeOp(OpTest):
         self.op_type = "conv2d_transpose"
 
 
+class TestWithPad(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+
+class TestWithStride(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+
 # ------------ test_cudnn ------------
 class TestCudnn(TestConv2dTransposeOp):
     def init_op_type(self):
diff --git a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
index 132fe7931438a30cf02e4ad2894c0838e48ffc9f..8fd34b87bfea91307f52fdcbb9f71f2e1a9c6c56 100644
--- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
@@ -4,9 +4,7 @@ from op_test import OpTest
 
 
 def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
-    # [2, 3, 5, 5, 5]
     in_n, in_c, in_d, in_h, in_w = input_.shape
-    # [3, 6, 3, 3, 3]
     f_c, out_c, f_d, f_h, f_w = filter_.shape
     assert in_c == f_c
 
@@ -14,7 +12,6 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
     out_d = (in_d - 1) * stride[0] + f_d
     out_h = (in_h - 1) * stride[1] + f_h
     out_w = (in_w - 1) * stride[2] + f_w
-
     out = np.zeros((in_n, out_c, out_d, out_h, out_w))
 
     for n in range(in_n):
@@ -33,6 +30,8 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
                         j1, j2 = j * stride[2], j * stride[2] + f_w
                         out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out
 
+    out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
+              pad[2]]
     return out
 
 
@@ -40,8 +39,6 @@ class TestConv3dTransposeOp(OpTest):
     def setUp(self):
         # init as conv transpose
         self.init_op_type()
-
-        # [2, 3, 5, 5, 5] -> kernel [3, 6, 3, 3, 3] -> output [2, 6, 7, 7, 7]
         self.init_test_case()
 
         conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad}
@@ -49,7 +46,6 @@ class TestConv3dTransposeOp(OpTest):
         filter_ = np.random.random(self.filter_size).astype("float32")
         output = conv3dtranspose_forward_naive(
             input_, filter_, conv3dtranspose_param).astype("float32")
-        # print 'deconv output py', output, output.shape
 
         self.inputs = {'Input': input_, 'Filter': filter_}
         self.attrs = {
@@ -60,7 +56,6 @@ class TestConv3dTransposeOp(OpTest):
         self.outputs = {'Output': output}
 
     def test_check_output(self):
-        print 'check output here'
         self.check_output()
 
     def test_check_grad(self):
@@ -85,7 +80,7 @@ class TestConv3dTransposeOp(OpTest):
         self.pad = [0, 0, 0]
         self.stride = [1, 1, 1]
         self.dilations = [1, 1, 1]
-        self.input_size = [2, 3, 5, 5, 5]  # NCHW
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
 
@@ -93,5 +88,31 @@ class TestConv3dTransposeOp(OpTest):
         self.op_type = "conv3d_transpose"
 
 
+class TestWithPad(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]  # non-zero pad: reference crops each border
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]  # filter dim 0 must equal input channels
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+class TestWithStride(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [2, 2, 2]  # only difference from TestWithPad above
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]  # filter dim 0 must equal input channels
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+# ------------ test_cudnn ------------
+class TestCudnn(TestConv3dTransposeOp):
+    def init_op_type(self):
+        self.op_type = "conv3d_transpose_cudnn"  # only the op type changes
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_gru_op.py b/python/paddle/v2/fluid/tests/test_gru_op.py
index b2474cff94c6c71cc62bc8e69a5d83e38d51c511..fa2c5a53ec4a01b6545e25f773c11277a4d24706 100644
--- a/python/paddle/v2/fluid/tests/test_gru_op.py
+++ b/python/paddle/v2/fluid/tests/test_gru_op.py
@@ -6,7 +6,8 @@ from test_lstm_op import identity, sigmoid, tanh, relu
 
 
 class TestGRUOp(OpTest):
-    batch_size = 9
+    lod = [[0, 2, 6, 9]]
+    batch_size = lod[0][-1]
     frame_size = 5
     activate = {
         'identity': identity,
@@ -35,7 +36,7 @@ class TestGRUOp(OpTest):
                            seq_starts[sorted_seqs[i]] + batch_idx)
                 idx_in_seq.append(idx)
             idx_in_seq_list.append(idx_in_seq)
-        return idx_in_seq_list
+        return idx_in_seq_list, sorted_seqs
 
     def gru_step(self, x, h_p, w, b):
         batch_size = x.shape[0]
@@ -66,8 +67,8 @@ class TestGRUOp(OpTest):
         batch_hidden = self.outputs['BatchHidden']
         hidden = self.outputs['Hidden']
         idx_in_seq_list = self.idx_in_seq_list
-        h_p = self.inputs['H0'] if self.inputs.has_key('H0') else np.zeros(
-            (len(idx_in_seq_list[0]), self.frame_size))
+        h_p = self.inputs['H0'][self.sorted_seqs] if self.inputs.has_key(
+            'H0') else np.zeros((len(idx_in_seq_list[0]), self.frame_size))
         num_batch = len(idx_in_seq_list)
         end_idx = 0
         for batch_idx in range(num_batch):
@@ -84,8 +85,9 @@ class TestGRUOp(OpTest):
         return batch_gate, batch_reset_hidden_prev, hidden
 
     def set_data(self):
-        lod = [[0, 2, 6, self.batch_size]]
-        self.idx_in_seq_list = self.seq_to_batch(lod, self.is_reverse)
+        lod = self.lod
+        self.idx_in_seq_list, self.sorted_seqs = self.seq_to_batch(
+            lod, self.is_reverse)
         batch_size = self.batch_size
         frame_size = self.frame_size
         input = np.random.rand(batch_size, frame_size * 3).astype('float64')
@@ -146,7 +148,7 @@ class TestGRUOpReverse(TestGRUOp):
     def set_confs(self):
         self.is_reverse = True
         self.attrs = {
-            'activation': 'identity',
+            'activation': 'tanh',
             'gate_activation': 'sigmoid',
             'is_reverse': self.is_reverse
         }
diff --git a/python/paddle/v2/fluid/tests/test_is_empty_op.py b/python/paddle/v2/fluid/tests/test_is_empty_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed6e3fe24f6333c9c90d760787eb13241a7e1868
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py
@@ -0,0 +1,43 @@
+import unittest
+import numpy as np
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
+
+
+def create_tensor(scope, name, np_data):
+    tensor = scope.var(name).get_tensor()  # tensor of variable `name`
+    tensor.set_dims(np_data.shape)  # dims taken from the numpy array
+    tensor.set(np_data, core.CPUPlace())  # load np_data on CPUPlace
+    return tensor  # returned so callers can adjust it further
+
+
+class TestIsEmptyOp(unittest.TestCase):
+    def setUp(self):
+        self.scope = core.Scope()
+        # create input variables
+        np_data0 = np.array([0, 1, 2])  # three elements: non-empty input
+        create_tensor(self.scope, "X0", np_data0)
+
+        np_data1 = np.array([1])
+        t = create_tensor(self.scope, "X1", np_data1)
+        t.set_dims([0])  # override dims to [0]: X1 is the empty case
+
+        # create output variables
+        self.scope.var("out")
+
+    def test_no_empty(self):
+        self.one_case("X0", False)  # X0 was filled with 3 elements
+
+    def test_empty(self):
+        self.one_case("X1", True)  # X1 had its dims set to [0] in setUp
+
+    def one_case(self, input, target):
+        op = Operator(type="is_empty", X=input, Out="out")
+        ctx = core.DeviceContext.create(core.CPUPlace())
+        op.run(self.scope, ctx)  # runs with a CPUPlace device context
+        out = self.scope.var("out").get_tensor()
+        self.assertEqual(np.array(out)[0], target)  # element 0 is the flag
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py
index 0f01acb3b94dc55a3536e751108e785ddc6e47bb..84b432333f950f754a97bc1a051b59c16fb22aed 100644
--- a/python/paddle/v2/fluid/tests/test_while_op.py
+++ b/python/paddle/v2/fluid/tests/test_while_op.py
@@ -2,6 +2,7 @@ import unittest
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.core as core
+from paddle.v2.fluid.backward import append_backward_ops
 import numpy
 
 
@@ -16,7 +17,7 @@ class TestWhileOp(unittest.TestCase):
         i = layers.zeros(shape=[1], dtype='int64')
         i.stop_gradient = True
         init = layers.zeros(shape=[10], dtype='float32')
-        mem_array = layers.array_write(init, i=i)
+        mem_array = layers.array_write(x=init, i=i)
         data_array = layers.array_write(x=d0, i=i)
 
         i = layers.increment(i)
@@ -29,17 +30,23 @@ class TestWhileOp(unittest.TestCase):
         i.stop_gradient = True
 
         array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
+        array_len.stop_gradient = True
         cond = layers.less_than(x=i, y=array_len)
 
         while_op = layers.While(cond=cond)
         with while_op.block():
             d = layers.array_read(array=data_array, i=i)
             prev = layers.array_read(array=mem_array, i=i)
-            i = layers.increment(x=i, in_place=True)
             result = layers.sums(input=[d, prev])
+
+            i = layers.increment(x=i, in_place=True)
             layers.array_write(result, i=i, array=mem_array)
             layers.less_than(x=i, y=array_len, cond=cond)
-        sum_result = layers.array_read(mem_array, i=array_len)
+
+        sum_result = layers.array_read(array=mem_array, i=i)
+        loss = layers.mean(x=sum_result)
+
+        append_backward_ops(loss)
 
         cpu = core.CPUPlace()
         exe = Executor(cpu)