Commit dec61ab6 authored by chengduoZH

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_cudnn_pool3d

@@ -36,8 +36,7 @@ include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
-option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND})
-option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND})
+option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
@@ -82,10 +81,8 @@ if(ANDROID OR IOS)
        "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
    set(WITH_RDMA OFF CACHE STRING
        "Disable RDMA when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKLDNN OFF CACHE STRING
-        "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKLML OFF CACHE STRING
-        "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
+    set(WITH_MKL OFF CACHE STRING
+        "Disable MKL when cross-compiling for Android and iOS" FORCE)

    # Compile PaddlePaddle mobile inference library
    if (NOT WITH_C_API)
@@ -111,6 +108,14 @@ else()
    set(THIRD_PARTY_BUILD_TYPE Release)
endif()

+set(WITH_MKLML ${WITH_MKL})
+if (WITH_MKL AND ${AVX2_FOUND})
+    set(WITH_MKLDNN ON)
+else()
+    message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
+    set(WITH_MKLDNN OFF)
+endif()
+
########################################################################################

include(external/mklml)     # download mklml package
@@ -158,14 +163,15 @@ set(EXTERNAL_LIBS
)

if(WITH_GPU)
-    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
-    if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
-    endif(NOT WITH_DSO)
+    include(cuda)
endif(WITH_GPU)

+if(WITH_MKLML)
+    list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
+endif()
+
if(WITH_MKLDNN)
-    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()

if(USE_NNPACK)
......
@@ -76,27 +76,14 @@ else()
    include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU)

-if(WITH_MKLDNN)
-    add_definitions(-DPADDLE_USE_MKLDNN)
-    if (WITH_MKLML AND MKLDNN_IOMP_DIR)
-        message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
-        set(OPENMP_FLAGS "-fopenmp")
-        set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-        set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
-    else()
-        find_package(OpenMP)
-        if(OPENMP_FOUND)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-        else()
-            message(WARNING "Can not find OpenMP."
-                "Some performance features in MKLDNN may not be available")
-        endif()
-    endif()
-endif(WITH_MKLDNN)
+if (WITH_MKLML AND MKLML_IOMP_LIB)
+    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
+    set(OPENMP_FLAGS "-fopenmp")
+    set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+    set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
+endif()

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
......
@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
# Set the architecture for iOS
if(NOT DEFINED IOS_ARCH)
  if(IOS_PLATFORM STREQUAL "OS")
-    # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
-    set(IOS_ARCH "arm64")
+    set(IOS_ARCH "armv7;armv7s;arm64")
  elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
-    # FIXME(liuyiqun): support "i386;x86_64" future
-    set(IOS_ARCH "x86_64")
+    set(IOS_ARCH "i386;x86_64")
  endif()
endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_
# Hidden visibilty is required for cxx on iOS
set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
-set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
+set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
......
if(NOT WITH_GPU)
return()
endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
######################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
# detect_installed_gpus(out_variable)
function(detect_installed_gpus out_variable)
if(NOT CUDA_gpu_detect_output)
set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
file(WRITE ${cufile} ""
"#include <cstdio>\n"
"int main() {\n"
" int count = 0;\n"
" if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
" if (count == 0) return -1;\n"
" for (int device = 0; device < count; ++device) {\n"
" cudaDeviceProp prop;\n"
" if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
" std::printf(\"%d.%d \", prop.major, prop.minor);\n"
" }\n"
" return 0;\n"
"}\n")
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
"--run" "${cufile}"
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(nvcc_res EQUAL 0)
# only keep the last line of nvcc_out
STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
list(GET nvcc_out -1 nvcc_out)
string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architectures from detect_installed_gpus tool" FORCE)
endif()
endif()
if(NOT CUDA_gpu_detect_output)
message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
else()
set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
endif()
endfunction()
########################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
# select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable)
# List of arch names
set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual")
set(archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING)
list(APPEND archs_names "Auto")
endif()
# set CUDA_ARCH_NAME strings (so it will be seen as a drop-down list in the CMake GUI)
set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
mark_as_advanced(CUDA_ARCH_NAME)
# verify CUDA_ARCH_NAME value
if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
string(REPLACE ";" ", " archs_names "${archs_names}")
message(FATAL_ERROR "Only ${archs_names} architeture names are supported.")
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Manual")
set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
else()
unset(CUDA_ARCH_BIN CACHE)
unset(CUDA_ARCH_PTX CACHE)
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
set(cuda_arch_bin "30 35")
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
set(cuda_arch_bin "50")
elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
detect_installed_gpus(cuda_arch_bin)
else() # (${CUDA_ARCH_NAME} STREQUAL "Manual")
set(cuda_arch_bin ${CUDA_ARCH_BIN})
endif()
# remove dots and convert to lists
string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}")
list(REMOVE_DUPLICATES cuda_arch_bin)
list(REMOVE_DUPLICATES cuda_arch_ptx)
set(nvcc_flags "")
set(nvcc_archs_readable "")
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified PTX for the concrete BIN
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
else()
# User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
endif()
endforeach()
# Tell NVCC to add PTX intermediate code for the specified architectures
foreach(arch ${cuda_arch_ptx})
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
list(APPEND nvcc_archs_readable compute_${arch})
endforeach()
string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
set(${out_variable} ${nvcc_flags} PARENT_SCOPE)
set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
endfunction()
message(STATUS "CUDA detected: " ${CUDA_VERSION})
if (${CUDA_VERSION} LESS 7.0)
set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
endif()
include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
# setting nvcc arch flags
select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
# Set C++11 support
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
@@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
    SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
-    SET(MKLDNN_MKLROOT ${MKLML_ROOT})
-    SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB})
-    SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR})
-    MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
+    MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
+ELSE()
+    MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
ENDIF()

SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
@@ -57,15 +56,16 @@ ExternalProject_Add(
    PREFIX ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND ""
    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
-    CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT}
+    CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
    CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
    CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-                     -DMKLROOT:PATH=${MKLDNN_MKLROOT}
+                     -DMKLROOT:PATH=${MKLML_ROOT}
)

ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
-MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}")
+MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
+add_definitions(-DPADDLE_USE_MKLDNN)
LIST(APPEND external_project_dependencies mkldnn)
@@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND})
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
        CACHE FILEPATH "openblas library." FORCE)
-    SET(OPENBLAS_CC "${CMAKE_C_COMPILER}")
+    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
    IF(CMAKE_CROSSCOMPILING)
        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
        ENDIF()
    ELSEIF(IOS)
-        # FIXME(liuyiqun): support multiple architectures
-        SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
-        SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
-        IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
-            SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
-            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
-        ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
-            SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
-            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
-        ENDIF()
+        IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
+            SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
+            SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
+        ELSE()
+            MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
+                                "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
+        ENDIF()
    ELSEIF(RPI)
        # use hardfp
......
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
INCLUDE(ExternalProject)

SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
......
@@ -149,58 +149,3 @@ endforeach()
foreach(flag ${GPU_COMMON_FLAGS})
    safe_set_nvflag(${flag})
endforeach()
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
function(specify_cuda_arch cuda_version cuda_arch)
if(${cuda_version} VERSION_GREATER "8.0")
foreach(capability 61 62)
if(${cuda_arch} STREQUAL ${capability})
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endforeach()
elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53")
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endfunction()
# Common gpu architectures: Kepler, Maxwell
foreach(capability 30 35 50)
list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
endforeach()
if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
endif()
# Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
endif()
# Custom gpu architecture
set(CUDA_ARCH)
if(CUDA_ARCH)
specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
endif()
set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
@@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME)
        target_link_libraries(${TARGET_NAME} log)
    endif(ANDROID)

-    if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
-        target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+    if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
+        target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
    endif()

    add_dependencies(${TARGET_NAME} ${external_project_dependencies})
......
@@ -335,6 +335,16 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp
    :noindex:

+dot_prod
+---------
+.. autoclass:: paddle.v2.layer.dot_prod
+    :noindex:
+
+out_prod
+--------
+.. autoclass:: paddle.v2.layer.out_prod
+    :noindex:
+
power
-----
.. autoclass:: paddle.v2.layer.power
......
@@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA.
We roughly divide the integration plan into the following aspects.

### CMake
-In `CMakeLists.txt` we will add a `WITH_MKLDNN` option; when it is set to `ON`, the MKL-DNN build is enabled, and OpenMP is turned on automatically to improve MKL-DNN performance.
-We will also introduce a `WITH_MKLML` option to choose whether to use the MKLML package shipped with MKL-DNN. The package can be used independently of MKL-DNN, but enabling MKLML together with MKL-DNN is recommended for the best performance.
-Therefore, we will create `mkldnn.cmake` and `mklml.cmake` files under `cmake/external`; they download the corresponding packages while PaddlePaddle is being built and place them in PaddlePaddle's third party directory.
-**Note**: when `WITH_MKLML=ON`, this package is used preferentially as PaddlePaddle's CBLAS and LAPACK library, so the logic in `cmake/cblas.cmake` will be adjusted slightly.
+In `CMakeLists.txt` we will expose a single `WITH_MKL` switch to the user; it is the master switch for both `WITH_MKLML` and `WITH_MKLDNN`.
+When `WITH_MKL` is ON, MKLML is enabled as PaddlePaddle's CBLAS and LAPACK library, and Intel OpenMP is enabled to improve MKLML performance. If the system supports the AVX2 instruction set or newer, MKL-DNN is enabled as well.
+When `WITH_MKL` is OFF, both MKLML and MKL-DNN are turned off.
+Therefore, we will create `mkldnn.cmake` and `mklml.cmake` files under `cmake/external`; they download the corresponding packages while PaddlePaddle is being built and place them in PaddlePaddle's third party directory.
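For reference, a user-facing sketch of how this single switch is meant to be used (a minimal illustration; the build directory and flag values below are examples, not part of the design itself):

```bash
# WITH_MKL is the only MKL-related switch a user needs to pass;
# WITH_MKLML and WITH_MKLDNN are derived from it by CMake.
mkdir -p build && cd build

# MKL enabled: MKLML serves as the CBLAS/LAPACK library, Intel OpenMP is turned on,
# and MKL-DNN is enabled automatically when the machine supports AVX2 or newer.
cmake .. -DWITH_MKL=ON

# MKL disabled: both MKLML and MKL-DNN are turned off.
cmake .. -DWITH_MKL=OFF
```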
### Layers
All MKL-DNN related C++ layers will be stored, following PaddlePaddle's directory structure, in
......
@@ -34,7 +34,7 @@ There are two ways to build PaddlePaddle's documentation.
cd TO_YOUR_PADDLE_CLONE_PATH
mkdir -p build
cd build
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
make gen_proto_py
make paddle_docs paddle_docs_cn
......
-# Building the PaddlePaddle Library for Android
+# Android Platform Compilation Guide

Users can cross-compile a PaddlePaddle library for the Android platform in either of the following two ways:
- a Docker-container-based build
......
-# Building the PaddlePaddle Library for iOS
+# iOS Platform Compilation Guide

Cross-compiling a PaddlePaddle library for the iOS platform must be done on MacOS. This document describes how to cross-compile a PaddlePaddle library for iOS from source on MacOS.

## Preparing the cross-compilation environment
@@ -25,7 +25,7 @@ Optional configuration parameters for the iOS platform:
- `IOS_PLATFORM`, can be set to `OS`/`SIMULATOR`; the default is `OS`.
  - `OS`, the build targets physical devices such as iPhone or iPad with the `arm` architecture.
  - `SIMULATOR`, the build targets the simulator platform with the `x86` architecture.
-- `IOS_ARCH`, the target architectures. The architectures that can be set for each `IOS_PLATFORM` are listed in the table below:
+- `IOS_ARCH`, the target architectures. The architectures that can be set for each `IOS_PLATFORM` are listed in the table below; by default all architectures are built:

<table class="docutils">
<colgroup>
@@ -41,11 +41,11 @@ Optional configuration parameters for the iOS platform:
<tbody valign="top">
<tr class="row-even">
<td>OS</td>
-<td>armv7, armv7s, arm64 (default)</td>
+<td>armv7, armv7s, arm64 </td>
</tr>
<tr class="row-odd">
<td>SIMULATOR</td>
-<td>i386, x86_64 (default)</td>
+<td>i386, x86_64 </td>
</tr>
</tbody>
</table>
@@ -66,7 +66,7 @@ Optional configuration parameters for the iOS platform:
```bash
cmake -DCMAKE_SYSTEM_NAME=iOS \
      -DIOS_PLATFORM=OS \
-      -DIOS_ARCH="arm64" \
+      -DIOS_ARCH="armv7;arm64" \
      -DIOS_ENABLE_BITCODE=ON \
      -DIOS_USE_VECLIB_FOR_BLAS=ON \
      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
@@ -112,6 +112,6 @@ $ make install
- the `lib` directory, which contains the PaddlePaddle C-API static library
- the `third_party` directory, which contains all the third-party libraries it depends on

-Note: it is recommended to install PaddlePaddle libraries built for different architectures into different directories, and then use the `lipo` tool to merge the static libraries into one fat library that supports multiple architectures.
+Note: if the PaddlePaddle library needs to support both physical devices and the simulator, the device and simulator versions must be built separately and then merged into a fat library with the `lipo` tool.
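A minimal sketch of that merge step with `lipo` (the install prefixes and the archive name `libpaddle_capi_whole.a` are assumptions; substitute the paths actually produced by your device and simulator builds):

```bash
# Assumed outputs of two separate builds (device and simulator); adjust to your own paths.
DEVICE_LIB=ios_os_install/lib/libpaddle_capi_whole.a
SIMULATOR_LIB=ios_simulator_install/lib/libpaddle_capi_whole.a

# Merge the two single-platform static libraries into one fat library.
lipo -create "$DEVICE_LIB" "$SIMULATOR_LIB" -output libpaddle_capi_whole.a

# Check which architectures the merged library contains.
lipo -info libpaddle_capi_whole.a
```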
At this point the PaddlePaddle library is fully installed. The merged fat library can be used in deep-learning iOS apps; see the C-API documentation for how to call it.
-# Building the PaddlePaddle Library for Raspberry Pi
+# Raspberry Pi Platform Compilation Guide

There are usually two ways to build a Raspberry Pi based version:
......
@@ -25,7 +25,9 @@ limitations under the License. */
#include "hl_matrix.h"
#include "hl_sequence.h"
#include "hl_sparse.h"
+#ifndef PADDLE_MOBILE_INFERENCE
#include "hl_warpctc_wrap.h"
+#endif

#ifdef HPPL_STUB_FUNC
#include "stub/hl_aggregate_stub.h"
......
@@ -270,6 +270,19 @@ static bool AllGradInSet(const std::vector<std::string>& names,
      return false;
    }
  }
+  if (VLOG_IS_ON(10)) {
+    std::ostringstream sout;
+    sout << "All input {";
+    for (auto& name : names) {
+      sout << name << ",";
+    }
+    sout << "} is in {";
+    for (auto& name : set) {
+      sout << name << ",";
+    }
+    sout << "}";
+    VLOG(10) << sout.str();
+  }
  return true;
}
@@ -290,14 +303,12 @@ static void CreateGradVarInBlock(
  auto ops = block_desc->AllOps();
  for (size_t op_index = grad_op_start_index; op_index < ops.size();
       ++op_index) {
-    bool need_infer_shape = false;
    std::unordered_set<std::string> new_vars;
    ForEachVarName(ops[op_index]->Outputs(),
                   [&](const std::string& grad_var_name) {
                     if (block_desc->HasVar(grad_var_name)) {
                       return false;
                     }
-                     need_infer_shape = true;
                     auto var = block_desc->Var(grad_var_name);
                     new_vars.insert(var->Name());
                     auto it = param_name_map.find(grad_var_name);
@@ -311,23 +322,21 @@ static void CreateGradVarInBlock(
                     grad_record.op_idx_ = static_cast<int>(op_index);
                     return false; /* not break */
                   });
-    if (need_infer_shape) {
-      ops[op_index]->InferVarType(block_desc);
-      for (auto& arg : ops[op_index]->OutputArgumentNames()) {
-        if (new_vars.find(arg) == new_vars.end()) {
-          continue;
-        }
-        auto pname = FwdName(arg);
-        auto* param = block_desc->FindVarRecursive(pname);
-        auto* grad = block_desc->FindVar(arg);
-        if (param == nullptr) {
-          grad->SetDataType(DataType::FP32);
-        } else {
-          grad->SetDataType(param->GetDataType());
-        }
-      }
-      ops[op_index]->InferShape(*block_desc);
-    }
+    ops[op_index]->InferVarType(block_desc);
+    for (auto& arg : ops[op_index]->OutputArgumentNames()) {
+      if (new_vars.find(arg) == new_vars.end()) {
+        continue;
+      }
+      auto pname = FwdName(arg);
+      auto* param = block_desc->FindVarRecursive(pname);
+      auto* grad = block_desc->FindVar(arg);
+      if (param == nullptr) {
+        grad->SetDataType(DataType::FP32);
+      } else {
+        grad->SetDataType(param->GetDataType());
+      }
+    }
+    ops[op_index]->InferShape(*block_desc);
  }
}
...@@ -387,6 +396,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -387,6 +396,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
ProgramDescBind& program_desc, int block_idx, ProgramDescBind& program_desc, int block_idx,
std::unordered_set<std::string>* no_grad_vars, std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var) { std::unordered_map<std::string, std::string>* grad_to_var) {
VLOG(5) << "MakeBlockBackward";
BlockDescBind* cur_block = program_desc.MutableBlock(block_idx); BlockDescBind* cur_block = program_desc.MutableBlock(block_idx);
std::vector<OpDescBind*> op_descs = cur_block->AllOps(); std::vector<OpDescBind*> op_descs = cur_block->AllOps();
std::unordered_map<std::string, std::vector<size_t>> dup_out_ops; std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
...@@ -394,9 +404,10 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -394,9 +404,10 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
std::vector<std::unique_ptr<OpDescBind>> backward_descs; std::vector<std::unique_ptr<OpDescBind>> backward_descs;
for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
VLOG(5) << "Making backward " << (*it)->Type() << " op";
std::vector<std::unique_ptr<OpDescBind>> op_grads; std::vector<std::unique_ptr<OpDescBind>> op_grads;
if ((*it)->Type() == "recurrent") { if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
int step_block_idx = (*it)->GetBlockAttr("step_block"); int step_block_idx = (*it)->GetBlockAttr("step_block");
BlockDescBind* backward_block = CreateStepBlock( BlockDescBind* backward_block = CreateStepBlock(
program_desc, no_grad_vars, grad_to_var, step_block_idx); program_desc, no_grad_vars, grad_to_var, step_block_idx);
...@@ -410,6 +421,15 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -410,6 +421,15 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var); op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
} }
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
sout << "Made ";
for (auto& op_grad : op_grads) {
sout << op_grad->Type() << " ";
}
VLOG(10) << sout.str();
}
for (const auto& desc : op_grads) { for (const auto& desc : op_grads) {
for (const std::string& out_name : desc->OutputArgumentNames()) { for (const std::string& out_name : desc->OutputArgumentNames()) {
if (out_name.find("@GRAD") == std::string::npos) { if (out_name.find("@GRAD") == std::string::npos) {
...@@ -425,6 +445,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -425,6 +445,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs), op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
[](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); }); [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
} }
VLOG(5) << "Appending Sums";
// Check whether some variables are written more than once // Check whether some variables are written more than once
std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops; std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
for (const auto& dup : dup_out_ops) { for (const auto& dup : dup_out_ops) {
...@@ -432,16 +454,22 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -432,16 +454,22 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
const std::vector<size_t> dup_op = dup.second; const std::vector<size_t> dup_op = dup.second;
if (out_name != kEmptyVarName && dup_op.size() > 1) { if (out_name != kEmptyVarName && dup_op.size() > 1) {
std::vector<std::string> sum_op_inputs; std::vector<std::string> sum_op_inputs;
std::string next_g_name = out_name;
for (size_t i = 0; i < dup_op.size(); ++i) { for (size_t i = 0; i < dup_op.size(); ++i) {
VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
<< " duplicated";
std::string new_name = out_name + "@RENAME@" + std::to_string(i); std::string new_name = out_name + "@RENAME@" + std::to_string(i);
backward_descs[dup_op[i]]->Rename(out_name, new_name); backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
sum_op_inputs.emplace_back(new_name); sum_op_inputs.emplace_back(new_name);
next_g_name = sum_op_inputs.back();
} }
std::unique_ptr<OpDescBind> sum_op(new OpDescBind( std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
"sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {})); "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
} }
} }
pending_sum_ops.sort( pending_sum_ops.sort(
[](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a, [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) { const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
...@@ -452,6 +480,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -452,6 +480,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
std::move(p.second)); std::move(p.second));
} }
VLOG(5) << "MakeBlockBackward Finished";
return backward_descs; return backward_descs;
} }
......
@@ -29,6 +29,8 @@ inline DataType ToDataType(std::type_index type) {
    return DataType::INT32;
  } else if (typeid(int64_t).hash_code() == type.hash_code()) {
    return DataType::INT64;
+  } else if (typeid(bool).hash_code() == type.hash_code()) {
+    return DataType::BOOL;
  } else {
    PADDLE_THROW("Not supported");
  }
......
@@ -60,8 +60,7 @@ void make_ddim(DDim& ddim, const int64_t* dims, int n) {
      ddim = make_dim<9>(dims);
      break;
    default:
-      throw std::invalid_argument(
-          "Dynamic dimensions must have between [1, 9] dimensions.");
+      PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
  }
}
......
@@ -120,6 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
  for (auto& op_desc : block.AllOps()) {
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
+    VLOG(10) << op->DebugString();
    op->Run(*local_scope, *device);
  }
  if (create_local_scope) {
......
@@ -235,6 +235,23 @@ void OpDescBind::Rename(const std::string &old_name,
  need_update_ = true;
}

+void OpDescBind::RenameOutput(const std::string &old_name,
+                              const std::string &new_name) {
+  for (auto &output : outputs_) {
+    std::replace(output.second.begin(), output.second.end(), old_name,
+                 new_name);
+  }
+  need_update_ = true;
+}
+
+void OpDescBind::RenameInput(const std::string &old_name,
+                             const std::string &new_name) {
+  for (auto &input : inputs_) {
+    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
+  }
+  need_update_ = true;
+}
+
struct SetAttrDescVisitor : public boost::static_visitor<void> {
  explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
  mutable OpDesc::Attr *attr_;
@@ -448,7 +465,12 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
  auto var = block_.FindVarRecursive(name);
  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
-  return framework::make_ddim(var->Shape());
+  try {
+    return framework::make_ddim(var->Shape());
+  } catch (...) {
+    VLOG(5) << "GetDim of variable " << name << " error";
+    std::rethrow_exception(std::current_exception());
+  }
}

void CompileTimeInferShapeContext::SetDim(const std::string &name,
......
@@ -73,6 +73,10 @@ class OpDescBind {
  void Rename(const std::string &old_name, const std::string &new_name);

+  void RenameOutput(const std::string &old_name, const std::string &new_name);
+
+  void RenameInput(const std::string &old_name, const std::string &new_name);
+
  // Only be used in C++
  const AttributeMap &GetAttrMap() const;
......
@@ -403,19 +403,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
void OperatorWithKernel::Run(const Scope& scope,
                             const platform::DeviceContext& dev_ctx) const {
-  if (VLOG_IS_ON(1)) {
-    auto inputs = this->InputVars();
-    auto outputs = this->OutputVars(true);
-    std::ostringstream sout;
-    sout << "Run operator " << this->Type() << " From [";
-    std::ostream_iterator<std::string> out_it(sout, ",");
-    std::copy(inputs.begin(), inputs.end(), out_it);
-    sout << "] to [";
-    std::copy(outputs.begin(), outputs.end(), out_it);
-    sout << "]";
-    VLOG(1) << sout.str();
-  }
  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
  this->InferShape(&infer_shape_ctx);
......
@@ -38,11 +38,12 @@ Scope& Scope::NewScope() const {
Variable* Scope::Var(const std::string& name) {
  auto iter = vars_.find(name);
  if (iter != vars_.end()) {
+    VLOG(3) << "Get existing variable " << name;
    return iter->second;
  }
  Variable* v = new Variable();
  vars_[name] = v;
-  VLOG(3) << "Create variable " << name << " on scope";
+  VLOG(3) << "Create variable " << name;
  v->name_ = &(vars_.find(name)->first);
  return v;
}
......
@@ -53,6 +53,10 @@ class InferShapeContext {
  virtual bool IsRuntime() const = 0;

+  // Note: In while op, we need this to be public
+  void SetDims(const std::vector<std::string> &names,
+               const std::vector<framework::DDim> &dims);
+
 protected:
  virtual framework::DDim GetDim(const std::string &name) const = 0;
  virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
@@ -60,9 +64,6 @@ class InferShapeContext {
  std::vector<framework::DDim> GetDims(
      const std::vector<std::string> &names) const;

-  void SetDims(const std::vector<std::string> &names,
-               const std::vector<framework::DDim> &dims);
-
  std::vector<VarDesc::VarType> GetVarTypes(
      const std::vector<std::string> &names) const;
......
@@ -73,7 +73,6 @@ if(MOBILE_INFERENCE)
    list(REMOVE_ITEM GSERVER_SOURCES
        dataproviders/DataProvider.cpp
        dataproviders/MultiDataProvider.cpp
-        dataproviders/ProtoDataProvider.cpp
        dataproviders/PyDataProvider2.cpp
        dataproviders/PyDataProvider.cpp)
......
@@ -16,8 +16,8 @@ limitations under the License. */
#include <unistd.h>
#include <algorithm>

-#include "ProtoDataProvider.h"
#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
@@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config,
REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
-REGISTER_DATA_PROVIDER(proto, ProtoDataProvider);
-REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider);

int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
  int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "ProtoDataProvider.h"
#include <algorithm>
#include <fstream>
#include <istream>
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
#include "DataProviderGroup.h"
#include "paddle/utils/Logging.h"
DEFINE_double(memory_threshold_on_load_data,
1.0,
"stop loading data when memory is not sufficient");
namespace paddle {
REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup<ProtoDataProvider>);
REGISTER_DATA_PROVIDER(proto_sequence_group,
DataProviderGroup<ProtoSequenceDataProvider>);
ProtoDataProvider::ProtoDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll)
: DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) {
if (loadDataAll) {
loadData(config_.files());
}
}
void ProtoDataProvider::loadData(const std::vector<std::string>& fileList) {
for (auto& file : fileList) {
if (FLAGS_memory_threshold_on_load_data < 1.0) {
double memUsage = getMemoryUsage();
if (memUsage > FLAGS_memory_threshold_on_load_data) {
LOG(INFO) << "memUsage is " << memUsage << ", > "
<< FLAGS_memory_threshold_on_load_data
<< " therefore SKIP ALL REMAINING file.";
break;
}
}
LOG(INFO) << "load data file " << file;
loadDataFile(file);
}
if (sequenceStartPositions_.size() == sampleNums_) {
// This means that each sample is one sequence
shuffledSequenceIds_.swap(sequenceStartPositions_);
} else {
sequenceStartPositions_.push_back(sampleNums_);
shuffledSequenceIds_.reserve(sequenceStartPositions_.size() - 1);
for (size_t i = 0; i < sequenceStartPositions_.size() - 1; ++i) {
shuffledSequenceIds_.push_back(i);
}
}
LOG(INFO) << "read done, num of instance=" << sampleNums_;
showDataStats();
}
void ProtoDataProvider::loadData(const std::string& fileName) {
std::vector<std::string> fileList;
loadFileList(fileName, fileList);
loadData(fileList);
}
void ProtoDataProvider::checkDataHeader(const DataHeader& header) {
if (header_.slot_defs_size()) {
// header_ is already set. Need to check consistency.
CHECK_EQ(header_.slot_defs_size(), header.slot_defs_size())
<< "Different header";
for (int i = 0; i < header.slot_defs_size(); ++i) {
CHECK_EQ(header_.slot_defs(i).type(), header.slot_defs(i).type());
CHECK_EQ(header_.slot_defs(i).dim(), header.slot_defs(i).dim());
}
return;
}
// header_ is not set before
CHECK(header.slot_defs_size()) << "Invalid header: no slot is defined";
int i;
for (i = 0; i < header.slot_defs_size(); ++i) {
if (header.slot_defs(i).type() == SlotDef::INDEX ||
header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX) {
break;
}
constexpr int kBufLen = 100;
char buf[kBufLen];
snprintf(buf, kBufLen, "slot%d_nnz", i);
nnzStats_.push_back(getStat(buf));
}
numVecSlots_ = i;
// Check that INDEX slots are after VECTOR slots
for (int i = numVecSlots_; i < header.slot_defs_size(); ++i) {
CHECK(header.slot_defs(i).type() == SlotDef::INDEX ||
header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX);
}
slots_.clear();
slots_.reserve(header.slot_defs_size());
for (int i = 0; i < header.slot_defs_size(); ++i) {
slots_.emplace_back();
slots_.back().type = header.slot_defs(i).type();
slots_.back().dim = header.slot_defs(i).dim();
if (SlotDef::VECTOR_SPARSE_NON_VALUE == header.slot_defs(i).type() ||
SlotDef::VECTOR_SPARSE_VALUE == header.slot_defs(i).type()) {
slots_.back().indices.push_back(0);
}
}
header_ = header;
}
void ProtoDataProvider::checkSample(const DataSample& sample) {
CHECK_EQ(numVecSlots_, sample.vector_slots_size());
CHECK(header_.slot_defs_size() == numVecSlots_ + sample.id_slots_size() ||
header_.slot_defs_size() == numVecSlots_ + sample.var_id_slots_size());
for (int i = 0; i < numVecSlots_; ++i) {
uint32_t dim = header_.slot_defs(i).dim();
switch (header_.slot_defs(i).type()) {
case SlotDef::VECTOR_DENSE: {
CHECK_EQ(static_cast<int>(dim), sample.vector_slots(i).values_size());
CHECK_EQ(0, sample.vector_slots(i).ids_size());
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
if (0 == sample.vector_slots(i).ids_size()) {
break;
}
CHECK_LT(0, sample.vector_slots(i).ids_size());
CHECK_EQ(0, sample.vector_slots(i).values_size());
auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
sample.vector_slots(i).ids().end());
CHECK_GT(dim, maxId);
break;
}
case SlotDef::VECTOR_SPARSE_VALUE: {
if (0 == sample.vector_slots(i).ids_size()) {
CHECK_EQ(0, sample.vector_slots(i).values_size());
break;
}
CHECK_LT(0, sample.vector_slots(i).values_size());
CHECK_GE(static_cast<int>(dim), sample.vector_slots(i).values_size());
CHECK_EQ(sample.vector_slots(i).values_size(),
sample.vector_slots(i).ids_size());
auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
sample.vector_slots(i).ids().end());
CHECK_GT(dim, maxId);
break;
}
case SlotDef::VAR_MDIM_DENSE: {
if (static_cast<int>(dim) != 0) {
CHECK_EQ(static_cast<int>(dim), sample.vector_slots(i).values_size());
if (sample.vector_slots(i).dims_size() != 0) {
int totalDim = sample.vector_slots(i).dims(0);
for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
totalDim *= sample.vector_slots(i).dims(j);
}
CHECK_EQ(static_cast<int>(dim), totalDim);
}
} else {
CHECK_NE(sample.vector_slots(i).dims_size(), 0);
int totalDim = sample.vector_slots(i).dims(0);
for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
totalDim *= sample.vector_slots(i).dims(j);
}
CHECK_EQ(totalDim, sample.vector_slots(i).values_size());
}
break;
}
case SlotDef::STRING: {
CHECK_EQ(static_cast<int>(1), sample.vector_slots(i).strs_size());
CHECK_EQ(0, sample.vector_slots(i).ids_size());
CHECK_EQ(0, sample.vector_slots(i).values_size());
break;
}
default:
LOG(FATAL) << "BUG: Should not reach here";
}
}
for (int i = numVecSlots_; i < header_.slot_defs_size(); ++i) {
if (header_.slot_defs(i).type() != SlotDef::VAR_MDIM_INDEX) {
uint32_t id = sample.id_slots(i - numVecSlots_);
if (id == -1U) continue;
CHECK_LT(id, header_.slot_defs(i).dim());
} else {
for (int j = 0; j < sample.var_id_slots(i - numVecSlots_).ids_size();
++j) {
uint32_t id = sample.var_id_slots(i - numVecSlots_).ids(j);
CHECK_LT(id, header_.slot_defs(i).dim());
}
}
}
}
void ProtoDataProvider::loadDataFile(const std::string& fileName) {
std::ifstream is(fileName);
CHECK(is) << "Fail to open " << fileName;
bool dataCompression = str::endsWith(fileName, ".gz");
std::unique_ptr<ProtoReader> reader(new ProtoReader(&is, dataCompression));
CHECK(reader) << "Fail to create proto data input stream";
DataHeader header;
CHECK(reader->read(&header));
checkDataHeader(header);
DataSample sample;
do {
if (!reader->read(&sample)) {
break;
}
checkSample(sample);
if (sample.is_beginning()) {
sequenceStartPositions_.push_back(sampleNums_);
}
fillSlots(sample);
++sampleNums_;
} while (true);
CHECK(is.eof()) << "Fail to read file";
reader.reset(nullptr);
is.close();
}
// checkSample has done before, no check here
void ProtoDataProvider::fillSlots(const DataSample& sample) {
for (size_t i = 0; i < slots_.size(); ++i) {
auto& slot = slots_[i];
int dim = slot.dim;
switch (slot.type) {
case SlotDef::VECTOR_DENSE: {
size_t oldSize = slot.denseData.size();
slot.denseData.resize(oldSize + dim);
const float* values = sample.vector_slots(i).values().data();
#ifdef PADDLE_TYPE_DOUBLE
std::copy(values, values + dim, slot.denseData.begin() + oldSize);
#else
memcpy(slot.denseData.data() + oldSize, values, sizeof(real) * dim);
#endif
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
int slotSize = sample.vector_slots(i).ids_size();
int subSlotSize = 0;
int id = 0; // the slot id
// find whether this vector_slots has subseq. If not has subseq,
// subSlotSize = 0.
for (id = 0; id < sample.subseq_slots_size(); id++) {
if (sample.subseq_slots(id).slot_id() == i) {
subSlotSize = sample.subseq_slots(id).lens_size();
break;
}
}
if (subSlotSize && slot.subIndices.size() == 0UL) {
// If has subSeq, the first element of subIndices = 0.
slot.subIndices.push_back(0);
}
if (slotSize == 0UL) {
// if has no id, new indices = old indices.
slot.indices.push_back(slot.indices.back());
// if has subSeq, new subIndices = old subIndices.
if (slot.subIndices.size()) {
slot.subIndices.push_back(slot.subIndices.back());
}
break;
}
slot.sparseNonValueData.resize(slot.indices.back() + slotSize);
const unsigned int* ids = sample.vector_slots(i).ids().data();
memcpy(slot.sparseNonValueData.data() + slot.indices.back(),
ids,
sizeof(*ids) * slotSize);
slot.indices.push_back(slot.indices.back() + slotSize);
if (subSlotSize) {
for (int ii = 0; ii < subSlotSize; ++ii) {
slot.subIndices.push_back(slot.subIndices.back() +
sample.subseq_slots(id).lens(ii));
}
}
break;
}
case SlotDef::VECTOR_SPARSE_VALUE: {
if (0 == sample.vector_slots(i).ids_size()) {
slot.indices.push_back(slot.indices.back());
break;
}
int slotSize = sample.vector_slots(i).ids_size();
slot.sparseFloatValueData.resize(slot.indices.back() + slotSize);
const unsigned int* ids = sample.vector_slots(i).ids().data();
const float* values = sample.vector_slots(i).values().data();
for (int ii = 0; ii < slotSize; ++ii) {
slot.sparseFloatValueData[slot.indices.back() + ii].col = ids[ii];
slot.sparseFloatValueData[slot.indices.back() + ii].value =
values[ii];
}
slot.indices.push_back(slot.indices.back() + slotSize);
break;
}
case SlotDef::INDEX: {
slot.indexData.push_back(sample.id_slots(i - numVecSlots_));
break;
}
case SlotDef::VAR_MDIM_DENSE: {
size_t oldSize = slot.varDenseData.size();
slot.varDenseData.resize(oldSize + 1);
size_t varDim = sample.vector_slots(i).values_size();
slot.varDenseData[oldSize].data.resize(varDim);
const float* values = sample.vector_slots(i).values().data();
#ifdef PADDLE_TYPE_DOUBLE
std::copy(
values, values + varDim, slot.varDenseData[oldSize].data.data());
#else
memcpy(slot.varDenseData[oldSize].data.data(),
values,
sizeof(real) * varDim);
#endif
slot.varDenseData[oldSize].dims.resize(
sample.vector_slots(i).dims_size());
memcpy(slot.varDenseData[oldSize].dims.data(),
sample.vector_slots(i).dims().data(),
sizeof(uint32_t) * sample.vector_slots(i).dims_size());
break;
}
case SlotDef::VAR_MDIM_INDEX: {
size_t oldSize = slot.varIndices.size();
slot.varIndices.resize(oldSize + 1);
size_t varDim = sample.var_id_slots(i - numVecSlots_).ids_size();
slot.varIndices[oldSize].resize(varDim);
memcpy(slot.varIndices[oldSize].data(),
sample.var_id_slots(i - numVecSlots_).ids().data(),
sizeof(uint32_t) * varDim);
break;
}
case SlotDef::STRING: {
slot.strData.push_back(sample.vector_slots(i).strs(0));
break;
}
}
}
}
void ProtoDataProvider::showDataStats() {
std::ostringstream oss;
for (size_t i = 0; i < slots_.size(); ++i) {
auto& slot = slots_[i];
if (slot.type == SlotDef::VECTOR_SPARSE_NON_VALUE) {
size_t nnz = slot.sparseNonValueData.size();
oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
} else if (slot.type == SlotDef::VECTOR_SPARSE_VALUE) {
size_t nnz = slot.sparseFloatValueData.size();
oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
}
}
LOG(INFO) << oss.str();
}
void ProtoDataProvider::reset() {
currentSequenceIndex_ = 0;
if (!skipShuffle_) {
shuffle();
}
DataProvider::reset();
}
void ProtoDataProvider::shuffle() {
std::shuffle(shuffledSequenceIds_.begin(),
shuffledSequenceIds_.end(),
ThreadLocalRandomEngine::get());
}
/*
Loop through sequences starting from currentSequenceIndex_
for at most size samples. For each sequence ranging from [begin, end),
op(begin, end) will be called.
return the number of sequences scanned
*/
template <class Op>
int64_t ProtoDataProvider::sequenceLoop(Op op, int64_t size) {
int64_t sz = 0;
size_t i;
size_t sequenceCount = shuffledSequenceIds_.size();
if (usageRatio_ < 1.0f) {
sequenceCount = static_cast<int64_t>(sequenceCount * usageRatio_);
}
for (i = currentSequenceIndex_; i < sequenceCount; ++i) {
size_t id = shuffledSequenceIds_[i];
int64_t begin = sequenceStartPositions_[id];
int64_t end = sequenceStartPositions_[id + 1];
int64_t len = end - begin;
if (sz + len > size && sz > 0) break;
sz += len;
op(begin, end);
}
return i - currentSequenceIndex_;
}
/*
Loop through sequences starting from currentSequenceIndex_
for at most size samples. For each sample of each sequence at position
pos, op(pos) will be called.
return the number of sequences scanned
*/
template <class Op>
int64_t ProtoDataProvider::sampleLoop(Op op, int64_t size) {
if (iidData()) {
size = std::min<int64_t>(sampleNums_ - currentSequenceIndex_, size);
for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
++i) {
size_t pos = shuffledSequenceIds_[i];
op(pos);
}
return size;
} else {
auto f = [op](int64_t begin, int64_t end) {
for (int64_t pos = begin; pos < end; ++pos) {
op(pos);
}
};
return sequenceLoop(f, size);
}
}
/*
Loop through sub-sequences starting from currentSequenceIndex_
for at most size samples. For each sample of each sub-sequence at position
pos, op(pos) will be called.
return the number of sub-sequences scanned
*/
template <class Op>
int64_t ProtoDataProvider::subSampleLoop(Op op, int64_t size, int slot) {
CHECK(iidData()) << "subSampleLoop only accepts iid data";
size = std::min<int64_t>(sampleNums_ - currentSequenceIndex_, size);
int subSize = 0;
for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
++i) {
size_t pos = shuffledSequenceIds_[i];
int64_t* indexs = slots_[slot].indices.data();
int64_t* subIndexs = slots_[slot].subIndices.data();
int64_t subSeqStart = 0;
int64_t subSeqEnd = 0;
for (int j = 0; j < (int)slots_[slot].subIndices.size(); j++) {
if (subIndexs[j] == indexs[pos]) {
subSeqStart = j;
if (subIndexs[pos] == subIndexs[pos + 1]) {
subSeqEnd = j + 1;
break;
}
} else if (subIndexs[j] == indexs[pos + 1]) {
subSeqEnd = j;
break;
}
}
for (int j = subSeqStart; j < subSeqEnd; j++) {
op(j);
}
subSize += subSeqEnd - subSeqStart;
}
return subSize;
}
int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
DataBatch* batch) {
int64_t numSequences = 0; // actual number of sequences in the batch
// the number of sequences scanned, including those skipped because too long
int64_t numScannedSeqs = 0;
std::lock_guard<RWLock> guard(lock_);
if (iidData()) {
size = std::min<int64_t>(getSize() - currentSequenceIndex_, size);
numScannedSeqs = numSequences = size;
} else {
int64_t sz = 0;
auto op = [&sz, &numSequences](int64_t begin, int64_t end) {
++numSequences;
sz += end - begin;
};
numScannedSeqs = sequenceLoop(op, size);
VLOG_IF(1, numScannedSeqs > numSequences)
<< numScannedSeqs - numSequences
<< " sequences are skipped because longer than " << size;
size = sz;
}
if (size <= 0) return 0;
DataBatch& cpuBatch = *cpuBatch_;
std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
cpuBatch.setSize(size);
cpuArguments.resize(header_.slot_defs_size());
if (!iidData()) {
ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions,
numSequences + 1,
/* useGpu= */ false);
int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false);
int pos = 0;
int i = 0;
auto op = [buf, &pos, &i](int64_t begin, int64_t end) {
buf[i] = pos;
pos += end - begin;
++i;
};
sequenceLoop(op, size);
buf[i] = size;
for (size_t slot = 1; slot < cpuArguments.size(); ++slot) {
cpuArguments[slot].sequenceStartPositions =
cpuArguments[0].sequenceStartPositions;
}
}
for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
size_t dim = header_.slot_defs(slot).dim();
SlotDef::SlotType slotType = header_.slot_defs(slot).type();
std::vector<int64_t> dataPos;
dataPos.reserve(size);
auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
sampleLoop(op, size);
switch (slotType) {
case SlotDef::VECTOR_DENSE: {
Matrix::resizeOrCreate(cpuArguments[slot].value,
size,
dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
for (int i = 0; i < size; ++i) {
memcpy(buf + i * dim,
slots_[slot].denseData.data() + dataPos[i] * dim,
sizeof(real) * dim);
}
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
if (!(cpuArguments[slot].value)) {
cpuArguments[slot].value =
Matrix::createSparseMatrix(size,
dim,
size /*DEFAULT_AVG_WIDTH = 1*/,
NO_VALUE,
SPARSE_CSR,
false,
useGpu_);
}
auto mat = cpuArguments[slot].value;
mat->resize(size, dim);
if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
dataPos.data(),
slots_[slot].indices.data(),
slots_[slot].sparseNonValueData.data(),
HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
dataPos.data(),
slots_[slot].indices.data(),
slots_[slot].sparseNonValueData.data());
} else {
LOG(FATAL) << "Not Supported";
}
size_t numElements = 0;
for (auto pos : dataPos) {
numElements +=
slots_[slot].indices[pos + 1] - slots_[slot].indices[pos];
}
nnzStats_[slot]->addSample(numElements);
break;
}
case SlotDef::VECTOR_SPARSE_VALUE: {
if (!(cpuArguments[slot].value)) {
cpuArguments[slot].value =
Matrix::createSparseMatrix(size,
dim,
size /*DEFAULT_AVG_WIDTH = 1*/,
FLOAT_VALUE,
SPARSE_CSR,
false,
useGpu_);
}
auto mat = cpuArguments[slot].value;
mat->resize(size, dim);
if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
dataPos.data(),
slots_[slot].indices.data(),
slots_[slot].sparseFloatValueData.data(),
HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
dataPos.data(),
slots_[slot].indices.data(),
slots_[slot].sparseFloatValueData.data());
} else {
LOG(FATAL) << "Not Supported";
}
break;
}
case SlotDef::INDEX: {
IVector::resizeOrCreate(cpuArguments[slot].ids,
size,
/* useGpu= */ false);
int* buf = cpuArguments[slot].ids->getData();
for (int i = 0; i < size; ++i) {
buf[i] = slots_[slot].indexData[dataPos[i]];
}
break;
}
case SlotDef::VAR_MDIM_DENSE: {
CHECK_EQ(size, 1);
auto mat = cpuArguments[slot].value;
size_t totalDim = slots_[slot].varDenseData[dataPos[0]].data.size();
CHECK_EQ(slots_[slot].varDenseData[dataPos[0]].dims.size(), size_t(3));
size_t height, width, depth, oldWidth;
/* dims[2] is depth, will be changed to dims[0] in future */
depth = slots_[slot].varDenseData[dataPos[0]].dims[2];
height = slots_[slot].varDenseData[dataPos[0]].dims[1];
width = slots_[slot].varDenseData[dataPos[0]].dims[0];
oldWidth = width;
        /* pad with zeros when the sample's width is smaller than its height */
if (oldWidth < height) {
width = height;
}
cpuArguments[slot].setFrameHeight(height);
cpuArguments[slot].setFrameWidth(width);
if (oldWidth < height) {
totalDim = width * height * depth;
}
Matrix::resizeOrCreate(cpuArguments[slot].value,
size,
totalDim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
cpuArguments[slot].value->zeroMem();
if (oldWidth < height) {
real* srcBuf = slots_[slot].varDenseData[dataPos[0]].data.data();
for (size_t i = 0; i < depth; i++) {
for (size_t j = 0; j < height; j++) {
for (size_t k = 0; k < oldWidth; k++) {
buf[i * height * width + j * width + k] =
srcBuf[i * height * oldWidth + j * oldWidth + k];
}
}
}
} else {
memcpy(buf,
slots_[slot].varDenseData[dataPos[0]].data.data(),
sizeof(real) * totalDim);
}
ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
size + 1, /* size == 1 currently */
/* useGpu= */ false);
int* bufStarts =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
bufStarts[0] = 0;
bufStarts[1] = 1;
break;
}
case SlotDef::VAR_MDIM_INDEX: {
CHECK_EQ(size, 1);
size_t totalDim = slots_[slot].varIndices[dataPos[0]].size();
IVector::resizeOrCreate(cpuArguments[slot].ids,
totalDim,
/* useGpu= */ false);
int* buf = cpuArguments[slot].ids->getData();
memcpy(buf,
slots_[slot].varIndices[dataPos[0]].data(),
sizeof(int) * totalDim);
ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
size + 1, /* size == 1 currently */
/* useGpu= */ false);
int* bufStarts =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
bufStarts[0] = 0;
        /* we expand the convolutional feature map into sequence data,
         * so there should be corresponding sequence labels */
bufStarts[1] = totalDim;
break;
}
case SlotDef::STRING: {
if (cpuArguments[slot].strs) {
cpuArguments[slot].strs->resize(size);
} else {
cpuArguments[slot].strs =
std::make_shared<std::vector<std::string>>(size);
}
for (int i = 0; i < size; ++i) {
(*cpuArguments[slot].strs)[i] = slots_[slot].strData[dataPos[i]];
}
break;
}
}
}
if (useGpu_) {
std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
DataBatch& gpuBatch = *gpuBatch_;
std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size);
for (int i = 0; i < header_.slot_defs_size(); ++i) {
SlotDef::SlotType slotType = header_.slot_defs(i).type();
if (SlotDef::VECTOR_SPARSE_VALUE == slotType ||
SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) {
gpuArguments[i] = cpuArguments[i];
gpuArguments[i].sequenceStartPositions =
cpuArguments[i].sequenceStartPositions;
} else {
gpuArguments[i].resizeAndCopyFrom(
cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
}
hl_stream_synchronize(HPPL_STREAM_1);
*batch = gpuBatch;
} else {
*batch = cpuBatch;
}
currentSequenceIndex_ += numScannedSeqs;
return batch->getSize();
}
ProtoSequenceDataProvider::ProtoSequenceDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll)
: ProtoDataProvider(config, useGpu, loadDataAll) {}
int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
DataBatch* batch) {
CHECK(iidData()) << "ProtoSequenceDataProvider only accepts iid data";
int64_t numSequences = 0; // actual number of sequences in the batch
// the number of sequences scanned, including those skipped because too long
int64_t numScannedSeqs = 0;
std::lock_guard<RWLock> guard(lock_);
size = std::min<int64_t>(getSize() - currentSequenceIndex_, size);
numScannedSeqs = numSequences = size;
if (size <= 0) return 0;
DataBatch& cpuBatch = *cpuBatch_;
std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
cpuBatch.setSize(size);
cpuArguments.resize(header_.slot_defs_size());
for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
SlotDef::SlotType slotType = header_.slot_defs(slot).type();
std::vector<int64_t> dataPos;
dataPos.reserve(size);
auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
sampleLoop(op, size);
// current slot: sequenceStartPositions
ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
size + 1,
/* useGpu= */ false);
switch (slotType) {
case SlotDef::VECTOR_SPARSE_VALUE:
case SlotDef::VAR_MDIM_DENSE:
case SlotDef::VAR_MDIM_INDEX: {
LOG(FATAL) << "ProtoSequenceDataProvider only support"
<< " VECTOR_DENSE, VECTOR_SPARSE_NON_VALUE and INDEX slots";
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
// copy to IDS, not value
// pointers used in current slot
sparse_non_value_t* data = slots_[slot].sparseNonValueData.data();
int64_t* indexs = slots_[slot].indices.data();
int64_t* seqs = dataPos.data();
        // current slot: we need size instances; what is the total length?
int totalFeatureInCurrentSlot = 0;
for (int ins = 0; ins < size; ins++) {
int64_t currInsId = seqs[ins];
totalFeatureInCurrentSlot +=
indexs[currInsId + 1] - indexs[currInsId];
// special: if current instance has NO feature in current slot
if (indexs[currInsId + 1] == indexs[currInsId]) {
totalFeatureInCurrentSlot++;
}
}
// done
// current slot: ids
IVector::resizeOrCreate(cpuArguments[slot].ids,
totalFeatureInCurrentSlot,
/* useGpu= */ false);
// where to write
int* currPosOfArgumentId = cpuArguments[slot].ids->getData();
int* currPosOfArgumentSeqStart =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
int allSequenceLength = 0;
currPosOfArgumentSeqStart[0] = 0;
// for each instance, copy data and fill sequence positions
for (int instance = 0; instance < size; instance++) {
int64_t currInstanceId = seqs[instance];
int64_t currInstanceLength =
indexs[currInstanceId + 1] - indexs[currInstanceId];
sparse_non_value_t* currInstanceData = data + indexs[currInstanceId];
// write sequenceStartPositions
allSequenceLength += currInstanceLength;
currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
// copy features
for (int featCopier = 0; featCopier < currInstanceLength;
featCopier++) {
currPosOfArgumentId[featCopier] = currInstanceData[featCopier].col;
}
currPosOfArgumentId += currInstanceLength;
// special: if current instance has NO feature in current slot
if (currInstanceLength == 0) {
allSequenceLength++;
currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
currPosOfArgumentId[0] = -1;
currPosOfArgumentId++;
}
// done
}
if (slots_[slot].subIndices.size()) {
std::vector<int64_t> dataSubPos;
auto op = [this, &dataSubPos](int64_t pos) {
dataSubPos.push_back(pos);
};
int subSize = subSampleLoop(op, size, slot);
ICpuGpuVector::resizeOrCreate(
cpuArguments[slot].subSequenceStartPositions, subSize + 1, false);
int* currPosOfArgumentSubSeqStart =
cpuArguments[slot].subSequenceStartPositions->getMutableData(
false);
int64_t* subSeqs = dataSubPos.data();
int64_t* subIndexs = slots_[slot].subIndices.data();
int allSubSequenceLength = 0;
currPosOfArgumentSubSeqStart[0] = 0;
// for each instance, compute sub-sequence number
for (int instance = 0; instance < subSize; instance++) {
int64_t currSubInstanceId = subSeqs[instance];
int64_t currSubInstanceLength =
subIndexs[currSubInstanceId + 1] - subIndexs[currSubInstanceId];
// write subSequenceStartPositions
allSubSequenceLength += currSubInstanceLength;
currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
// special: if current instance has NO feature in current slot
if (currSubInstanceLength == 0) {
allSubSequenceLength++;
currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
}
}
cpuArguments[slot].checkSubset();
}
break;
}
case SlotDef::INDEX: {
// label slot
IVector::resizeOrCreate(cpuArguments[slot].ids,
size,
/* useGpu= */ false);
// fill labels
int* buf = cpuArguments[slot].ids->getData();
for (int i = 0; i < size; ++i) {
buf[i] = slots_[slot].indexData[dataPos[i]];
}
// label HAS sequence structure
cpuArguments[slot].sequenceStartPositions->fillSequence(false);
break;
}
case SlotDef::VECTOR_DENSE: {
// copy values
size_t dim = header_.slot_defs(slot).dim();
Matrix::resizeOrCreate(cpuArguments[slot].value,
size,
dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
for (int i = 0; i < size; ++i) {
memcpy(buf + i * dim,
slots_[slot].denseData.data() + dataPos[i] * dim,
sizeof(real) * dim);
}
// sequence structure
cpuArguments[slot].sequenceStartPositions->fillSequence(false);
break;
}
default: { LOG(FATAL) << "should not reach here"; }
}
}
if (useGpu_) {
std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
DataBatch& gpuBatch = *gpuBatch_;
std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size);
for (size_t i = 0; i < cpuArguments.size(); ++i) {
gpuArguments[i].resizeAndCopyFrom(
cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
*batch = gpuBatch;
} else {
*batch = cpuBatch;
}
currentSequenceIndex_ += numScannedSeqs;
return batch->getSize();
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "DataFormat.pb.h"
#include "paddle/utils/Stat.h"
#include "DataProvider.h"
#include "ProtoReader.h"
namespace paddle {
/**
 * @brief Provide data from a protobuf data file, with each sample
* specified by proto message
*
* DataSample defined in DataFormat.proto.
*
* The file format is
*
* header
*
* sample1
*
* sample2
*
* ...
*
* sampleN
*
 * @note: In the data file, each message is prefixed with its length.
 *        The read/write of the protobuf messages is implemented in ProtoReader.h
*/
class ProtoDataProvider : public DataProvider {
public:
ProtoDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll = true);
virtual void reset();
/**
* @note this size includes the sequences which are skipped because they
* are longer than the batch size.
*/
virtual int64_t getSize() {
int64_t size = sampleNums_;
if (usageRatio_ < 1.0f) {
size = static_cast<int64_t>(size * usageRatio_);
}
return size;
}
virtual void shuffle();
void loadData(const std::vector<std::string>& fileList);
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
protected:
/**
   * @brief load protobuf data from a list of files
* @param[in] fileName file name of a file which contains
* a list of file names
*/
void loadData(const std::string& fileName);
/**
* @brief load protobuf data from file
* @param[in] fileName data file name
*/
void loadDataFile(const std::string& fileName);
/** @brief check data header of each data sample
* @param[in] header data header read from protobuf data
*/
void checkDataHeader(const DataHeader& header);
/**
   * @brief fill protobuf data into slots_;
   *        slots_ is a vector of ProtoSlot in memory.
* @param[in] sample data sample read from protobuf data
*/
void fillSlots(const DataSample& sample);
/**
* @brief return true if each sample is one sequence, i.e., independent
* of other samples.
*/
inline bool iidData() const { return sequenceStartPositions_.empty(); }
/**
* @brief check that sample is consistent with header_
*/
void checkSample(const DataSample& sample);
template <class Op>
int64_t sequenceLoop(Op op, int64_t size);
template <class Op>
int64_t sampleLoop(Op op, int64_t size);
template <class Op>
int64_t subSampleLoop(Op op, int64_t size, int slot);
void showDataStats();
protected:
struct ProtoVarSlot {
std::vector<real> data;
std::vector<int> dims;
};
struct ProtoSlot {
SlotDef::SlotType type;
int dim;
std::vector<int> indexData;
std::vector<real> denseData;
std::vector<sparse_non_value_t> sparseNonValueData;
std::vector<sparse_float_value_t> sparseFloatValueData;
std::vector<int64_t> indices;
std::vector<int64_t> subIndices;
std::vector<ProtoVarSlot> varDenseData;
std::vector<std::vector<int>> varIndices;
std::vector<std::string> strData;
};
DataHeader header_;
int numVecSlots_;
std::vector<ProtoSlot> slots_;
size_t sampleNums_;
/**
* The starting position of each sequence in samples.
   * The last element should be the number of samples.
* If empty, each sample is one sequence.
*/
std::vector<size_t> sequenceStartPositions_;
int64_t currentSequenceIndex_;
// The size should be the number of sequences.
std::vector<size_t> shuffledSequenceIds_;
ThreadLocalD<DataBatch> cpuBatch_;
ThreadLocalD<DataBatch> gpuBatch_;
RWLock lock_;
  std::vector<StatPtr> nnzStats_;  // stats for the number of non-zero entries
};
/**
 * @brief Special use for Proto data: instances should contain
 *        sparse-non-value slots and a label.
*
* @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
*/
class ProtoSequenceDataProvider : public ProtoDataProvider {
public:
ProtoSequenceDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll = true);
~ProtoSequenceDataProvider() {}
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
};
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Layer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
/**
* @brief A layer for computing the dot product of two vectors.
* Input1: vector (batchSize * dim)
* Input2: vector (batchSize * dim)
* Output: a matrix: (batchSize * 1)
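 * i.e. Output[i][0] = \sum_k Input1[i][k] * Input2[i][k] for each row i.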
*/
class DotProdLayer : public Layer {
public:
explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
~DotProdLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
};
REGISTER_LAYER(dot_prod, DotProdLayer);
bool DotProdLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK_EQ(inputLayers_.size(), 2U);
CHECK_EQ(1UL, getSize())
<< "The output dimensionality of this layer should be fixed to 1.";
return true;
}
void DotProdLayer::forward(PassType passType) {
Layer::forward(passType);
MatrixPtr inV0 = getInputValue(0);
MatrixPtr inV1 = getInputValue(1);
size_t batchSize = inV0->getHeight();
CHECK_EQ(inV1->getHeight(), batchSize);
CHECK_EQ(inV0->getWidth(), inV1->getWidth());
{
REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
reserveOutput(batchSize, 1);
}
MatrixPtr outV = getOutputValue();
{
REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
outV->sumOfProducts(*inV0, *inV1, 1, 0);
}
}
void DotProdLayer::backward(const UpdateCallback& callback) {
MatrixPtr inV0 = getInputValue(0);
MatrixPtr inV1 = getInputValue(1);
MatrixPtr outG = getOutputGrad();
MatrixPtr inG0 = getInputGrad(0);
MatrixPtr inG1 = getInputGrad(1);
{
REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
if (inG0) {
inG0->addRowScale(0, *inV1, *outG);
}
if (inG1) {
inG1->addRowScale(0, *inV0, *outG);
}
}
}
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNConcatLayer.h"
using namespace mkldnn; // NOLINT
typedef memory::format format;
namespace paddle {
REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
return false;
}
CHECK_GT(inputLayers_.size(), 1UL);
CHECK(!biasParameter_);
return true;
}
void MKLDNNConcatLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
ic = inputLayers_[0]->getSize() / ih / iw;
CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
CHECK_GT(inputLayers_.size(), 1UL);
channels_.resize(inputLayers_.size());
channels_[0] = ic;
  // need to change the output channel count, so use oc_ instead
// TODO(TJ): change API, use &oc
oc_ = ic;
for (size_t i = 1; i < inputLayers_.size(); i++) {
    int batchsize, height, width;
    reshapeInput(batchsize, height, width, i);
    CHECK_EQ(bs, batchsize);
    CHECK_EQ(ih, height);
    CHECK_EQ(iw, width);
    channels_[i] = inputLayers_[i]->getSize() / height / width;
    CHECK_EQ((size_t)channels_[i] * height * width, inputLayers_[i]->getSize());
oc_ += channels_[i];
}
oh = ih;
ow = iw;
reshapeOutput(oh, ow);
resizeOutput(bs, oc_ * oh * ow);
}
void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetFwdBuffers(inVals_, out);
in = inVals_[0];
std::shared_ptr<concat::primitive_desc> fwdPD;
resetFwdPD(fwdPD, inVals_, out);
resetFwdPipeline(pipeline, fwdPD, inVals_, out);
}
void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetBwdBuffers(inGrads_, out);
in = inGrads_[0];
resetBwdPipeline(pipeline, bwds_, inGrads_, out);
}
void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
bool has8c = false, has16c = false, hasnc = false;
for (size_t i = 0; i < inputs.size(); i++) {
    // resetInValue will use ic_, so temporarily set it to this input's channel count
// TODO(TJ): change ic_ as vector then can remove channels_
ic_ = channels_[i];
resetInValue(inputs[i], nullptr, i);
CHECK(inputs[i]);
auto dm = inputs[i]->getDims();
    // input formats can differ, but the number of dims must be equal
CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
CHECK_EQ(bs_, dm[0]);
CHECK_EQ(channels_[i], dm[1]);
if (dm.size() > 2) {
CHECK_EQ(ih_, dm[2]);
CHECK_EQ(iw_, dm[3]);
}
if (inputs[i]->getFormat() == format::nc) {
hasnc = true;
}
if (inputs[i]->getFormat() == format::nChw8c) {
has8c = true;
}
if (inputs[i]->getFormat() == format::nChw16c) {
has16c = true;
}
}
  // change back; ic_ always stores the channel count of input 0
ic_ = channels_[0];
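  // pick the output layout: prefer a blocked format (nChw16c / nChw8c) when
  // the concatenated channel count allows it, otherwise fall back to nc or nchw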
format outFmt;
if (has16c && oc_ % 16 == 0) {
outFmt = format::nChw16c;
} else if (has8c && oc_ % 8 == 0) {
outFmt = format::nChw8c;
} else if (hasnc) {
CHECK(oh_ == 1 && ow_ == 1);
outFmt = format::nc;
} else {
outFmt = format::nchw;
}
memory::dims outDims =
hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_};
auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
resetOutValue(out, outPD);
}
void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr<concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out) {
std::vector<memory::primitive_desc> srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
}
CHECK(out);
pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
}
void MKLDNNConcatLayer::resetFwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::vector<primitive::at> srcs;
for (size_t i = 0; i < inputs.size(); i++) {
srcs.push_back(*(inputs[i]));
}
fwd_.reset(new concat(*pd, srcs, *out));
pipeline.push_back(*fwd_);
}
void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
CHECK(out);
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
CHECK(inVals_[i]);
    // resetInGrad will use inVal_
    // TODO(TJ): move inVals_ to MKLDNNLayer and remove inVal_
inVal_ = inVals_[i];
resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
}
  // change back; inVal_ always stores input 0
inVal_ = inVals_[0];
}
void MKLDNNConcatLayer::resetBwdPipeline(
std::vector<mkldnn::primitive>& pipeline,
std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
// reset the backward primitives
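  // each input gradient is produced by a reorder from the matching channel
  // slice (view) of the output gradient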
memory::dims offsets = {0, 0, 0, 0};
prims.resize(inputs.size());
CHECK_EQ(inputs.size(), channels_.size());
for (size_t i = 0; i < inputs.size(); i++) {
auto viewPD = view::primitive_desc(
out->getPrimitiveDesc(), inputs[i]->getDims(), offsets);
auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(),
inputs[i]->getPrimitiveDesc());
prims[i].reset(new reorder(bwdPD, *out, *(inputs[i])));
offsets[axis_] += channels_[i];
// push to pipeline
pipeline.push_back(*prims[i]);
}
}
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLDNNLayer.h"
#include "mkldnn.hpp"
namespace paddle {
/**
 * @brief A concatenation layer implemented as a subclass of MKLDNNLayer.
*
* The config file api is mkldnn_concat
*/
class MKLDNNConcatLayer : public MKLDNNLayer {
protected:
std::vector<MKLDNNMatrixPtr> inVals_;
std::vector<MKLDNNMatrixPtr> inGrads_;
std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
// input channel numbers
std::vector<int> channels_;
// concat_dimension in MKLDNN
  // if axis_ == 0, concatenate along the batch dimension
  // if axis_ == 1, concatenate along the channel dimension (default)
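  // e.g. with axis_ == 1, inputs of shape {bs, 32, h, w} and {bs, 64, h, w}
  // are concatenated into an output of shape {bs, 96, h, w}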
int axis_;
public:
explicit MKLDNNConcatLayer(const LayerConfig& config)
: MKLDNNLayer(config), axis_(1) {}
~MKLDNNConcatLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void printSizeInfo() override {
CHECK_EQ(channels_.size(), inputLayers_.size());
for (size_t i = 0; i < channels_.size(); ++i) {
VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << bs_ << ", " << channels_[i] << ", " << ih_
<< ", " << iw_;
}
VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
<< ", " << ow_;
}
void printValueFormat() override {
for (size_t i = 0; i < inVals_.size(); ++i) {
VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << inVals_[i]->getFormat() << " >>>";
}
if (outVal_) {
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
}
if (extOutVal_) {
VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
}
}
void printGradFormat() override {
if (extOutGrad_) {
VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
}
if (outGrad_) {
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
}
for (size_t i = 0; i < inGrads_.size(); ++i) {
VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << inGrads_[i]->getFormat() << "<<<";
}
}
protected:
/**
* Forward functions: reset buffers(inputs, output, bias),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(inputs, output, bias)
* reset primitives and pipeline
*/
void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
};
} // namespace paddle
...@@ -21,8 +21,8 @@ namespace paddle { ...@@ -21,8 +21,8 @@ namespace paddle {
bool MKLDNNLayer::init(const LayerMap& layerMap, bool MKLDNNLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) { const ParameterMap& parameterMap) {
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON " << "Please set WITH_MKL=ON "
<< "and set use_mkldnn=True"; << "and set use_mkldnn=True";
CHECK(!useGpu_) << "Do not support GPU yet"; CHECK(!useGpu_) << "Do not support GPU yet";
...@@ -138,8 +138,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) { ...@@ -138,8 +138,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
} }
} }
void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) { void MKLDNNLayer::reshapeInput(int& batchsize,
const Argument& input = inputLayers_[0]->getOutput(); int& height,
int& width,
size_t inputIdx) {
const Argument& input = inputLayers_[inputIdx]->getOutput();
batchsize = input.getBatchSize(); batchsize = input.getBatchSize();
int h = input.getFrameHeight(); int h = input.getFrameHeight();
int w = input.getFrameWidth(); int w = input.getFrameWidth();
......
...@@ -178,7 +178,10 @@ protected: ...@@ -178,7 +178,10 @@ protected:
/** /**
* reshape the input image sizes and input batchsize * reshape the input image sizes and input batchsize
*/ */
void reshapeInput(int& batchsize, int& height, int& width); void reshapeInput(int& batchsize,
int& height,
int& width,
size_t inputIdx = 0);
/** /**
* reshape output image sizes * reshape output image sizes
......
...@@ -29,7 +29,7 @@ gserver_test(test_KmaxSeqScore) ...@@ -29,7 +29,7 @@ gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand) gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput) gserver_test(test_MaxPoolingWithMaskOutput)
########## test_Mkldnn layers and activations ########## ########## test_MKLDNN layers and activations ##########
if(WITH_MKLDNN) if(WITH_MKLDNN)
add_unittest_without_exec(test_MKLDNN add_unittest_without_exec(test_MKLDNN
test_MKLDNN.cpp test_MKLDNN.cpp
...@@ -62,17 +62,6 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) ...@@ -62,17 +62,6 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
endif() endif()
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE)
################### test_ProtoDataProvider ############
add_unittest_without_exec(test_ProtoDataProvider
test_ProtoDataProvider.cpp)
# test_ProtoDataProvider will mkdir as same name,
# so if WORKING_DIRECTORY is default directory, then
# mkdir will get error.
add_test(NAME test_ProtoDataProvider
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
################## test_Evaluator ####################### ################## test_Evaluator #######################
add_unittest(test_Evaluator add_unittest(test_Evaluator
test_Evaluator.cpp) test_Evaluator.cpp)
...@@ -110,3 +99,24 @@ add_test(NAME test_PyDataProvider2 ...@@ -110,3 +99,24 @@ add_test(NAME test_PyDataProvider2
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2 COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
) )
################# test_CompareSparse ##################
add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp)
if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
./.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
################ test_CompareTwoNets ######################
add_unittest_without_exec(test_CompareTwoNets
test_CompareTwoNets.cpp)
add_test(NAME test_CompareTwoNets
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
/** /**
* @brief test the functionality of Mkldnnlayers * @brief test the functionality of MKLDNNlayers and MKLDNNActivations
* refer to paddle original function * refer to paddle original function
*/ */
class MKLDNNTester { class MKLDNNTester {
......
./test_ProtoDataProvider/data1.bin
./test_ProtoDataProvider/data2.bin
./test_ProtoDataProvider/data1.bin.gz
./test_ProtoDataProvider/data2.bin.gz
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
...@@ -14,27 +15,50 @@ ...@@ -14,27 +15,50 @@
from paddle.trainer_config_helpers import * from paddle.trainer_config_helpers import *
################################### Data Configuration ################################### ######################## data source ################################
TrainData(ProtoData(files = "trainer/tests/mnist.list")) dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
################################### Algorithm Configuration ################################### dict_file = dict()
settings(batch_size = 1000, for line_count, line in enumerate(open(dict_path, "r")):
learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) dict_file[line.strip()] = line_count
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
fc1 = fc_layer(input=data, size=800, define_py_data_sources2(
bias_attr=True, train_list='gserver/tests/Sequence/train.list',
act=SigmoidActivation()) test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
fc2 = fc_layer(input=fc1, size=800, settings(batch_size=5)
bias_attr=True, ######################## network configure ################################
act=SigmoidActivation()) dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 256
label_dim = 3
sparse_update = get_config_arg("sparse_update", bool, False)
output = fc_layer(input=[fc1, fc2], size=10, data = data_layer(name="word", size=dict_dim)
bias_attr=True,
act=SoftmaxActivation())
lbl = data_layer(name ="label", size=1) emb = embedding_layer(
input=data,
size=word_dim,
param_attr=ParamAttr(sparse_update=sparse_update))
cost = classification_cost(input=output, label=lbl) with mixed_layer(size=hidden_dim * 4) as lstm_input:
outputs(cost) lstm_input += full_matrix_projection(input=emb)
lstm = lstmemory(
input=lstm_input,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation())
lstm_last = last_seq(input=lstm)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=lstm_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
...@@ -14,27 +15,42 @@ ...@@ -14,27 +15,42 @@
from paddle.trainer_config_helpers import * from paddle.trainer_config_helpers import *
################################### Data Configuration ################################### ######################## data source ################################
TrainData(ProtoData(files = "trainer/tests/mnist.list")) dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
################################### Algorithm Configuration ################################### dict_file = dict()
settings(batch_size = 1000, for line_count, line in enumerate(open(dict_path, "r")):
learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) dict_file[line.strip()] = line_count
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
fc1 = fc_layer(input=data, size=800, define_py_data_sources2(
bias_attr=True, train_list='gserver/tests/Sequence/train.list',
act=SigmoidActivation()) test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
fc2 = fc_layer(input=fc1, size=800, settings(batch_size=5)
bias_attr=True, ######################## network configure ################################
act=SigmoidActivation()) dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 128
label_dim = 3
output = fc_layer(input=[fc1, fc2], size=10, # This config is designed to be equivalent with sequence_recurrent_group.py
bias_attr=True,
act=SoftmaxActivation())
lbl = data_layer(name ="label", size=1) data = data_layer(name="word", size=dict_dim)
cost = classification_cost(input=output, label=lbl) emb = embedding_layer(
outputs(cost) input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
recurrent_last = last_seq(input=recurrent)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=recurrent_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
######################## data source ################################
dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count
define_py_data_sources2(
train_list='gserver/tests/Sequence/train.list',
test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
settings(batch_size=5)
######################## network configure ################################
dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 128
label_dim = 3
# This config is designed to be equivalent with sequence_recurrent.py
data = data_layer(name="word", size=dict_dim)
emb = embedding_layer(
input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
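# step() is a hand-written recurrent step meant to mirror recurrent_layer:
# the previous state is read from a memory and mixed with the current input,
# and the projection reuses the parameter name "___recurrent_layer_0__", which
# presumably maps onto the same recurrent weight as in sequence_recurrent.py.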
def step(y):
mem = memory(name="rnn_state", size=hidden_dim)
with mixed_layer(
name="rnn_state",
size=hidden_dim,
bias_attr=False,
act=SoftmaxActivation()) as out:
out += identity_projection(input=y)
out += full_matrix_projection(
input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
return out
recurrent = recurrent_group(name="rnn", step=step, input=emb)
recurrent_last = last_seq(input=recurrent)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=recurrent_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
...@@ -22,8 +22,7 @@ limitations under the License. */ ...@@ -22,8 +22,7 @@ limitations under the License. */
using namespace paddle; // NOLINT using namespace paddle; // NOLINT
using namespace std; // NOLINT using namespace std; // NOLINT
static const string& configFile1 = static const string& configFile1 = "gserver/tests/sequence_lstm.conf";
"trainer/tests/sample_trainer_config_compare_sparse.conf";
DECLARE_bool(use_gpu); DECLARE_bool(use_gpu);
DECLARE_string(config); DECLARE_string(config);
......
...@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu); ...@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu);
DECLARE_string(config); DECLARE_string(config);
DECLARE_string(nics); DECLARE_string(nics);
DEFINE_string(config_file_a, "", "config of one network to compare");
DEFINE_string(config_file_b, "", "config of another network to compare");
DEFINE_bool(need_high_accuracy, DEFINE_bool(need_high_accuracy,
false, false,
"whether need to run in double accuracy"); "whether need to run in double accuracy");
...@@ -42,6 +40,10 @@ DEFINE_double( ...@@ -42,6 +40,10 @@ DEFINE_double(
DECLARE_bool(thread_local_rand_use_global_seed); DECLARE_bool(thread_local_rand_use_global_seed);
DECLARE_int32(seed); DECLARE_int32(seed);
static const string& config_file_a = "gserver/tests/sequence_recurrent.py";
static const string& config_file_b =
"gserver/tests/sequence_recurrent_group.py";
struct ComData { struct ComData {
vector<Argument> outArgs; vector<Argument> outArgs;
vector<ParameterPtr> parameters; vector<ParameterPtr> parameters;
...@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) { ...@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) {
DataBatch dataBatch; DataBatch dataBatch;
int32_t batchSize = trainer.getConfig().opt_config().batch_size(); int32_t batchSize = trainer.getConfig().opt_config().batch_size();
trainer.getDataProvider()->reset();
trainer.getDataProvider()->setSkipShuffle(); trainer.getDataProvider()->setSkipShuffle();
trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch); trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
...@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { ...@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
TEST(Trainer, create) { TEST(Trainer, create) {
ComData dataA; ComData dataA;
calcGradient(dataA, FLAGS_config_file_a); calcGradient(dataA, config_file_a);
LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n"; LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
ComData dataB; ComData dataB;
calcGradient(dataB, FLAGS_config_file_b); calcGradient(dataB, config_file_b);
LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n"; LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
compareGradient(dataA, dataB); compareGradient(dataA, dataB);
......
...@@ -1081,6 +1081,21 @@ TEST(Layer, InterpolationLayer) { ...@@ -1081,6 +1081,21 @@ TEST(Layer, InterpolationLayer) {
} }
} }
TEST(Layer, DotProdLayer) {
TestConfig config;
config.layerConfig.set_type("dot_prod");
config.layerConfig.set_size(1);
config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
config.layerConfig.add_inputs();
config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config, "dot_prod", 10, false, useGpu);
}
}
TEST(Layer, OuterProdLayer) { TEST(Layer, OuterProdLayer) {
TestConfig config; TestConfig config;
config.layerConfig.set_type("out_prod"); config.layerConfig.set_type("out_prod");
......
...@@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) { ...@@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) {
testAddtoLayer({4, 12, 1, 1}, 3); testAddtoLayer({4, 12, 1, 1}, 3);
} }
static void getMKLDNNConcatConfig(TestConfig& cfg,
const std::vector<testImageDesc>& inputs) {
CHECK_GE(inputs.size(), 2) << "at least two inputs";
int oc = inputs[0].ic;
for (size_t i = 1; i < inputs.size(); ++i) {
CHECK_EQ(inputs[i].bs, inputs[0].bs);
CHECK_EQ(inputs[i].ih, inputs[0].ih);
CHECK_EQ(inputs[i].iw, inputs[0].iw);
oc += inputs[i].ic;
}
cfg.biasSize = 0;
cfg.layerConfig.set_type("mkldnn_concat");
cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
cfg.layerConfig.set_active_type("relu");
for (size_t i = 0; i < inputs.size(); ++i) {
std::stringstream ss;
ss << "layer_" << i;
cfg.inputDefs.push_back(
{INPUT_DATA,
ss.str(),
(size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
0});
LayerInputConfig* input = cfg.layerConfig.add_inputs();
ImageConfig* img_conf = input->mutable_image_conf();
img_conf->set_channels(inputs[i].ic);
img_conf->set_img_size_y(inputs[i].ih);
img_conf->set_img_size(inputs[i].iw);
}
}
void testConcatLayer(const std::vector<testImageDesc>& inputs) {
TestConfig dnnConfig;
getMKLDNNConcatConfig(dnnConfig, inputs);
RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
}
TEST(MKLDNNLayer, ConcatLayer) {
testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
}
void testActivation(std::string actType, const testImageDesc& pm) { void testActivation(std::string actType, const testImageDesc& pm) {
// TODO(TJ): remove me when paddle support elu activation // TODO(TJ): remove me when paddle support elu activation
if (actType == "mkldnn_elu") { if (actType == "mkldnn_elu") {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include <gtest/gtest.h>
#include "paddle/gserver/dataproviders/ProtoDataProvider.h"
#include "paddle/utils/Util.h"
#include "paddle/testing/TestUtil.h"
using namespace std; // NOLINT
std::vector<string> protoFiles{
"./test_ProtoDataProvider/data1.bin", "./test_ProtoDataProvider/data2.bin",
};
std::vector<string> protoFilesCompressed{
"./test_ProtoDataProvider/data1.bin.gz",
"./test_ProtoDataProvider/data2.bin.gz",
};
const char* kTestDir = "./test_ProtoDataProvider";
const char kProtoFileList[] = "gserver/tests/proto_files.txt";
const char kProtoFileListCompressed[] =
"gserver/tests/proto_files_compressed.txt";
const int kSpraseMatrixDim = 1024;
using namespace paddle; // NOLINT
void prepareData(DataBatch* batch,
const int* numPerSlotType,
bool iid,
bool useGpu) {
batch->clear();
int64_t size = uniformRandom(100) + 10;
batch->setSize(size);
ICpuGpuVectorPtr sequenceStartPositions;
ICpuGpuVectorPtr subSequenceStartPositions;
if (!iid) {
int numSeqs = uniformRandom(10) + 1;
sequenceStartPositions =
ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
int* buf = sequenceStartPositions->getMutableData(false);
subSequenceStartPositions =
ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
int* subBuf = subSequenceStartPositions->getMutableData(false);
int64_t pos = 0;
int maxLen = 2 * size / numSeqs;
for (int i = 0; i < numSeqs; ++i) {
int len =
uniformRandom(min<int64_t>(maxLen, size - pos - numSeqs + i)) + 1;
buf[i] = pos;
subBuf[i] = pos;
pos += len;
VLOG(1) << " len=" << len;
}
buf[numSeqs] = size;
subBuf[numSeqs] = size;
}
vector<Argument>& arguments = batch->getStreams();
for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_DENSE]; ++i) {
int64_t dim = rand() % 10 + 4; // NOLINT rand_r
MatrixPtr mat = Matrix::create(size, dim, /* trans= */ false, false);
mat->randomizeUniform();
Argument arg;
arg.value = mat;
arg.sequenceStartPositions = sequenceStartPositions;
arguments.push_back(arg);
}
for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE]; ++i) {
MatrixPtr mat =
makeRandomSparseMatrix(size, kSpraseMatrixDim, false, useGpu);
Argument arg;
arg.value = mat;
arg.sequenceStartPositions = sequenceStartPositions;
arg.subSequenceStartPositions = subSequenceStartPositions;
arguments.push_back(arg);
}
for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE]; ++i) {
MatrixPtr mat =
makeRandomSparseMatrix(size, kSpraseMatrixDim, true, useGpu);
Argument arg;
arg.value = mat;
arg.sequenceStartPositions = sequenceStartPositions;
arguments.push_back(arg);
}
for (int i = 0; i < numPerSlotType[SlotDef::STRING]; ++i) {
int64_t dim = rand() % 10 + 4; // NOLINT rand_r
SVectorPtr vec = std::make_shared<std::vector<std::string>>();
for (int j = 0; j < size; ++j) {
vec->push_back(randStr(dim));
}
Argument arg;
arg.strs = vec;
arg.sequenceStartPositions = sequenceStartPositions;
arguments.push_back(arg);
}
for (int i = 0; i < numPerSlotType[SlotDef::INDEX]; ++i) {
int64_t dim = rand() % 10 + 4; // NOLINT rand_r
IVectorPtr vec = IVector::create(size, /* useGpu= */ false);
int* buf = vec->getData();
for (int j = 0; j < size; ++j) {
buf[j] = uniformRandom(dim);
}
Argument arg;
arg.ids = vec;
arg.sequenceStartPositions = sequenceStartPositions;
arguments.push_back(arg);
}
}
inline int getSlotDim(const Argument& arg) {
if (arg.value) {
return arg.value->getWidth();
} else if (arg.ids) {
return arg.ids->getMax() + 1;
} else if (arg.strs) {
return 1;
}
LOG(FATAL) << "Invalid argument";
return 0;
}
inline SlotDef::SlotType getSlotType(const Argument& arg) {
if (arg.value) {
auto& m = *arg.value;
auto& type = typeid(m);
if (type == typeid(CpuMatrix) || type == typeid(GpuMatrix)) {
return SlotDef::VECTOR_DENSE;
}
if (type == typeid(CpuSparseMatrix)) {
auto valueType =
std::dynamic_pointer_cast<CpuSparseMatrix>(arg.value)->getValueType();
if (NO_VALUE == valueType) {
return SlotDef::VECTOR_SPARSE_NON_VALUE;
} else {
return SlotDef::VECTOR_SPARSE_VALUE;
}
}
if (type == typeid(GpuSparseMatrix)) {
auto valueType =
std::dynamic_pointer_cast<GpuSparseMatrix>(arg.value)->getValueType();
if (NO_VALUE == valueType) {
return SlotDef::VECTOR_SPARSE_NON_VALUE;
} else {
return SlotDef::VECTOR_SPARSE_VALUE;
}
}
LOG(FATAL) << "Unknown matrix type";
}
if (arg.ids) return SlotDef::INDEX;
if (arg.strs) return SlotDef::STRING;
LOG(FATAL) << "Invalid argument";
return SlotDef::VECTOR_DENSE;
}
void getColRow(const Argument& arg,
int64_t pos,
bool useGpu,
int* colNum,
const int** rowCols,
const real** rowValues) {
SlotDef::SlotType type = getSlotType(arg);
GpuSparseMatrixPtr matGpu;
CpuSparseMatrixPtr matCpu;
if (useGpu) {
matGpu = dynamic_pointer_cast<GpuSparseMatrix>(arg.value);
ASSERT_TRUE(matGpu != NULL);
} else {
matCpu = dynamic_pointer_cast<CpuSparseMatrix>(arg.value);
ASSERT_TRUE(matCpu != NULL);
}
*colNum = useGpu ? matGpu->getColNum(pos) : matCpu->getColNum(pos);
*rowCols = useGpu ? matGpu->getRowCols(pos) : matCpu->getRowCols(pos);
if (type == SlotDef::VECTOR_SPARSE_VALUE) {
*rowValues = useGpu ? matGpu->getRowValues(pos) : matCpu->getRowValues(pos);
} else {
*rowValues = NULL;
}
}
void makeSample(const vector<Argument>& arguments,
int64_t pos,
bool isBeginning,
DataSample* sample,
bool useGpu) {
sample->set_is_beginning(isBeginning);
int slotid = 0;
for (auto& arg : arguments) {
SlotDef::SlotType type = getSlotType(arg);
int64_t dim = getSlotDim(arg);
switch (type) {
case SlotDef::VECTOR_DENSE: {
VectorSlot* vecSlot = sample->add_vector_slots();
auto values = vecSlot->mutable_values();
values->Reserve(dim);
for (int i = 0; i < dim; ++i) {
values->AddAlreadyReserved(
static_cast<float>(arg.value->getElement(pos, i)));
}
break;
}
case SlotDef::INDEX: {
sample->add_id_slots(arg.ids->get(pos));
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
VectorSlot* vecSlot = sample->add_vector_slots();
auto ids = vecSlot->mutable_ids();
int colNum;
const int* rowCols;
const real* rowValues; // nullptr
getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
ids->Reserve(colNum);
for (int i = 0; i < colNum; ++i) {
ids->AddAlreadyReserved(rowCols[i]);
}
SubseqSlot* subseqSlot = sample->add_subseq_slots(); // subseq
subseqSlot->set_slot_id(slotid);
auto lens = subseqSlot->mutable_lens();
lens->Add(colNum);
break;
}
case SlotDef::VECTOR_SPARSE_VALUE: {
VectorSlot* vecSlot = sample->add_vector_slots();
auto values = vecSlot->mutable_values();
auto ids = vecSlot->mutable_ids();
int colNum;
const int* rowCols;
const real* rowValues;
getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
ids->Reserve(colNum);
values->Reserve(colNum);
for (int i = 0; i < colNum; ++i) {
ids->AddAlreadyReserved(rowCols[i]);
values->AddAlreadyReserved(rowValues[i]);
}
break;
}
case SlotDef::VAR_MDIM_DENSE:
case SlotDef::VAR_MDIM_INDEX: {
LOG(FATAL) << "Not implemented";
break;
}
case SlotDef::STRING: {
VectorSlot* vecSlot = sample->add_vector_slots();
vecSlot->add_strs((*arg.strs)[pos]);
break;
}
}
slotid++;
}
}
void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) {
DataHeader header;
const vector<Argument>& arguments = batch.getStreams();
for (auto& argument : arguments) {
SlotDef* slotDef = header.add_slot_defs();
slotDef->set_type(getSlotType(argument));
slotDef->set_dim(getSlotDim(argument));
}
VLOG(1) << "header=" << header.DebugString();
int64_t totalSeqs = batch.getNumSequences();
int64_t seq = 0;
ICpuGpuVectorPtr sequenceStartPositions = arguments[0].sequenceStartPositions;
int64_t numWritten = 0;
vector<string> curProtoFiles =
dataCompression ? protoFilesCompressed : protoFiles;
for (size_t i = 0; i < curProtoFiles.size(); ++i) {
int64_t numSeqs = totalSeqs * (i + 1) / curProtoFiles.size() -
totalSeqs * i / curProtoFiles.size();
ofstream os(curProtoFiles[i]);
CHECK(os) << "Fail to open " << curProtoFiles[i];
unique_ptr<ProtoWriter> writer(new ProtoWriter(&os, dataCompression));
CHECK(writer->write(header));
for (int j = 0; j < numSeqs; ++j, ++seq) {
int64_t begin = seq;
int64_t end = seq + 1;
if (sequenceStartPositions) {
begin = sequenceStartPositions->getElement(seq);
end = sequenceStartPositions->getElement(seq + 1);
}
for (int pos = begin; pos < end; ++pos) {
DataSample sample;
makeSample(arguments, pos, pos == begin, &sample, useGpu);
CHECK(writer->write(sample));
++numWritten;
}
}
writer.reset(nullptr);
os.close();
}
CHECK_EQ(arguments[0].getBatchSize(), numWritten);
}
// check that the sample at pos1 in args1 is the same as the sample at pos2 in args2
void checkSample(const vector<Argument>& args1,
int64_t pos1,
const vector<Argument>& args2,
int64_t pos2,
bool useGpu) {
EXPECT_EQ(args1.size(), args2.size());
VLOG(1) << " pos1=" << pos1 << " pos2=" << pos2;
for (size_t i = 0; i < args1.size(); ++i) {
auto type = getSlotType(args1[i]);
int dim = getSlotDim(args1[i]);
EXPECT_EQ(type, getSlotType(args2[i]));
if (type == SlotDef::INDEX) {
EXPECT_GE(dim, getSlotDim(args2[i]));
} else {
EXPECT_EQ(dim, getSlotDim(args2[i]));
}
switch (type) {
case SlotDef::VECTOR_DENSE: {
for (int j = 0; j < dim; ++j) {
EXPECT_EQ(static_cast<float>(args1[i].value->getElement(pos1, j)),
static_cast<float>(args2[i].value->getElement(pos2, j)));
}
break;
}
case SlotDef::INDEX: {
EXPECT_EQ(args1[i].ids->get(pos1), args2[i].ids->get(pos2));
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE:
case SlotDef::VECTOR_SPARSE_VALUE: {
int colNum1, colNum2;
const int *rowCols1, *rowCols2;
const real *rowValues1, *rowValues2;
getColRow(args1[i], pos1, useGpu, &colNum1, &rowCols1, &rowValues1);
getColRow(args2[i], pos2, useGpu, &colNum2, &rowCols2, &rowValues2);
EXPECT_EQ(colNum1, colNum2);
for (int j = 0; j < colNum1; ++j) {
EXPECT_EQ(rowCols1[j], rowCols2[j]);
if (type == SlotDef::VECTOR_SPARSE_VALUE) {
EXPECT_EQ(rowValues1[j], rowValues2[j]);
}
}
break;
}
case SlotDef::VAR_MDIM_DENSE:
case SlotDef::VAR_MDIM_INDEX: {
LOG(FATAL) << "Not implemented";
break;
}
case SlotDef::STRING: {
EXPECT_EQ((*args1[i].strs)[pos1], (*args2[i].strs)[pos2]);
break;
}
}
}
}
void testProtoDataProvider(int* numPerSlotType,
bool iid,
bool async,
bool useGpu,
bool dataCompression,
int numConstantSlots = 0) {
mkDir(kTestDir);
DataBatch data;
prepareData(&data, numPerSlotType, iid, useGpu);
writeData(data, useGpu, dataCompression);
DataConfig config;
config.set_type("proto");
config.set_files(dataCompression ? kProtoFileListCompressed : kProtoFileList);
config.set_async_load_data(async);
for (int i = 0; i < numConstantSlots; ++i) {
config.add_constant_slots(i + 11);
MatrixPtr w = Matrix::create(data.getSize(),
1,
/* trans= */ false,
/* useGpu= */ false);
w->assign(config.constant_slots(i));
data.appendData(w);
}
unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
dataProvider->setSkipShuffle();
EXPECT_EQ(data.getSize(), dataProvider->getSize());
int64_t batchSize = 10;
DataBatch batch;
size_t seq1 = 0;
vector<Argument>& args1 = data.getStreams();
ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
dataProvider->reset();
while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
vector<Argument>& args2 = batch.getStreams();
ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
for (auto& arg : args2) {
EXPECT_EQ(iid, !arg.sequenceStartPositions);
}
size_t numSeqs = batch.getNumSequences();
VLOG(1) << "numSeqs=" << numSeqs;
for (size_t seq2 = 0; seq2 < numSeqs; ++seq1, ++seq2) {
int64_t begin1 = seq1;
int64_t end1 = seq1 + 1;
if (sequenceStartPositions1) {
begin1 = sequenceStartPositions1->getElement(seq1);
end1 = sequenceStartPositions1->getElement(seq1 + 1);
EXPECT_LT(seq1, sequenceStartPositions1->getSize() - 1);
}
int64_t begin2 = seq2;
int64_t end2 = seq2 + 1;
if (sequenceStartPositions2) {
begin2 = sequenceStartPositions2->getElement(seq2);
end2 = sequenceStartPositions2->getElement(seq2 + 1);
}
VLOG(1) << " begin1=" << begin1 << " end1=" << end1
<< " begin2=" << begin2 << " end2=" << end2;
EXPECT_EQ(end1 - begin1, end2 - begin2);
for (int i = 0; i < end1 - begin1; ++i) {
checkSample(args1, begin1 + i, args2, begin2 + i, useGpu);
}
}
}
EXPECT_EQ(seq1, (size_t)data.getNumSequences());
rmDir(kTestDir);
}
TEST(ProtoDataProvider, test) {
int numSlotsArray[] = {0, 3};
int numTwoArray[] = {0, 1};
int numSlotsArraySize = sizeof(numSlotsArray) / sizeof(numSlotsArray[0]);
const int numSlot = 5;
int combination[numSlot] = {0};
int k = numSlot - 1;
while (k >= 0) {
int numDenseVecSlots = numSlotsArray[combination[0]];
int numSparseNonValueVecSlots = numSlotsArray[combination[1]];
int numSparseValueVectorSlots = numSlotsArray[combination[2]];
int numStrSlots = numSlotsArray[combination[3]];
int numIdSlots = numSlotsArray[combination[4]];
// while loop: traverse all cases
k = numSlot - 1;
while (k >= 0) {
if (combination[k] < (numSlotsArraySize - 1)) {
++combination[k];
break;
} else {
combination[k] = 0;
--k;
}
}
if (numDenseVecSlots + numSparseNonValueVecSlots +
numSparseValueVectorSlots + numStrSlots + numIdSlots <
1)
continue;
for (int iid : numTwoArray) {
for (int async : numTwoArray) {
for (int useGpu : numTwoArray) {
for (int dataCompression : numTwoArray) {
if (async && useGpu) {
// Currently in async mode, useGpu is not supported
continue;
}
#ifndef PADDLE_WITH_CUDA
if (useGpu) {
continue;
}
#endif
LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
<< " numSparseNonValueVecSlots="
<< numSparseNonValueVecSlots
<< " numSparseValueVectorSlots="
<< numSparseValueVectorSlots
<< " numStrSlots=" << numStrSlots
<< " numIdSlots=" << numIdSlots << " iid=" << iid
<< " async=" << async << " useGpu=" << useGpu
<< " dataCompression=" << dataCompression;
int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
numSparseNonValueVecSlots;
numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] =
numSparseValueVectorSlots;
numPerSlotType[SlotDef::INDEX] = numIdSlots;
numPerSlotType[SlotDef::STRING] = numStrSlots;
testProtoDataProvider(
numPerSlotType, iid, async, useGpu, dataCompression);
} // end for (int dataCompression : numTwoArray)
} // end for (int useGpu : numTwoArray)
} // end for (int async : numTwoArray)
} // end for (int iid : numTwoArray)
} // end for (while, traverse all slots)
}
TEST(ProtoDataProvider, constant_slots) {
int numSlotsArray[] = {0, 3};
int numTwoArray[] = {0, 1};
for (int numDenseVecSlots : numSlotsArray) {
for (int numSparseNonValueVecSlots : numSlotsArray) {
if (numDenseVecSlots + numSparseNonValueVecSlots < 1) continue;
for (int numConstantSlots : {1, 2}) {
for (int useGpu : numTwoArray) {
for (int dataCompression : numTwoArray) {
#ifndef PADDLE_WITH_CUDA
if (useGpu) {
continue;
}
#endif
LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
<< " numSparseNonValueVecSlots="
<< numSparseNonValueVecSlots
<< " numConstantSlogs=" << numConstantSlots
<< " useGpu=" << useGpu
<< " dataCompression=" << dataCompression;
int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
numSparseNonValueVecSlots;
numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] = 1;
numPerSlotType[SlotDef::INDEX] = 1;
testProtoDataProvider(numPerSlotType,
/* iid= */ true,
/* async= */ false,
useGpu,
dataCompression,
numConstantSlots);
} // end for (int dataCompression : numTwoArray)
} // end for (int useGpu : numTwoArray)
} // end for (int numConstantSlots : {1, 2})
} // end for (int numSparseNonValueVecSlots : numSlotsArray)
} // end for (int numDenseVecSlots : numSlotsArray)
}
void checkSampleSequence(const vector<Argument>& args1,
const vector<Argument>& args2,
int64_t offset,
int64_t numSeqs,
bool useGpu) {
// check that the slot counts are equal
EXPECT_EQ(args1.size(), args2.size());
for (size_t i = 0; i < args1.size(); i++) {
auto type = getSlotType(args1[i]);
// check for args2: sequenceStartPositions vs numSeqs
// (1) size
EXPECT_EQ(args2[i].sequenceStartPositions->getSize(), (size_t)numSeqs + 1);
// (2) content
auto checkArgContent = [&](const Argument& args, int numSeqs) {
for (int j = 0; j <= numSeqs; j++) {
int start_pos = args.sequenceStartPositions->getElement(j);
EXPECT_EQ(start_pos, j);
}
};
switch (type) {
case SlotDef::INDEX: {
// args1: for label
checkArgContent(args2[i], numSeqs);
// check for args2: ids are equal to args1[offset]
// (1) size
EXPECT_EQ(args2[i].ids->getSize(), (size_t)numSeqs);
// (2) content
for (int j = 0; j < numSeqs; j++) {
EXPECT_EQ(args2[i].ids->get(j), args1[i].ids->get(offset + j));
}
break;
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
// args1: for sparse_non_value
// args2 should put sparse indexes in ids
int colNum1;
const int* rowCols1;
const real* rowValues1; // nullptr
int totalLength = 0;
for (int j = 0; j < numSeqs; j++) {
getColRow(
args1[i], offset + j, useGpu, &colNum1, &rowCols1, &rowValues1);
// (1) lengths
EXPECT_EQ(totalLength,
args2[i].sequenceStartPositions->getElement(j));
EXPECT_EQ(totalLength,
args2[i].subSequenceStartPositions->getElement(j));
// (2) content
for (int k = 0; k < colNum1; k++) {
EXPECT_EQ(rowCols1[k], args2[i].ids->get(totalLength + k));
}
totalLength += colNum1;
if (colNum1 == 0) {
// special case here: we will put a "-1" into ids when column num is
// zero. see ProtoSequenceDataProvider::getNextBatchInternal.
EXPECT_EQ(-1, args2[i].ids->get(totalLength));
totalLength++;
}
}
EXPECT_EQ(totalLength,
args2[i].sequenceStartPositions->getElement(numSeqs));
EXPECT_EQ(totalLength,
args2[i].subSequenceStartPositions->getElement(numSeqs));
break;
}
case SlotDef::VECTOR_DENSE: {
// args1: for dense vector
checkArgContent(args2[i], numSeqs);
// check for args2: values are equal to args1[offset]
// (1) size
EXPECT_EQ(args2[i].value->getHeight(), (size_t)numSeqs);
EXPECT_EQ(args2[i].value->getWidth(), (size_t)getSlotDim(args1[i]));
// (2) content
for (int j = 0; j < numSeqs; j++) {
for (size_t k = 0; k < args2[i].value->getWidth(); k++) {
EXPECT_EQ(
static_cast<float>(args1[i].value->getElement(j + offset, k)),
static_cast<float>(args2[i].value->getElement(j, k)));
}
}
break;
}
default: { EXPECT_EQ(true, false) << "should not reach here"; }
}
}
}
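// Illustrative sketch (assumed encoding, matching the comment above that cites
// ProtoSequenceDataProvider::getNextBatchInternal): a sparse_non_value row with
// columns {2, 7} contributes ids {2, 7}, while a row with no columns
// contributes the single sentinel id -1, so every (sub)sequence in args2 has a
// length of at least one. A hypothetical helper expressing this rule:
//
//   std::vector<int> encodeSparseRow(const std::vector<int>& cols) {
//     return cols.empty() ? std::vector<int>{-1} : cols;
//   }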
void testProtoSequenceDataProvider(int* numPerSlotType,
bool async,
bool useGpu) {
mkDir(kTestDir);
DataBatch data;
prepareData(&data,
numPerSlotType,
/* iid */ true,
useGpu);
writeData(data, useGpu, /* dataCompression */ false);
DataConfig config;
config.set_type("proto_sequence");
config.set_files(kProtoFileList);
config.set_async_load_data(async);
unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
dataProvider->setSkipShuffle();
EXPECT_EQ(data.getSize(), dataProvider->getSize());
int64_t batchSize = 10;
DataBatch batch;
vector<Argument>& args1 = data.getStreams();
ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
dataProvider->reset();
size_t args1Offset = 0;
while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
vector<Argument>& args2 = batch.getStreams();
ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
for (auto& arg : args1) {
// args1 should not have sequence
EXPECT_EQ(true, !arg.sequenceStartPositions);
}
for (auto& arg : args2) {
// args2 should have sequence
EXPECT_NE(true, !arg.sequenceStartPositions);
}
size_t numSeqs = batch.getNumSequences();
checkSampleSequence(args1, args2, args1Offset, numSeqs, useGpu);
args1Offset += numSeqs;
}
EXPECT_EQ(args1Offset, (size_t)data.getNumSequences());
rmDir(kTestDir);
}
TEST(ProtoSequenceDataProvider, test) {
int numSlotsArray[] = {0, 3};
int numTwoArray[] = {0, 1};
for (int numSparseNonValueVecSlots : numSlotsArray) {
for (int numIdSlots : numSlotsArray) {
for (int numDenseVecSlots : numSlotsArray) {
if (numDenseVecSlots + numSparseNonValueVecSlots + numIdSlots < 1)
continue;
for (int async : numTwoArray) {
for (int useGpu : numTwoArray) {
if (async && useGpu) {
// Currently in async mode, useGpu is not supported
continue;
}
#ifndef PADDLE_WITH_CUDA
if (useGpu) {
continue;
}
#endif
LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
<< " numSparseNonValueVecSlots="
<< numSparseNonValueVecSlots
<< " numIdSlots=" << numIdSlots << " async=" << async
<< " useGpu=" << useGpu;
int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
numSparseNonValueVecSlots;
numPerSlotType[SlotDef::INDEX] = numIdSlots;
testProtoSequenceDataProvider(numPerSlotType, async, useGpu);
} // end for (int useGpu : numTwoArray)
} // end for (int async : numTwoArray)
} // end for (int numDenseVecSlots : numSlotsArray)
} // end for (int numIdSlots : numSlotsArray)
} // end for (int numSparseNonValueVecSlots : numSlotsArray)
}
...@@ -17,9 +17,13 @@ limitations under the License. */ ...@@ -17,9 +17,13 @@ limitations under the License. */
#include "paddle/utils/StringUtil.h" #include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
#ifndef PADDLE_MOBILE_INFERENCE
DEFINE_int32(pool_limit_size, DEFINE_int32(pool_limit_size,
536870912, 536870912,
"maximum memory size managed by a memory pool, default is 512M"); "maximum memory size managed by a memory pool, default is 512M");
#else
DEFINE_int32(pool_limit_size, 0, "default is 0");
#endif
namespace paddle { namespace paddle {
......
...@@ -61,6 +61,18 @@ function(op_library TARGET) ...@@ -61,6 +61,18 @@ function(op_library TARGET)
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
if ("${TARGET}" STREQUAL "compare_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
endif()
# conv_op contains several operators
if ("${TARGET}" STREQUAL "conv_op")
set(pybind_flag 1)
# It's enough to just add one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
endif()
# pool_op contains several operators # pool_op contains several operators
if ("${TARGET}" STREQUAL "pool_op") if ("${TARGET}" STREQUAL "pool_op")
set(pybind_flag 1) set(pybind_flag 1)
...@@ -68,9 +80,11 @@ function(op_library TARGET) ...@@ -68,9 +80,11 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(pool2d);\n") file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
endif() endif()
if ("${TARGET}" STREQUAL "compare_op") # pool_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "pool_cudnn_op")
set(pybind_flag 1) set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n") # It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
endif() endif()
# pool_with_index_op contains several operators # pool_with_index_op contains several operators
...@@ -80,25 +94,18 @@ function(op_library TARGET) ...@@ -80,25 +94,18 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n") file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
endif() endif()
# conv_op contains several operators
if ("${TARGET}" STREQUAL "conv_op")
set(pybind_flag 1)
# It's enough to just add one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
endif()
# conv_transpose_op contains several operators # conv_transpose_op contains several operators
if ("${TARGET}" STREQUAL "conv_transpose_op") if ("${TARGET}" STREQUAL "conv_transpose_op")
set(pybind_flag 1) set(pybind_flag 1)
# It's enough to just add one operator to pybind # It's enough to just add one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n") file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
endif() endif()
# pool_cudnn_op contains several operators # conv_transpose_cudnn_op contains two operators
if ("${TARGET}" STREQUAL "pool_cudnn_op") if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
set(pybind_flag 1) set(pybind_flag 1)
# It's enough to just add one operator to pybind # It's enough to just add one operator to pybind
file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n") file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
endif() endif()
# save_restore_op contains several operators # save_restore_op contains several operators
......
...@@ -42,6 +42,7 @@ class ArrayOp : public framework::OperatorBase { ...@@ -42,6 +42,7 @@ class ArrayOp : public framework::OperatorBase {
} else { } else {
offset = static_cast<size_t>(*i_tensor.data<int64_t>()); offset = static_cast<size_t>(*i_tensor.data<int64_t>());
} }
VLOG(10) << " Offset = " << offset;
return offset; return offset;
} }
}; };
......
...@@ -174,7 +174,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> { ...@@ -174,7 +174,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
// Calculate the gradient of Input(Bias). // Calculate the gradient of Input(Bias).
if (d_bias) { if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace()); d_bias->mutable_data<T>(ctx.GetPlace());
auto d_bias_mat = EigenMatrix<T>::From(*d_bias); auto d_bias_mat = framework::EigenVector<T>::Flatten(*d_bias);
d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes<int, 1>(0)); d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes<int, 1>(0));
} }
} }
......
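// Note restating the math behind the change above (not new behavior): for the
// bilinear form out[b][j] = x[b]^T * W_j * y[b] + bias[j], the bias gradient is
// d_bias[j] = sum over b of d_out[b][j], a column-wise sum over the batch
// dimension. The result is rank-1, which is why a flattened EigenVector view
// of d_bias suffices here.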
...@@ -226,9 +226,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> { ...@@ -226,9 +226,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
T alpha = 1.0f, beta = 0.0f; T alpha = 1.0f, beta = 0.0f;
if (input_grad) { if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*input_grad); // Because beta is zero, it is unnecessary to reset input_grad.
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc, handle, &alpha, cudnn_filter_desc,
...@@ -241,9 +240,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> { ...@@ -241,9 +240,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv backward filter --------------------- // ------------------- cudnn conv backward filter ---------------------
if (filter_grad) { if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace()); T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*filter_grad); // Because beta is zero, it is unnecessary to reset filter_grad.
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
......
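// Why no reset is needed (cuDNN's blending convention, stated informally):
// the backward routines compute  dst = alpha * computed_gradient + beta * dst,
// so with beta = 0.0f the previous contents of input_grad / filter_grad are
// ignored and fully overwritten, making the explicit zero-fill redundant.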
...@@ -225,11 +225,15 @@ REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, ...@@ -225,11 +225,15 @@ REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad); ops::ConvOpGrad);
REGISTER_OP_CPU_KERNEL(conv2d, REGISTER_OP_CPU_KERNEL(conv2d,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>); ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>); conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(conv3d, REGISTER_OP_CPU_KERNEL(conv3d,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>); ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>); conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
...@@ -17,11 +17,15 @@ ...@@ -17,11 +17,15 @@
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(conv2d, REGISTER_OP_GPU_KERNEL(conv2d,
ops::GemmConvKernel<paddle::platform::GPUPlace, float>); ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>); conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(conv3d, REGISTER_OP_GPU_KERNEL(conv3d,
ops::GemmConvKernel<paddle::platform::GPUPlace, float>); ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>); conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
...@@ -23,7 +23,24 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker { ...@@ -23,7 +23,24 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
framework::OpAttrChecker* op_checker) framework::OpAttrChecker* op_checker)
: Conv2DTransposeOpMaker(proto, op_checker) { : Conv2DTransposeOpMaker(proto, op_checker) {
AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.") AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
.SetDefault(std::vector<int>{1, 1}); .SetDefault({1, 1});
AddAttr<int>("workspace_size_MB",
"workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be "
"allocated/freed each time the operator runs, larger "
"workspace size can increase performance but also requires "
"better hardward. This size should be carefully setted.")
.SetDefault(4096);
}
};
class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker {
public:
CudnnConv3DTransposeOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: Conv3DTransposeOpMaker(proto, op_checker) {
AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
.SetDefault({1, 1, 1});
AddAttr<int>("workspace_size_MB", AddAttr<int>("workspace_size_MB",
"workspace size for cudnn, in MB, " "workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be " "workspace is a section of GPU memory which will be "
...@@ -48,3 +65,14 @@ REGISTER_OP_CPU_KERNEL( ...@@ -48,3 +65,14 @@ REGISTER_OP_CPU_KERNEL(
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
conv2d_transpose_cudnn_grad, conv2d_transpose_cudnn_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>); ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp,
ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad,
ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
conv3d_transpose_cudnn,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv3d_transpose_cudnn_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
...@@ -54,15 +54,21 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -54,15 +54,21 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
ScopedTensorDescriptor output_desc; ScopedTensorDescriptor output_desc;
ScopedFilterDescriptor filter_desc; ScopedFilterDescriptor filter_desc;
ScopedConvolutionDescriptor conv_desc; ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW; DataLayout layout;
if (strides.size() == 2U) {
layout = DataLayout::kNCHW;
} else {
layout = DataLayout::kNCDHW;
}
// N, M, H, W // (N, M, H, W) or (N, M, D, H, W)
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>( cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims())); layout, framework::vectorize2int(input->dims()));
// N, C, O_h, O_w // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>( cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output->dims())); layout, framework::vectorize2int(output->dims()));
// M, C, K_h, K_w // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>( cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims())); layout, framework::vectorize2int(filter->dims()));
cudnnConvolutionDescriptor_t cudnn_conv_desc = cudnnConvolutionDescriptor_t cudnn_conv_desc =
...@@ -136,13 +142,13 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -136,13 +142,13 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
ScopedConvolutionDescriptor conv_desc; ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW; DataLayout layout = DataLayout::kNCHW;
// Input: (N, M, H, W) // Input: (N, M, H, W) or (N, M, D, H, W)
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>( cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims())); layout, framework::vectorize2int(input->dims()));
// Output: (N, C, O_H, O_W) // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>( cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output_grad->dims())); layout, framework::vectorize2int(output_grad->dims()));
// Filter (M, C, K_H, K_W) // Filter (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>( cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims())); layout, framework::vectorize2int(filter->dims()));
...@@ -200,8 +206,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -200,8 +206,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
T alpha = 1.0f, beta = 0.0f; T alpha = 1.0f, beta = 0.0f;
if (input_grad) { if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
math::set_constant(ctx.device_context(), input_grad, 0); // Because beta is zero, it is unnecessary to reset input_grad.
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_output_desc, output_grad_data, handle, &alpha, cudnn_output_desc, output_grad_data,
cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo, cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
...@@ -212,8 +217,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -212,8 +217,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv backward filter --------------------- // ------------------- cudnn conv backward filter ---------------------
if (filter_grad) { if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace()); T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
math::set_constant(ctx.device_context(), filter_grad, 0); // Because beta is zero, it is unnecessary to reset filter_grad.
// Gradient with respect to the filter // Gradient with respect to the filter
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc, handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
...@@ -234,3 +238,8 @@ REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn, ...@@ -234,3 +238,8 @@ REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn,
ops::CudnnConvTransposeOpKernel<float>); ops::CudnnConvTransposeOpKernel<float>);
REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad, REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad,
ops::CudnnConvTransposeGradOpKernel<float>); ops::CudnnConvTransposeGradOpKernel<float>);
REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn,
ops::CudnnConvTransposeOpKernel<float>);
REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad,
ops::CudnnConvTransposeGradOpKernel<float>);
...@@ -30,11 +30,6 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -30,11 +30,6 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
for (size_t i = 0; i < paddings.size(); ++i) {
PADDLE_ENFORCE_EQ(paddings[i], 0,
"No Padding allowed in conv transpose op.");
}
PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
"ConvTransposeOp intput should be 4-D or 5-D tensor."); "ConvTransposeOp intput should be 4-D or 5-D tensor.");
PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(), PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
...@@ -52,7 +47,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -52,7 +47,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]}); std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
for (size_t i = 0; i < strides.size(); ++i) { for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
filter_dims[i + 2]); filter_dims[i + 2]);
} }
ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
...@@ -190,17 +185,21 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, ...@@ -190,17 +185,21 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
conv2d_transpose, conv2d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>); ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
conv2d_transpose_grad, conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>); ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
conv3d_transpose_grad, ops::ConvTransposeOpGrad); conv3d_transpose_grad, ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
conv3d_transpose, conv3d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>); ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
conv3d_transpose_grad, conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>); ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
...@@ -18,14 +18,18 @@ namespace ops = paddle::operators; ...@@ -18,14 +18,18 @@ namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
conv2d_transpose, conv2d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>); ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
conv2d_transpose_grad, conv2d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>); ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
conv3d_transpose, conv3d_transpose,
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>); ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
conv3d_transpose_grad, conv3d_transpose_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>); ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
...@@ -62,7 +62,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> { ...@@ -62,7 +62,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
Tensor* output = context.Output<Tensor>("Output"); Tensor* output = context.Output<Tensor>("Output");
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// Actually, no paddings and groups allowed in conv transpose.
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
// TODO(Zhuoyuan): Paddings can be added in future. // TODO(Zhuoyuan): Paddings can be added in future.
// groups will always be disabled in conv2dtranspose. // groups will always be disabled in conv2dtranspose.
...@@ -148,8 +147,8 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> { ...@@ -148,8 +147,8 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
} else if (filter_shape_vec.size() == 3) { } else if (filter_shape_vec.size() == 3) {
// col2vol: col_matrix -> dy // col2vol: col_matrix -> dy
// from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
col2vol(context.device_context(), col, dilations, strides, col2vol(context.device_context(), col, dilations, strides, paddings,
std::vector<int>{0, 0, 0}, &output_batch); &output_batch);
} }
} }
} }
...@@ -173,7 +172,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -173,7 +172,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
if ((!input_grad) && (!filter_grad)) return; if ((!input_grad) && (!filter_grad)) return;
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// Actually, no paddings and groups allowed in conv transpose.
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
......
...@@ -132,7 +132,7 @@ class CosSimGradKernel : public framework::OpKernel<T> { ...@@ -132,7 +132,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
// compute dy // compute dy
if (out_grad_y) { if (out_grad_y) {
out_grad_y->mutable_data<T>(context.GetPlace()); out_grad_y->mutable_data<T>(context.GetPlace());
auto dy = EigenMatrix<T>::Reshape(*out_grad_y, 1); auto dy = EigenVector<T>::Flatten(*out_grad_y);
auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast; auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast;
dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({{0}})); dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({{0}}));
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace operators {
namespace detail {
/**
* Get a reference from a pointer, with a null check. The error message is a
* printf-style format string whose arguments are forwarded via `args`.
*/
template <typename T, typename... ARGS>
inline T &Ref(T *ptr, ARGS &&... args) {
PADDLE_ENFORCE(ptr != nullptr, args...);
return *ptr;
}
} // namespace detail
} // namespace operators
} // namespace paddle
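// Minimal usage sketch of detail::Ref (hypothetical call site; `block` and
// `name` are placeholders, not symbols from this file):
//
//   auto &var = detail::Ref(block->FindRecursiveOrCreateVar(name),
//                           "Cannot find variable %s", name);
//
// Ref() fails through PADDLE_ENFORCE when the pointer is null and otherwise
// yields a reference, so call sites avoid repeating explicit null checks.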
...@@ -101,4 +101,7 @@ REGISTER_OPERATOR(fill_constant_batch_size_like, ...@@ -101,4 +101,7 @@ REGISTER_OPERATOR(fill_constant_batch_size_like,
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
fill_constant_batch_size_like, fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>, ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, double>); ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, double>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, int>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace,
int64_t>);
...@@ -19,4 +19,7 @@ namespace ops = paddle::operators; ...@@ -19,4 +19,7 @@ namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
fill_constant_batch_size_like, fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, float>, ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, float>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, double>); ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, double>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, int>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace,
int64_t>);
...@@ -54,5 +54,8 @@ namespace ops = paddle::operators; ...@@ -54,5 +54,8 @@ namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp,
ops::FillZerosLikeOpMaker); ops::FillZerosLikeOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
fill_zeros_like, fill_zeros_like, ops::FillZerosLikeKernel<paddle::platform::CPUPlace, int>,
ops::FillZerosLikeKernel<paddle::platform::CPUPlace, float>); ops::FillZerosLikeKernel<paddle::platform::CPUPlace, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::CPUPlace, float>,
ops::FillZerosLikeKernel<paddle::platform::CPUPlace, double>,
ops::FillZerosLikeKernel<paddle::platform::CPUPlace, bool>);
...@@ -17,5 +17,8 @@ ...@@ -17,5 +17,8 @@
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
fill_zeros_like, fill_zeros_like, ops::FillZerosLikeKernel<paddle::platform::GPUPlace, int>,
ops::FillZerosLikeKernel<paddle::platform::GPUPlace, float>); ops::FillZerosLikeKernel<paddle::platform::GPUPlace, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::GPUPlace, float>,
ops::FillZerosLikeKernel<paddle::platform::GPUPlace, double>,
ops::FillZerosLikeKernel<paddle::platform::GPUPlace, bool>);
...@@ -24,8 +24,17 @@ ...@@ -24,8 +24,17 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
template <typename Place, typename T>
inline void ReorderInitState(const platform::DeviceContext& ctx,
const framework::Tensor& src, const size_t* index,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<Place, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace());
row_shuffle(ctx, src, index, *dst, indexed_src);
}
template <typename Place, typename T> template <typename Place, typename T>
class GRUKernel : public framework::OpKernel<T> { class GRUKernel : public framework::OpKernel<T> {
...@@ -33,7 +42,6 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -33,7 +42,6 @@ class GRUKernel : public framework::OpKernel<T> {
void BatchCompute(const framework::ExecutionContext& context) const { void BatchCompute(const framework::ExecutionContext& context) const {
auto* input = context.Input<LoDTensor>("Input"); auto* input = context.Input<LoDTensor>("Input");
auto* h0 = context.Input<Tensor>("H0"); auto* h0 = context.Input<Tensor>("H0");
const T* h0_data = h0 ? h0->data<T>() : nullptr;
auto* weight = context.Input<Tensor>("Weight"); auto* weight = context.Input<Tensor>("Weight");
const T* weight_data = weight->data<T>(); const T* weight_data = weight->data<T>();
auto* bias = context.Input<Tensor>("Bias"); auto* bias = context.Input<Tensor>("Bias");
...@@ -66,7 +74,18 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -66,7 +74,18 @@ class GRUKernel : public framework::OpKernel<T> {
gru_value.gateWeight = const_cast<T*>(weight_data); gru_value.gateWeight = const_cast<T*>(weight_data);
gru_value.stateWeight = gru_value.stateWeight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size); const_cast<T*>(weight_data + 2 * frame_size * frame_size);
gru_value.prevOutValue = const_cast<T*>(h0_data); Tensor ordered_h0;
const size_t* order = batch_gate->lod()[2].data();
if (h0) {
// Since batch computing for GRU reorders the input sequences according to
// their length, the initial hidden state (H0) also needs to be reordered
// to match.
ReorderInitState<Place, T>(context.device_context(), *h0, order,
&ordered_h0, true);
gru_value.prevOutValue = ordered_h0.data<T>();
} else {
gru_value.prevOutValue = nullptr;
}
auto batch_starts = batch_gate->lod()[0]; auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1; size_t num_batch = batch_starts.size() - 1;
for (size_t n = 0; n < num_batch; n++) { for (size_t n = 0; n < num_batch; n++) {
...@@ -102,7 +121,6 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -102,7 +121,6 @@ class GRUGradKernel : public framework::OpKernel<T> {
public: public:
void BatchCompute(const framework::ExecutionContext& context) const { void BatchCompute(const framework::ExecutionContext& context) const {
auto* h0 = context.Input<Tensor>("H0"); auto* h0 = context.Input<Tensor>("H0");
const T* h0_data = h0 ? h0->data<T>() : nullptr;
auto* weight = context.Input<Tensor>("Weight"); auto* weight = context.Input<Tensor>("Weight");
const T* weight_data = weight->data<T>(); const T* weight_data = weight->data<T>();
auto* batch_gate = context.Input<LoDTensor>("BatchGate"); auto* batch_gate = context.Input<LoDTensor>("BatchGate");
...@@ -135,6 +153,17 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -135,6 +153,17 @@ class GRUGradKernel : public framework::OpKernel<T> {
zero(dev_ctx, &batch_gate_grad, static_cast<T>(0.0)); zero(dev_ctx, &batch_gate_grad, static_cast<T>(0.0));
zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0)); zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
Tensor ordered_h0, ordered_h0_grad;
const size_t* order = batch_gate->lod()[2].data();
if (h0) {
ReorderInitState<Place, T>(context.device_context(), *h0, order,
&ordered_h0, true);
}
if (h0_grad) {
ordered_h0_grad.mutable_data<T>(h0_grad->dims(), context.GetPlace());
zero(context.device_context(), &ordered_h0_grad, static_cast<T>(0.0));
}
bool is_reverse = context.Attr<bool>("is_reverse"); bool is_reverse = context.Attr<bool>("is_reverse");
batch_hidden_grad.set_lod(batch_hidden->lod()); batch_hidden_grad.set_lod(batch_hidden->lod());
to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse); to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
...@@ -176,14 +205,9 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -176,14 +205,9 @@ class GRUGradKernel : public framework::OpKernel<T> {
batch_reset_hidden_prev_grad.Slice(bstart, bend); batch_reset_hidden_prev_grad.Slice(bstart, bend);
gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>(); gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
if (n == 0) { if (n == 0) {
gru_value.prevOutValue = const_cast<T*>(h0_data); gru_value.prevOutValue = h0 ? ordered_h0.data<T>() : nullptr;
if (h0_grad) { gru_grad.prevOutGrad =
T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace()); h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr;
zero(dev_ctx, h0_grad, static_cast<T>(0.0));
gru_grad.prevOutGrad = h0_grad_data;
} else {
gru_grad.prevOutGrad = nullptr;
}
} else { } else {
int bstart_pre = static_cast<int>(batch_starts[n - 1]); int bstart_pre = static_cast<int>(batch_starts[n - 1]);
Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart); Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
...@@ -208,6 +232,10 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -208,6 +232,10 @@ class GRUGradKernel : public framework::OpKernel<T> {
math::ColwiseSum<Place, T> col_sum; math::ColwiseSum<Place, T> col_sum;
col_sum(dev_ctx, batch_gate_grad, bias_grad); col_sum(dev_ctx, batch_gate_grad, bias_grad);
} }
if (h0 && h0_grad) {
ReorderInitState<Place, T>(context.device_context(), ordered_h0_grad,
order, h0_grad, false);
}
} }
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
......
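// Sketch of what ReorderInitState does (assumed semantics of
// CopyMatrixRowsFunctor, with order[d] = original row of the d-th
// length-sorted sequence, taken from batch_gate->lod()[2]):
//   indexed_src == true : dst row d        <- src row order[d]  (h0 -> ordered_h0)
//   indexed_src == false: dst row order[d] <- src row d         (ordered_h0_grad -> h0_grad)
// so gradients computed in sorted order are scattered back to the original
// sequence order at the end of the backward pass.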
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace operators {
constexpr char kInput[] = "X";
constexpr char kOutput[] = "Out";
class IsEmptyOp : public framework::OperatorBase {
public:
IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
// get input
auto *var = scope.FindVar(Input(kInput));
PADDLE_ENFORCE_NOT_NULL(var);
auto &tensor = var->Get<framework::LoDTensor>();
// get output
auto *out = scope.FindVar(Output(kOutput));
PADDLE_ENFORCE_NOT_NULL(out);
auto *out_tensor = out->GetMutable<framework::LoDTensor>();
out_tensor->Resize({1});
out_tensor->mutable_data<bool>(platform::CPUPlace())[0] =
framework::product(tensor.dims()) == 0;
}
};
class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
IsEmptyOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(kInput, "(Tensor) Tensor which is to be checked.");
AddOutput(kOutput, "(Tensor) a boolean Tensor that indicates whether the input is empty.");
AddComment(R"DOC(
IsEmpty Operator which checks whether a tensor is empty.
It sets the output to true iff product(tensor.dims()) == 0, i.e. the input holds no elements.
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp,
paddle::operators::IsEmptyOpProtoMaker);
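// In effect the operator computes a single boolean (a restatement of the Run
// body above, not additional behavior):
//   out[0] = (product(X.dims()) == 0)   // true iff X holds no elements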
add_subdirectory(detail) add_subdirectory(detail)
if(WITH_GPU) if(WITH_GPU)
nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context) nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function) nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
...@@ -15,7 +15,7 @@ if(WITH_GPU) ...@@ -15,7 +15,7 @@ if(WITH_GPU)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
else() else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context) cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
cc_library(softmax SRCS softmax.cc DEPS device_context) cc_library(softmax SRCS softmax.cc DEPS device_context)
cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
......
...@@ -119,8 +119,8 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width, ...@@ -119,8 +119,8 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width,
if (index < n) { if (index < n) {
T val = 0; T val = 0;
int w = index % im_width; int w = index % im_width + padding_width;
int h = (index / im_width) % im_height; int h = (index / im_width) % im_height + padding_height;
int c = index / (im_width * im_height); int c = index / (im_width * im_height);
// compute the start and end of the output // compute the start and end of the output
......
...@@ -250,6 +250,8 @@ void axpy<platform::CPUPlace, double>(const platform::DeviceContext& context, ...@@ -250,6 +250,8 @@ void axpy<platform::CPUPlace, double>(const platform::DeviceContext& context,
template struct SetConstant<platform::CPUPlace, float>; template struct SetConstant<platform::CPUPlace, float>;
template struct SetConstant<platform::CPUPlace, double>; template struct SetConstant<platform::CPUPlace, double>;
template struct SetConstant<platform::CPUPlace, int>; template struct SetConstant<platform::CPUPlace, int>;
template struct SetConstant<platform::CPUPlace, int64_t>;
template struct SetConstant<platform::CPUPlace, bool>;
#define DEFINE_CPU_TRANS(RANK) \ #define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUPlace, float, RANK>; \ template struct Transpose<platform::CPUPlace, float, RANK>; \
......
...@@ -256,6 +256,8 @@ void axpy<platform::GPUPlace, double>(const platform::DeviceContext& context, ...@@ -256,6 +256,8 @@ void axpy<platform::GPUPlace, double>(const platform::DeviceContext& context,
template struct SetConstant<platform::GPUPlace, float>; template struct SetConstant<platform::GPUPlace, float>;
template struct SetConstant<platform::GPUPlace, double>; template struct SetConstant<platform::GPUPlace, double>;
template struct SetConstant<platform::GPUPlace, int>; template struct SetConstant<platform::GPUPlace, int>;
template struct SetConstant<platform::GPUPlace, int64_t>;
template struct SetConstant<platform::GPUPlace, bool>;
#define DEFINE_GPU_TRANS(RANK) \ #define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<platform::GPUPlace, float, RANK>; \ template struct Transpose<platform::GPUPlace, float, RANK>; \
......
...@@ -147,8 +147,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> { ...@@ -147,8 +147,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
if (input_grad) { if (input_grad) {
T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace()); T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
math::SetConstant<paddle::platform::GPUPlace, T> set_zero; // Because beta is zero, it is unnecessary to reset input_grad.
set_zero(ctx.device_context(), input_grad, static_cast<T>(0));
PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward( PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward(
handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
......
...@@ -12,6 +12,7 @@ limitations under the License. */ ...@@ -12,6 +12,7 @@ limitations under the License. */
#include "paddle/operators/sum_op.h" #include "paddle/operators/sum_op.h"
#include <vector> #include <vector>
#include "paddle/framework/var_type_inference.h" #include "paddle/framework/var_type_inference.h"
#include "paddle/operators/detail/safe_ref.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -59,13 +60,16 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -59,13 +60,16 @@ class SumOp : public framework::OperatorWithKernel {
x_vars[0]->Get<framework::SelectedRows>().value().type()), x_vars[0]->Get<framework::SelectedRows>().value().type()),
ctx.device_context()); ctx.device_context());
} else if (x_vars[0]->IsType<framework::LoDTensorArray>()) { } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
auto& array = x_vars[0]->Get<framework::LoDTensorArray>(); for (auto& x_var : x_vars) {
for (auto& each : array) { auto& array = x_var->Get<framework::LoDTensorArray>();
if (each.numel() != 0) { for (auto& each : array) {
return framework::OpKernelType(framework::ToDataType(each.type()), if (each.numel() != 0) {
ctx.device_context()); return framework::OpKernelType(framework::ToDataType(each.type()),
ctx.device_context());
}
} }
} }
PADDLE_THROW("Cannot find the input data type by all input data");
} }
PADDLE_THROW("Unexpected branch. Input type is %s", PADDLE_THROW("Unexpected branch. Input type is %s",
x_vars[0]->Type().name()); x_vars[0]->Type().name());
...@@ -96,6 +100,11 @@ class SumOpVarTypeInference : public framework::VarTypeInference { ...@@ -96,6 +100,11 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
auto& inputs = op_desc.Input("X"); auto& inputs = op_desc.Input("X");
auto var_type = framework::VarDesc::SELECTED_ROWS; auto var_type = framework::VarDesc::SELECTED_ROWS;
for (auto& name : op_desc.Input("X")) {
VLOG(10) << name << " "
<< block->FindRecursiveOrCreateVar(name)->GetType();
}
bool any_input_is_lod_tensor = std::any_of( bool any_input_is_lod_tensor = std::any_of(
inputs.begin(), inputs.end(), [block](const std::string& name) { inputs.begin(), inputs.end(), [block](const std::string& name) {
return block->FindRecursiveOrCreateVar(name)->GetType() == return block->FindRecursiveOrCreateVar(name)->GetType() ==
...@@ -103,7 +112,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference { ...@@ -103,7 +112,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
}); });
auto is_tensor_array = [block](const std::string& name) { auto is_tensor_array = [block](const std::string& name) {
return block->FindRecursiveOrCreateVar(name)->GetType() == return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() ==
framework::VarDesc::LOD_TENSOR_ARRAY; framework::VarDesc::LOD_TENSOR_ARRAY;
}; };
...@@ -113,14 +122,26 @@ class SumOpVarTypeInference : public framework::VarTypeInference { ...@@ -113,14 +122,26 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
std::all_of(inputs.begin(), inputs.end(), is_tensor_array); std::all_of(inputs.begin(), inputs.end(), is_tensor_array);
if (any_input_is_tensor_array) { if (any_input_is_tensor_array) {
PADDLE_ENFORCE(all_inputs_are_tensor_array); if (!all_inputs_are_tensor_array) {
std::ostringstream os;
for (auto& each : inputs) {
os << " " << each << " type is "
<< detail::Ref(block->FindRecursiveOrCreateVar(each)).GetType()
<< "\n";
}
PADDLE_ENFORCE(all_inputs_are_tensor_array,
"Not all inputs are tensor array:\n%s", os.str());
}
var_type = framework::VarDesc::LOD_TENSOR_ARRAY; var_type = framework::VarDesc::LOD_TENSOR_ARRAY;
} else if (any_input_is_lod_tensor) { } else if (any_input_is_lod_tensor) {
var_type = framework::VarDesc::LOD_TENSOR; var_type = framework::VarDesc::LOD_TENSOR;
} }
auto out_var_name = op_desc.Output("Out").front(); auto out_var_name = op_desc.Output("Out").front();
block->FindRecursiveOrCreateVar(out_var_name)->SetType(var_type); auto& out_var = detail::Ref(block->FindRecursiveOrCreateVar(out_var_name));
out_var.SetType(var_type);
auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front()));
out_var.SetDataType(in_var.GetDataType());
} }
}; };
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/array_operator.h" #include "paddle/operators/array_operator.h"
#include "paddle/operators/detail/safe_ref.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -33,6 +33,8 @@ class WriteToArrayOp : public ArrayOp { ...@@ -33,6 +33,8 @@ class WriteToArrayOp : public ArrayOp {
auto *out = auto *out =
scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>(); scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
if (offset >= out->size()) { if (offset >= out->size()) {
VLOG(10) << "Resize " << Output("Out") << " from " << out->size()
<< " to " << offset + 1;
out->resize(offset + 1); out->resize(offset + 1);
} }
auto *out_tensor = &out->at(offset); auto *out_tensor = &out->at(offset);
...@@ -85,11 +87,15 @@ class WriteToArrayInferVarType : public framework::VarTypeInference { ...@@ -85,11 +87,15 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDescBind &op_desc, void operator()(const framework::OpDescBind &op_desc,
framework::BlockDescBind *block) const override { framework::BlockDescBind *block) const override {
for (auto &out_var : op_desc.OutputArgumentNames()) { auto x_name = op_desc.Input("X")[0];
VLOG(10) << "Set Variable " << out_var << " as LOD_TENSOR_ARRAY"; auto out_name = op_desc.Output("Out")[0];
block->FindRecursiveOrCreateVar(out_var)->SetType( VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
framework::VarDesc::LOD_TENSOR_ARRAY); auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name),
} "Cannot found %s", out_name);
out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
auto &x =
detail::Ref(block->FindVarRecursive(x_name), "Cannot found %s", x_name);
out.SetDataType(x.GetDataType());
} }
}; };
...@@ -107,11 +113,11 @@ class ReadFromArrayOp : public ArrayOp { ...@@ -107,11 +113,11 @@ class ReadFromArrayOp : public ArrayOp {
auto &x_array = x->Get<framework::LoDTensorArray>(); auto &x_array = x->Get<framework::LoDTensorArray>();
auto *out = scope.FindVar(Output("Out")); auto *out = scope.FindVar(Output("Out"));
PADDLE_ENFORCE(out != nullptr, "Out must be set"); PADDLE_ENFORCE(out != nullptr, "Out must be set");
auto *out_tesnor = out->GetMutable<framework::LoDTensor>(); auto *out_tensor = out->GetMutable<framework::LoDTensor>();
size_t offset = GetOffset(scope, dev_ctx); size_t offset = GetOffset(scope, dev_ctx);
PADDLE_ENFORCE_LT(offset, x_array.size()); PADDLE_ENFORCE_LT(offset, x_array.size());
out_tesnor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx); out_tensor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx);
out_tesnor->set_lod(x_array[offset].lod()); out_tensor->set_lod(x_array[offset].lod());
} }
}; };
......
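The write/read pair above relies on the array growing on demand: writing at an offset past the current size resizes the LoDTensorArray (now logged through VLOG), while reading checks that the offset is in range and copies both the tensor and its LoD; the var-type inference additionally propagates the input's data type to the array output. A toy sketch of those semantics, with a std::vector of strings standing in for the tensor array (names are illustrative):

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Toy stand-in for a LoDTensorArray: writing past the end grows the array,
// as WriteToArrayOp does, and reading copies the element at the offset.
using ToyArray = std::vector<std::string>;

void WriteToArray(ToyArray* out, std::size_t offset, const std::string& value) {
  if (offset >= out->size()) {
    out->resize(offset + 1);  // the implicit resize that is now VLOG'd
  }
  (*out)[offset] = value;
}

std::string ReadFromArray(const ToyArray& in, std::size_t offset) {
  assert(offset < in.size());  // PADDLE_ENFORCE_LT in the real operator
  return in[offset];
}

int main() {
  ToyArray arr;
  WriteToArray(&arr, 3, "step-3");  // grows the array to size 4
  assert(arr.size() == 4);
  assert(ReadFromArray(arr, 3) == "step-3");
  return 0;
}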
...@@ -14,8 +14,10 @@ ...@@ -14,8 +14,10 @@
#include <vector> #include <vector>
#include "paddle/framework/executor.h" #include "paddle/framework/executor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
#include "paddle/operators/detail/safe_ref.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -26,8 +28,9 @@ using LoDTensor = framework::LoDTensor; ...@@ -26,8 +28,9 @@ using LoDTensor = framework::LoDTensor;
constexpr char kStepBlock[] = "step_block"; constexpr char kStepBlock[] = "step_block";
constexpr char kCondition[] = "Condition"; constexpr char kCondition[] = "Condition";
constexpr char kStepScopes[] = "StepScopes"; constexpr char kStepScopes[] = "StepScopes";
constexpr char kParamGrads[] = "X@Grad";
constexpr char kParameters[] = "X"; constexpr char kParameters[] = "X";
constexpr char kParamGrads[] = "X@GRAD";
constexpr char kOutputs[] = "Out";
class WhileOp : public framework::OperatorBase { class WhileOp : public framework::OperatorBase {
public: public:
...@@ -71,9 +74,9 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -71,9 +74,9 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
kCondition, kCondition,
"(Bool) An scalar. When it's False, the While Op will be terminated.") "(Bool) An scalar. When it's False, the While Op will be terminated.")
.AsDuplicable(); .AsDuplicable();
AddOutput("Out", AddOutput(kOutputs,
"A set of variables, which will be assigned with values " "A set of variables, which will be assigned with values "
"generated by perators inside the block of While Op.") "generated by the operators inside the block of While Op.")
.AsDuplicable(); .AsDuplicable();
AddOutput(kStepScopes, AddOutput(kStepScopes,
"(StepScopeVar) A vector of local scope, which size equals the " "(StepScopeVar) A vector of local scope, which size equals the "
...@@ -104,17 +107,64 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -104,17 +107,64 @@ class WhileGradOp : public framework::OperatorBase {
auto *step_scopes = auto *step_scopes =
scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>(); scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>();
auto outside_og_names = Inputs(framework::GradVarName(kOutputs));
auto inside_og_names =
Attr<std::vector<std::string>>("original_output_grad");
PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size());
for (auto cur_scope_iter = step_scopes->rbegin(); for (auto cur_scope_iter = step_scopes->rbegin();
cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) { cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) {
VLOG(3) << "Start backward at time_step "
<< cur_scope_iter - step_scopes->rbegin();
framework::Scope &cur_scope = **cur_scope_iter;
// Link OG from outside to inside
for (size_t i = 0; i < outside_og_names.size(); ++i) {
auto outside_og_name = outside_og_names[i];
auto inside_og_name = inside_og_names[i];
VLOG(10) << "Linking outside " << outside_og_name << " --> inside "
<< inside_og_name;
auto &og_outside = detail::Ref(scope.FindVar(outside_og_name));
auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name));
if (og_outside.Type().hash_code() ==
typeid(framework::LoDTensor).hash_code()) {
auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
auto &inside_tensor =
detail::Ref(og_inside.GetMutable<framework::LoDTensor>());
inside_tensor.set_lod(outside_tensor.lod());
inside_tensor.ShareDataWith(outside_tensor);
} else if (og_outside.Type().hash_code() ==
typeid(framework::LoDTensorArray).hash_code()) {
auto &outside_array = og_outside.Get<framework::LoDTensorArray>();
auto &inside_array =
detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
VLOG(10) << outside_og_name << " size = " << outside_array.size();
inside_array.resize(outside_array.size());
for (size_t j = 0; j < inside_array.size(); ++j) {
VLOG(10) << j << " " << outside_array[j].numel();
if (outside_array[j].numel() != 0) {
inside_array[j].set_lod(outside_array[j].lod());
inside_array[j].ShareDataWith(outside_array[j]);
} else {
PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0);
}
}
}
}
executor.Run(*program, *cur_scope_iter, block->ID(), false); executor.Run(*program, *cur_scope_iter, block->ID(), false);
auto &pg_names = Outputs(kParamGrads); auto &pg_names = Outputs(kParamGrads);
auto &p_names = Inputs(kParameters); auto &p_names = Inputs(kParameters);
PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
for (size_t prog_id = 0; prog_id < pg_names.size(); ++prog_id) { for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
auto inside_grad_name = framework::GradVarName(p_names[prog_id]); if (pg_names[param_id] == framework::kEmptyVarName) {
continue; // iterator doesn't have gradient
}
auto inside_grad_name = framework::GradVarName(p_names[param_id]);
// // TODO(tonyyang-savil: Not sure we need the following // // TODO(tonyyang-svail): Not sure we need the following
// // If does not compute gradient of that variable inside rnn, // // If does not compute gradient of that variable inside rnn,
// just // just
// // continue // // continue
...@@ -126,7 +176,7 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -126,7 +176,7 @@ class WhileGradOp : public framework::OperatorBase {
// zero gradient variable in step 0 // zero gradient variable in step 0
if (cur_scope_iter == step_scopes->rbegin()) { if (cur_scope_iter == step_scopes->rbegin()) {
auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); auto *var = (*cur_scope_iter)->FindVar(inside_grad_name);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name);
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
auto &inside_tensor = var->Get<framework::LoDTensor>(); auto &inside_tensor = var->Get<framework::LoDTensor>();
framework::AttributeMap attrs; framework::AttributeMap attrs;
...@@ -135,27 +185,18 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -135,27 +185,18 @@ class WhileGradOp : public framework::OperatorBase {
attrs["value"] = 0.0f; attrs["value"] = 0.0f;
auto zero_op = framework::OpRegistry::CreateOp( auto zero_op = framework::OpRegistry::CreateOp(
"fill_constant", {}, {{"Out", {pg_names[prog_id]}}}, attrs); "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs);
zero_op->Run(scope, dev_ctx); zero_op->Run(scope, dev_ctx);
} }
} }
// sum gradient // sum gradient
auto *outside_var = scope.FindVar(pg_names[prog_id]); auto new_inside_name = cur_scope.Rename(inside_grad_name);
PADDLE_ENFORCE_NOT_NULL(outside_var);
auto &outside_tensor = *outside_var->GetMutable<framework::LoDTensor>();
std::string result_var_name;
auto *local_result_var = (*cur_scope_iter)->Var(&result_var_name);
auto &local_result_tensor =
*local_result_var->GetMutable<framework::LoDTensor>();
local_result_tensor.ShareDataWith(outside_tensor);
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {result_var_name, inside_grad_name}}}, "sum", {{"X", {pg_names[param_id], new_inside_name}}},
{{"Out", {result_var_name}}}, {}); {{"Out", {pg_names[param_id]}}}, {});
sum_op->Run(**cur_scope_iter, dev_ctx); sum_op->Run(cur_scope, dev_ctx);
cur_scope.Rename(new_inside_name, inside_grad_name);
} }
} }
} }
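The rewritten accumulation above replaces the old temporary-result variable with a rename trick: the per-step gradient in the step scope is renamed to a fresh name, a sum op adds it into the persistent outside gradient, and the name is restored afterwards so later steps still find it. A loose sketch of that bookkeeping, with a plain std::map standing in for a Scope and purely illustrative names:

#include <cassert>
#include <map>
#include <string>

// Toy "scope": variable name -> accumulated gradient value.
using ToyScope = std::map<std::string, float>;

// Mirrors the bookkeeping only: rename the step-scope gradient to a unique
// temporary, let a "sum" add it into the outer gradient, then rename it back.
void AccumulateStepGradient(ToyScope* outer, ToyScope* step,
                            const std::string& outside_grad,
                            const std::string& inside_grad) {
  const std::string tmp = inside_grad + "@RENAMED";  // cur_scope.Rename(...)
  (*step)[tmp] = (*step)[inside_grad];
  step->erase(inside_grad);
  (*outer)[outside_grad] += (*step)[tmp];  // sum op: Out = X[0] + X[1]
  (*step)[inside_grad] = (*step)[tmp];     // rename back for later steps
  step->erase(tmp);
}

int main() {
  ToyScope outer{{"w@GRAD", 0.f}};
  ToyScope step1{{"w@GRAD", 1.5f}}, step0{{"w@GRAD", 2.5f}};
  AccumulateStepGradient(&outer, &step1, "w@GRAD", "w@GRAD");
  AccumulateStepGradient(&outer, &step0, "w@GRAD", "w@GRAD");
  assert(outer["w@GRAD"] == 4.0f);
  return 0;
}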
...@@ -169,29 +210,110 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -169,29 +210,110 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
virtual std::unique_ptr<framework::OpDescBind> Apply() const { virtual std::unique_ptr<framework::OpDescBind> Apply() const {
auto *grad = new framework::OpDescBind(); auto *grad = new framework::OpDescBind();
grad->SetType("while_grad"); grad->SetType("while_grad");
for (auto &input_param : this->InputNames()) { grad->SetInput(kParameters, Input(kParameters));
grad->SetInput(input_param, this->Input(input_param)); grad->SetOutput(
grad->SetOutput(framework::GradVarName(input_param), framework::GradVarName(kParameters),
this->InputGrad(input_param)); InputGrad(kParameters, /*do not drop empty gradient*/ false));
grad->SetInput(kOutputs, Output(kOutputs));
// OG should be re-calculated by step blocks, since many outputs of while op
// do not need to calculate gradients.
std::unordered_set<std::string> block_ins;
{
for (auto &p : Input(kParameters)) {
block_ins.insert(p);
}
for (auto &o : Output(kOutputs)) {
block_ins.insert(o);
}
} }
std::unordered_set<std::string> extra_inputs;
for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
for (auto &input_name : grad_block_[0]->Op(i)->InputArgumentNames()) {
if (block_ins.find(input_name) != block_ins.end()) {
continue;
}
extra_inputs.insert(input_name);
}
for (auto &output_param : this->OutputNames()) { for (auto &output_name : grad_block_[0]->Op(i)->OutputArgumentNames()) {
grad->SetInput(output_param, this->Output(output_param)); block_ins.insert(output_name);
if (output_param != kStepScopes) {
grad->SetInput(framework::GradVarName(output_param),
this->OutputGrad(output_param));
} }
} }
std::vector<std::string> extra_inputs_list;
extra_inputs_list.resize(extra_inputs.size());
std::copy(extra_inputs.begin(), extra_inputs.end(),
extra_inputs_list.begin());
grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list);
grad->SetInput(kStepScopes, Output(kStepScopes));
grad->SetAttrMap(this->Attrs()); grad->SetAttrMap(this->Attrs());
grad->SetBlockAttr(kStepBlock, *grad_block_[0]); grad->SetBlockAttr(kStepBlock, *grad_block_[0]);
// record the original output gradient names, since the gradient name of
// while operator could be renamed.
grad->SetAttr("original_output_grad", extra_inputs_list);
return std::unique_ptr<framework::OpDescBind>(grad); return std::unique_ptr<framework::OpDescBind>(grad);
} }
}; };
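Instead of forwarding every input and output gradient wholesale, the maker above seeds block_ins with the while op's parameters and outputs and then scans the step block's operators in order: anything an operator reads that has not yet been produced inside the block becomes an extra input of while_grad, and those names are also recorded in the original_output_grad attribute because they may be renamed later. A compact sketch of that scan over a toy operator list (the struct and function here are illustrative, not Paddle types):

#include <set>
#include <string>
#include <vector>

struct ToyOp {
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
};

// Anything read by an op in the step block that was neither seeded into
// block_ins nor produced by an earlier op in the block is an extra input.
std::set<std::string> CollectExtraInputs(const std::vector<ToyOp>& block_ops,
                                         std::set<std::string> block_ins) {
  std::set<std::string> extra_inputs;
  for (const auto& op : block_ops) {
    for (const auto& in : op.inputs) {
      if (block_ins.count(in) == 0) extra_inputs.insert(in);
    }
    for (const auto& out : op.outputs) {
      block_ins.insert(out);
    }
  }
  return extra_inputs;
}

int main() {
  std::vector<ToyOp> ops = {{{"x", "h@GRAD"}, {"tmp"}}, {{"tmp"}, {"x@GRAD"}}};
  auto extras = CollectExtraInputs(ops, {"x"});
  // "h@GRAD" is read but never produced inside the block, so it is extra.
  return extras.count("h@GRAD") == 1 ? 0 : 1;
}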
class WhileGradOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDescBind &op_desc,
framework::BlockDescBind *block) const override {
auto p_names = op_desc.Input(kParameters);
auto pg_names = op_desc.Output(framework::GradVarName(kParameters));
for (size_t i = 0; i < p_names.size(); ++i) {
auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
auto *g_var = block->FindVarRecursive(pg_names[i]);
if (g_var != nullptr) { // Gradient could be @EMPTY@
VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i]
<< " type: " << p_var.GetType();
g_var->SetType(p_var.GetType());
g_var->SetDataType(p_var.GetDataType());
}
}
}
};
class WhileGradOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
ctx->HasInputs(kParameters);
ctx->HasOutputs(framework::GradVarName(kParameters));
ctx->HasInputs(kOutputs);
ctx->HasInputs(framework::GradVarName(kOutputs));
auto p_names = ctx->Inputs(kParameters);
auto pg_names = ctx->Outputs(kParamGrads);
auto dims = ctx->GetInputsDim(kParameters);
auto var_types = ctx->GetInputsVarType(kParameters);
std::vector<std::string> names_to_set;
std::vector<framework::DDim> dims_to_set;
for (size_t i = 0; i < p_names.size(); ++i) {
if (pg_names[i] == framework::kEmptyVarName) {
continue;
}
if (var_types[i] == framework::VarDesc::LOD_TENSOR) {
names_to_set.push_back(pg_names[i]);
dims_to_set.push_back(dims[i]);
} else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) {
// not sure how to set the dim of LOD_TENSOR_ARRAY
names_to_set.push_back(pg_names[i]);
dims_to_set.push_back(dims[i]);
}
}
ctx->SetDims(names_to_set, dims_to_set);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OPERATOR(while, paddle::operators::WhileOp, REGISTER_OPERATOR(while, paddle::operators::WhileOp,
paddle::operators::WhileOpMaker, paddle::operators::WhileOpMaker,
paddle::operators::WhileGradOpDescMaker); paddle::operators::WhileGradOpDescMaker);
REGISTER_OPERATOR(while_grad, paddle::operators::WhileGradOp,
paddle::operators::WhileGradOpShapeInference,
paddle::operators::WhileGradOpVarTypeInference);
...@@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate, ...@@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate,
const real* grad, const real* grad,
real* momentumVec) { real* momentumVec) {
decayRate *= learningRate; decayRate *= learningRate;
#ifdef PADDLE_USE_MKLDNN #ifdef PADDLE_USE_MKLML
#pragma omp parallel for #pragma omp parallel for
#endif #endif
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
......
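The only change in this hunk is the guard macro: the OpenMP pragma now keys off the MKLML package, which bundles the Intel OpenMP runtime, instead of MKL-DNN. For context, a generic sketch of this kind of element-wise momentum update and how a single pragma parallelizes it; the exact formula in Paddle's sgdUpdateCpu may differ, and the function name below is illustrative:

#include <cstddef>
#include <cstdint>
#include <vector>

// Each element's update is independent of the others, which is what makes a
// single "#pragma omp parallel for" sufficient. Compile with -fopenmp (or an
// OpenMP-enabled runtime such as the one bundled with MKLML) to parallelize;
// without OpenMP the pragma is simply ignored.
void SgdMomentumUpdate(float learning_rate, float momentum, float decay_rate,
                       std::size_t size, float* value, const float* grad,
                       float* momentum_vec) {
  decay_rate *= learning_rate;
#pragma omp parallel for
  for (std::int64_t i = 0; i < static_cast<std::int64_t>(size); ++i) {
    momentum_vec[i] = momentum * momentum_vec[i] - learning_rate * grad[i] -
                      decay_rate * value[i];
    value[i] += momentum_vec[i];
  }
}

int main() {
  std::vector<float> w(4, 1.f), g(4, 0.5f), m(4, 0.f);
  SgdMomentumUpdate(0.1f, 0.9f, 0.0f, w.size(), w.data(), g.data(), m.data());
  return 0;
}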
...@@ -180,9 +180,10 @@ class ScopedFilterDescriptor { ...@@ -180,9 +180,10 @@ class ScopedFilterDescriptor {
const cudnnDataType_t type, const cudnnDataType_t type,
const std::vector<int>& kernel, const std::vector<int>& kernel,
const int groups = 1) { const int groups = 1) {
// filter layout: MCHW, where M is the number of // filter layout: MCHW(MCDHW), where M is the number of
// output image channels, C is the number of input image channels, // output image channels, C is the number of input image channels,
// H and W is height and width of filter. // D is the depth of the filter, H is the height of the filter, and W is the
// width of the filter.
std::vector<int> kernel_with_group(kernel.begin(), kernel.end()); std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
if (groups > 1) { if (groups > 1) {
// M /= groups // M /= groups
......
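The comment now also covers the 3-D filter layout (MCDHW). The group handling that follows divides the leading output-channel dimension, roughly as in the sketch below; whether other dimensions need adjusting depends on how the caller builds the kernel vector, so only the "M /= groups" step visible in the excerpt is mirrored here (the function name is illustrative):

#include <cassert>
#include <vector>

// kernel is laid out as {M, C, H, W} (or {M, C, D, H, W} for 3-D filters).
// With grouped convolution each group only owns M / groups output channels.
std::vector<int> FilterDimsWithGroups(std::vector<int> kernel, int groups) {
  assert(!kernel.empty());
  if (groups > 1) {
    assert(kernel[0] % groups == 0);
    kernel[0] /= groups;  // M /= groups
  }
  return kernel;
}

int main() {
  auto dims = FilterDimsWithGroups({64, 3, 3, 3}, 2);
  assert(dims[0] == 32);
  return 0;
}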
...@@ -57,8 +57,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" ...@@ -57,8 +57,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
| `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. | | `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. | | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
| `WITH_TESTING` | ON | Build unit tests binaries. | | `WITH_TESTING` | ON | Build unit tests binaries. |
| `WITH_MKLDNN` | ON | Build with [Intel® MKL DNN](https://github.com/01org/mkl-dnn) support. | | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
| `WITH_MKLML` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) support. |
| `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. | | `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
| `WITH_SWIG_PY` | ON | Build with SWIG python API support. | | `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
| `WITH_C_API` | OFF | Build capi libraries for inference. | | `WITH_C_API` | OFF | Build capi libraries for inference. |
......
...@@ -34,8 +34,7 @@ function cmake_gen() { ...@@ -34,8 +34,7 @@ function cmake_gen() {
${PYTHON_FLAGS} ${PYTHON_FLAGS}
-DWITH_DOC=OFF -DWITH_DOC=OFF
-DWITH_GPU=${WITH_GPU:-OFF} -DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_MKLDNN=${WITH_MKLDNN:-ON} -DWITH_MKL=${WITH_MKL:-ON}
-DWITH_MKLML=${WITH_MKLML:-ON}
-DWITH_AVX=${WITH_AVX:-OFF} -DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-ON} -DWITH_GOLANG=${WITH_GOLANG:-ON}
-DWITH_SWIG_PY=ON -DWITH_SWIG_PY=ON
...@@ -56,8 +55,7 @@ EOF ...@@ -56,8 +55,7 @@ EOF
${PYTHON_FLAGS} \ ${PYTHON_FLAGS} \
-DWITH_DOC=OFF \ -DWITH_DOC=OFF \
-DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_MKLDNN=${WITH_MKLDNN:-ON} \ -DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_MKLML=${WITH_MKLML:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-ON} \ -DWITH_GOLANG=${WITH_GOLANG:-ON} \
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \ -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
......
...@@ -18,8 +18,8 @@ function version(){ ...@@ -18,8 +18,8 @@ function version(){
echo "PaddlePaddle @PADDLE_VERSION@, compiled with" echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
echo " with_avx: @WITH_AVX@" echo " with_avx: @WITH_AVX@"
echo " with_gpu: @WITH_GPU@" echo " with_gpu: @WITH_GPU@"
echo " with_mkl: @WITH_MKL@"
echo " with_mkldnn: @WITH_MKLDNN@" echo " with_mkldnn: @WITH_MKLDNN@"
echo " with_mklml: @WITH_MKLML@"
echo " with_double: @WITH_DOUBLE@" echo " with_double: @WITH_DOUBLE@"
echo " with_python: @WITH_PYTHON@" echo " with_python: @WITH_PYTHON@"
echo " with_rdma: @WITH_RDMA@" echo " with_rdma: @WITH_RDMA@"
...@@ -45,8 +45,8 @@ function ver2num() { ...@@ -45,8 +45,8 @@ function ver2num() {
function cpu_config() { function cpu_config() {
# auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status
# only when MKLDNN or MKLML enabled # only when MKL enabled
if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF"]; then if [ "@WITH_MKL@" == "OFF" ]; then
return 0 return 0
fi fi
ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
...@@ -70,8 +70,8 @@ function cpu_config() { ...@@ -70,8 +70,8 @@ function cpu_config() {
function threads_config() { function threads_config() {
# auto set OMP_NUM_THREADS and MKL_NUM_THREADS # auto set OMP_NUM_THREADS and MKL_NUM_THREADS
# according to trainer_count and total processors # according to trainer_count and total processors
# only when MKLDNN or MKLML enabled # only when MKL enabled
if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF"]; then if [ "@WITH_MKL@" == "OFF" ]; then
return 0 return 0
fi fi
processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
......
...@@ -6,7 +6,7 @@ mkdir -p $TRAVIS_BUILD_DIR/build ...@@ -6,7 +6,7 @@ mkdir -p $TRAVIS_BUILD_DIR/build
cd $TRAVIS_BUILD_DIR/build cd $TRAVIS_BUILD_DIR/build
# Compile Documentation only. # Compile Documentation only.
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
make -j `nproc` gen_proto_py make -j `nproc` gen_proto_py
make -j `nproc` paddle_docs paddle_docs_cn make -j `nproc` paddle_docs paddle_docs_cn
......
...@@ -137,6 +137,10 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config, ...@@ -137,6 +137,10 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
} }
} }
if (FLAGS_use_mkldnn) {
CHECK_EQ(FLAGS_trainer_count, 1UL) << "MKLDNN only need 1 trainer";
}
if (testing) { if (testing) {
LOG(INFO) << "trainer: in testing mode"; LOG(INFO) << "trainer: in testing mode";
if (config_->getOptConfig().use_sparse_remote_updater() || if (config_->getOptConfig().use_sparse_remote_updater() ||
......
...@@ -28,35 +28,7 @@ if(WITH_PYTHON) ...@@ -28,35 +28,7 @@ if(WITH_PYTHON)
${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif() endif()
################ test_CompareTwoNets ######################
add_unittest_without_exec(test_CompareTwoNets
test_CompareTwoNets.cpp)
add_test(NAME test_CompareTwoNets
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
--config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
############### test_CompareTwoOpts ###################
add_unittest_without_exec(test_CompareTwoOpts
test_CompareTwoOpts.cpp)
add_test(NAME test_CompareTwoOpts
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts
--config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf
--num_passes=1 --need_high_accuracy=0
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
################# test_CompareSparse ##################
add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp)
if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
./.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
################# test_recurrent_machine_generation ############### ################# test_recurrent_machine_generation ###############
add_unittest_without_exec(test_recurrent_machine_generation add_unittest_without_exec(test_recurrent_machine_generation
test_recurrent_machine_generation.cpp) test_recurrent_machine_generation.cpp)
......
./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
#edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
# Note: when making change to this file, please make sure
# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
# for comparing these two nets can pass (test_CompareTwoNets)
default_initial_std(0.1)
default_device(0)
word_dim = 999
l1 = 0
l2 = 0
model_type("nn")
sparse_update = get_config_arg("sparse_update", bool, False)
TrainData(ProtoData(
type = "proto_sequence",
files = ('trainer/tests/train_sparse.list'),
))
Settings(
algorithm='sgd',
batch_size=100,
learning_rate=0.0001,
learning_rate_decay_a=4e-08,
learning_rate_decay_b=0.0,
learning_rate_schedule='poly',
)
wordvec_dim = 32
layer2_dim = 16
layer3_dim = 16
hidden_dim = 32
slot_names = ["qb", "qw", "tb", "tw"]
def ltr_network(network_name,
word_dim=word_dim,
wordvec_dim=wordvec_dim,
layer2_dim=layer2_dim,
layer3_dim=layer3_dim,
hidden_dim=hidden_dim,
slot_names=slot_names,
l1=l1,
l2=l2):
slotnum = len(slot_names)
for i in xrange(slotnum):
Inputs(slot_names[i] + network_name)
for i in xrange(slotnum):
Layer(
name = slot_names[i] + network_name,
type = "data",
size = word_dim,
device = -1,
)
Layer(
name = slot_names[i] + "_embedding_" + network_name,
type = "mixed",
size = wordvec_dim,
bias = False,
device = -1,
inputs = TableProjection(slot_names[i] + network_name,
parameter_name = "embedding.w0",
decay_rate_l1=l1,
sparse_remote_update = True,
sparse_update = sparse_update,
),
)
Layer(
name = slot_names[i] + "_rnn1_" + network_name,
type = "recurrent",
active_type = "tanh",
bias = Bias(initial_std = 0,
parameter_name = "rnn1.bias"),
inputs = Input(slot_names[i] + "_embedding_" + network_name,
parameter_name = "rnn1.w0")
)
Layer(
name = slot_names[i] + "_rnnlast_" + network_name,
type = "seqlastins",
inputs = [
slot_names[i] + "_rnn1_" + network_name,
],
)
Layer(
name = "layer2_" + network_name,
type = "fc",
active_type = "tanh",
size = layer2_dim,
bias = Bias(parameter_name = "layer2.bias"),
inputs = [Input(slot_name + "_rnnlast_" + network_name,
parameter_name = "_layer2_" + slot_name + ".w",
decay_rate = l2,
initial_smart = True) for slot_name in slot_names]
)
Layer(
name = "layer3_" + network_name,
type = "fc",
active_type = "tanh",
size = layer3_dim,
bias = Bias(parameter_name = "layer3.bias"),
inputs = [
Input("layer2_" + network_name,
parameter_name = "_layer3.w",
decay_rate = l2,
initial_smart = True),
]
)
Layer(
name = "output_" + network_name,
type = "fc",
size = 1,
bias = False,
inputs = [
Input("layer3_" + network_name,
parameter_name = "_layerO.w"),
],
)
ltr_network("left")
ltr_network("right")
Inputs("label")
Layer(
name = "label",
type = "data",
size = 1,
)
Outputs("cost", "qb_rnnlast_left")
Layer(
name = "cost",
type = "rank-cost",
inputs = ["output_left", "output_right", "label"],
)
#edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
# Note: when making change to this file, please make sure
# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
# for comparing these two nets can pass (test_CompareTwoNets)
default_initial_std(0.1)
default_device(0)
word_dim = 1451594
l1 = 0
l2 = 0
model_type("nn")
sparse_update = get_config_arg("sparse_update", bool, False)
TrainData(ProtoData(
type = "proto_sequence",
files = ('trainer/tests/train.list'),
))
Settings(
algorithm='sgd',
batch_size=100,
learning_rate=0.0001,
learning_rate_decay_a=4e-08,
learning_rate_decay_b=0.0,
learning_rate_schedule='poly',
)
wordvec_dim = 128
layer2_dim = 96
layer3_dim = 96
hidden_dim = 128
slot_names = ["qb", "qw", "tb", "tw"]
def ltr_network(network_name,
word_dim=word_dim,
wordvec_dim=wordvec_dim,
layer2_dim=layer2_dim,
layer3_dim=layer3_dim,
hidden_dim=hidden_dim,
slot_names=slot_names,
l1=l1,
l2=l2):
slotnum = len(slot_names)
for i in xrange(slotnum):
Inputs(slot_names[i] + network_name)
for i in xrange(slotnum):
Layer(
name = slot_names[i] + network_name,
type = "data",
size = word_dim,
device = -1,
)
Layer(
name = slot_names[i] + "_embedding_" + network_name,
type = "mixed",
size = wordvec_dim,
bias = False,
device = -1,
inputs = TableProjection(slot_names[i] + network_name,
parameter_name = "embedding.w0",
decay_rate_l1=l1,
sparse_remote_update = True,
sparse_update = sparse_update,
),
)
Layer(
name = slot_names[i] + "_rnn1_" + network_name,
type = "recurrent",
active_type = "tanh",
bias = Bias(initial_std = 0,
parameter_name = "rnn1.bias"),
inputs = Input(slot_names[i] + "_embedding_" + network_name,
parameter_name = "rnn1.w0")
)
Layer(
name = slot_names[i] + "_rnnlast_" + network_name,
type = "seqlastins",
inputs = [
slot_names[i] + "_rnn1_" + network_name,
],
)
Layer(
name = "layer2_" + network_name,
type = "fc",
active_type = "tanh",
size = layer2_dim,
bias = Bias(parameter_name = "layer2.bias"),
inputs = [Input(slot_name + "_rnnlast_" + network_name,
parameter_name = "_layer2_" + slot_name + ".w",
decay_rate = l2,
initial_smart = True) for slot_name in slot_names]
)
Layer(
name = "layer3_" + network_name,
type = "fc",
active_type = "tanh",
size = layer3_dim,
bias = Bias(parameter_name = "layer3.bias"),
inputs = [
Input("layer2_" + network_name,
parameter_name = "_layer3.w",
decay_rate = l2,
initial_smart = True),
]
)
Layer(
name = "output_" + network_name,
type = "fc",
size = 1,
bias = False,
inputs = [
Input("layer3_" + network_name,
parameter_name = "_layerO.w"),
],
)
ltr_network("left")
ltr_network("right")
Inputs("label")
Layer(
name = "label",
type = "data",
size = 1,
)
Outputs("cost", "qb_rnnlast_left")
Layer(
name = "cost",
type = "rank-cost",
inputs = ["output_left", "output_right", "label"],
)
#edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
# Note: when making change to this file, please make sure
# sample_trainer_config_qb_rnn.conf is changed accordingly so that the uniitest
# for comparing these two nets can pass (test_CompareTwoNets)
default_initial_std(0.1)
default_device(0)
word_dim = 1451594
l1 = 0
l2 = 0
model_type("recurrent_nn")
sparse_update = get_config_arg("sparse_update", bool, False)
TrainData(ProtoData(
type = "proto_sequence",
files = ('trainer/tests/train.list'),
))
Settings(
algorithm='sgd',
batch_size=100,
learning_rate=0.0001,
learning_rate_decay_a=4e-08,
learning_rate_decay_b=0.0,
learning_rate_schedule='poly',
)
wordvec_dim = 128
layer2_dim = 96
layer3_dim = 96
hidden_dim = 128
slot_names = ["qb", "qw", "tb", "tw"]
def SimpleRecurrentLayer(name,
size,
active_type,
bias,
input_layer_name,
parameter_name,
seq_reversed = False):
RecurrentLayerGroupBegin(name + "_layer_group",
in_links=[input_layer_name],
out_links=[name],
seq_reversed=seq_reversed)
memory_name = Memory(name=name, size=size)
Layer(
name = name,
type = "mixed",
size = size,
active_type = active_type,
bias = bias,
inputs = [IdentityProjection(input_layer_name),
FullMatrixProjection(memory_name,
parameter_name = parameter_name,
),
]
)
RecurrentLayerGroupEnd(name + "_layer_group")
def ltr_network(network_name,
word_dim=word_dim,
wordvec_dim=wordvec_dim,
layer2_dim=layer2_dim,
layer3_dim=layer3_dim,
hidden_dim=hidden_dim,
slot_names=slot_names,
l1=l1,
l2=l2):
slotnum = len(slot_names)
for i in xrange(slotnum):
Inputs(slot_names[i] + network_name)
for i in xrange(slotnum):
Layer(
name = slot_names[i] + network_name,
type = "data",
size = word_dim,
device = -1,
)
Layer(
name = slot_names[i] + "_embedding_" + network_name,
type = "mixed",
size = wordvec_dim,
bias = False,
device = -1,
inputs = TableProjection(slot_names[i] + network_name,
parameter_name = "embedding.w0",
decay_rate_l1=l1,
sparse_remote_update = True,
sparse_update = sparse_update,
),
)
SimpleRecurrentLayer(
name = slot_names[i] + "_rnn1_" + network_name,
size = hidden_dim,
active_type = "tanh",
bias = Bias(initial_std = 0,
parameter_name = "rnn1.bias"),
input_layer_name = slot_names[i] + "_embedding_" + network_name,
parameter_name = "rnn1.w0",
)
Layer(
name = slot_names[i] + "_rnnlast_" + network_name,
type = "seqlastins",
inputs = [
slot_names[i] + "_rnn1_" + network_name,
],
)
Layer(
name = "layer2_" + network_name,
type = "fc",
active_type = "tanh",
size = layer2_dim,
bias = Bias(parameter_name = "layer2.bias"),
inputs = [Input(slot_name + "_rnnlast_" + network_name,
parameter_name = "_layer2_" + slot_name + ".w",
decay_rate = l2,
initial_smart = True) for slot_name in slot_names]
)
Layer(
name = "layer3_" + network_name,
type = "fc",
active_type = "tanh",
size = layer3_dim,
bias = Bias(parameter_name = "layer3.bias"),
inputs = [
Input("layer2_" + network_name,
parameter_name = "_layer3.w",
decay_rate = l2,
initial_smart = True),
]
)
Layer(
name = "output_" + network_name,
type = "fc",
size = 1,
bias = False,
inputs = [
Input("layer3_" + network_name,
parameter_name = "_layerO.w"),
],
)
ltr_network("left")
ltr_network("right")
Inputs("label")
Layer(
name = "label",
type = "data",
size = 1,
)
Outputs("cost", "qb_rnnlast_left")
Layer(
name = "cost",
type = "rank-cost",
inputs = ["output_left", "output_right", "label"],
)
...@@ -20,28 +20,6 @@ import random ...@@ -20,28 +20,6 @@ import random
import json import json
import string import string
@provider(slots=[
SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1),
IndexSlot(3)
])
def processNonSequenceData(obj, filename):
with open(filename, "rb") as f:
for line in f:
slots_str = line.split(';')
index = int(slots_str[0])
non_values = map(int, slots_str[1].split()[1:])
dense = map(float, slots_str[2].split()[1:])
strs = slots_str[4].strip().split(' ', 1)[1]
def __values_mapper__(s):
s = s.split(":")
return int(s[0]), float(s[1])
values = map(__values_mapper__, slots_str[3].split()[1:])
yield [non_values, dense, values, strs, index]
SPARSE_ID_LIMIT = 1000 SPARSE_ID_LIMIT = 1000
SPARSE_ID_COUNT = 100 SPARSE_ID_COUNT = 100
SEQUENCE_LIMIT = 50 SEQUENCE_LIMIT = 50
...@@ -146,8 +124,6 @@ def processSubSeqAndGenerateData(obj, name): ...@@ -146,8 +124,6 @@ def processSubSeqAndGenerateData(obj, name):
if __name__ == "__main__": if __name__ == "__main__":
pvd = processNonSequenceData("test.txt")
print pvd.getNextBatch(100)
pvd = processSeqAndGenerateData("_") pvd = processSeqAndGenerateData("_")
print pvd.getNextBatch(100) print pvd.getNextBatch(100)
pvd = processSubSeqAndGenerateData("_") pvd = processSubSeqAndGenerateData("_")
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <paddle/utils/PythonUtil.h>
#include <algorithm>
#include <cstdlib>
#include "paddle/trainer/Trainer.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
DECLARE_int32(gpu_id);
DECLARE_bool(local);
DECLARE_bool(use_gpu);
DECLARE_string(config);
DECLARE_string(nics);
DEFINE_string(config_file_a, "", "config of one network to compare");
DEFINE_string(config_file_b, "", "config of another network to compare");
DEFINE_bool(need_high_accuracy,
true,
"whether need to run in double accuracy (recommended)");
DEFINE_double(
max_diff_ratio,
0.0f,
"max diff ratio allowed for outputs and parameters (value/gradient)");
struct ComData {
vector<Argument> outArgs;
vector<ParameterPtr> parameters;
};
void calcGradient(ComData& data, const string configFile) {
FLAGS_config = configFile;
FLAGS_local = true;
FLAGS_use_gpu = false;
FLAGS_nics = "";
*ThreadLocalRand::getSeed() = 0;
srand(0);
Trainer trainer;
trainer.init(TrainerConfigHelper::createFromFlagConfig(), false);
data.parameters = trainer.getGradientMachine()->getParameters();
trainer.getDataProvider()->setSkipShuffle();
trainer.train();
}
void checkBuffer(real* A,
const char* desA,
real* B,
const char* desB,
size_t len,
size_t width = 1) {
int nNum = 0;
for (size_t i = 0; i < len; ++i) {
real diff = fabs(A[i] - B[i]);
if (diff > 0.0f &&
diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_max_diff_ratio) {
nNum++;
LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i]
<< " " << desB << " : " << B[i];
}
}
EXPECT_EQ(0, nNum);
LOG(INFO) << "\n\n";
}
void compareGradient(ComData& comDataA, ComData& comDataB) {
vector<Argument> outArgsA = comDataA.outArgs;
vector<Argument> outArgsB = comDataB.outArgs;
for (size_t i = 0; i < outArgsA.size(); ++i) {
CpuMatrix matA(outArgsA[i].value->getHeight(),
outArgsA[i].value->getWidth());
CpuMatrix matB(outArgsB[i].value->getHeight(),
outArgsB[i].value->getWidth());
matA.copyFrom(*outArgsA[i].value);
matB.copyFrom(*outArgsB[i].value);
LOG(INFO) << "\n--------------------------------"
<< " Check Network Output_" << i << ":"
<< " -------------------------------------\n";
checkBuffer(matA.getData(),
"network A output",
matB.getData(),
"network B output",
matA.getElementCnt(),
matA.getWidth());
}
vector<ParameterPtr>& parametersA = comDataA.parameters;
vector<ParameterPtr>& parametersB = comDataB.parameters;
LOG(INFO) << "\n\n--------------------------------"
<< " Check Gradient Machine Parameters:"
<< " -------------------------------------\n";
for (size_t i = 0; i < parametersA.size(); ++i) {
ParameterPtr parameterA, parameterB;
parameterA = parametersA[i];
parameterB = parametersB[i];
CpuVector paraA(parameterA->getSize());
CpuVector paraB(parameterB->getSize());
paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName()
<< " ; size : " << paraA.getSize() << " ------------";
checkBuffer(paraA.getData(),
"Network A",
paraB.getData(),
"Network B",
paraA.getSize());
CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT));
CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT));
LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName()
<< " ; size : " << gradA.getSize() << " -----------";
checkBuffer(gradA.getData(),
"Network A",
gradB.getData(),
"Network B",
gradA.getSize());
}
}
TEST(Trainer, create) {
ComData dataA;
calcGradient(dataA, FLAGS_config_file_a);
LOG(INFO) << "\n\ntraining of Network A is finished\n\n";
ComData dataB;
calcGradient(dataB, FLAGS_config_file_b);
LOG(INFO) << "\n\ntraining of the Network B is finished\n\n";
compareGradient(dataA, dataB);
}
int main(int argc, char** argv) {
paddle::initMain(argc, argv);
testing::InitGoogleTest(&argc, argv);
initPython(argc, argv);
#ifndef PADDLE_TYPE_DOUBLE
if (FLAGS_need_high_accuracy) {
LOG(INFO) << "skip test due to it's need high accuracy";
return 0;
}
if (FLAGS_max_diff_ratio == 0.0f) {
FLAGS_max_diff_ratio = 2e-4;
LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
<< " in low accuracy mode";
}
#else
if (FLAGS_max_diff_ratio == 0.0f) {
FLAGS_max_diff_ratio = 2e-7;
LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
<< " in high accuracy mode";
}
#endif
int ret = RUN_ALL_TESTS();
return ret;
}
...@@ -25,45 +25,9 @@ limitations under the License. */ ...@@ -25,45 +25,9 @@ limitations under the License. */
#include <unordered_set> #include <unordered_set>
#include "picojson.h" #include "picojson.h"
void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual);
void checkValue(std::vector<paddle::Argument>& arguments, picojson::array& arr); void checkValue(std::vector<paddle::Argument>& arguments, picojson::array& arr);
const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/"; const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/";
TEST(PyDataProviderWrapper, NoSequenceData) {
paddle::DataConfig conf;
conf.set_type("py");
conf.set_load_data_module(std::string("testPyDataWrapper"));
conf.set_load_data_object(std::string("processNonSequenceData"));
conf.set_async_load_data(false);
conf.clear_files();
conf.set_files(kDir + "test_pydata_provider_wrapper.list");
paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
provider->setSkipShuffle();
provider->reset();
paddle::DataBatch batchFromPy;
provider->getNextBatch(100, &batchFromPy);
paddle::DataConfig conf2;
conf2.set_type("proto");
conf2.set_async_load_data(false);
conf2.clear_files();
conf2.set_files(kDir + "test_pydata_provider_wrapper.protolist");
provider.reset(paddle::DataProvider::create(conf2, false));
provider->setSkipShuffle();
provider->reset();
paddle::DataBatch batchFromProto;
provider->getNextBatch(100, &batchFromProto);
std::vector<paddle::Argument>& pyArguments = batchFromPy.getStreams();
std::vector<paddle::Argument>& protoArguments = batchFromProto.getStreams();
EXPECT_EQ(pyArguments.size(), protoArguments.size());
for (size_t i = 0; i < pyArguments.size(); ++i) {
checkEqual(protoArguments[i], pyArguments[i]);
}
}
TEST(PyDataProviderWrapper, SequenceData) { TEST(PyDataProviderWrapper, SequenceData) {
paddle::DataConfig conf; paddle::DataConfig conf;
conf.set_type("py"); conf.set_type("py");
...@@ -148,66 +112,6 @@ int main(int argc, char** argv) { ...@@ -148,66 +112,6 @@ int main(int argc, char** argv) {
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
} }
void checkEqual(const paddle::Argument& expect,
const paddle::Argument& actual) {
if (expect.value) {
EXPECT_TRUE(actual.value != nullptr);
paddle::Matrix* e = expect.value.get();
paddle::Matrix* a = actual.value.get();
EXPECT_EQ(e->getWidth(), a->getWidth());
EXPECT_EQ(e->getHeight(), a->getHeight());
if (dynamic_cast<paddle::CpuSparseMatrix*>(e)) {
paddle::CpuSparseMatrix* se = dynamic_cast<paddle::CpuSparseMatrix*>(e);
paddle::CpuSparseMatrix* sa = dynamic_cast<paddle::CpuSparseMatrix*>(a);
EXPECT_EQ(se->getFormat(), sa->getFormat());
EXPECT_EQ(se->getElementCnt(), sa->getElementCnt());
size_t rowSize = se->getFormat() == paddle::SPARSE_CSC
? se->getElementCnt()
: se->getHeight() + 1;
size_t colSize = se->getFormat() == paddle::SPARSE_CSC
? se->getWidth() + 1
: se->getElementCnt();
for (size_t i = 0; i < rowSize; ++i) {
EXPECT_EQ(se->getRows()[i], sa->getRows()[i]);
}
for (size_t i = 0; i < colSize; ++i) {
EXPECT_EQ(se->getCols()[i], sa->getCols()[i]);
}
if (se->getValueType() == paddle::FLOAT_VALUE) {
EXPECT_EQ(paddle::FLOAT_VALUE, sa->getValueType());
for (size_t i = 0; i < se->getElementCnt(); ++i) {
EXPECT_EQ(se->getValue()[i], sa->getValue()[i]);
}
}
} else if (dynamic_cast<paddle::CpuMatrix*>(e)) {
EXPECT_EQ(e->getElementCnt(), a->getElementCnt());
for (size_t i = 0; i < e->getElementCnt(); ++i) {
EXPECT_EQ(e->getData()[i], a->getData()[i]);
}
}
}
if (expect.ids) {
EXPECT_TRUE(actual.ids != nullptr);
paddle::VectorT<int>* e = expect.ids.get();
paddle::VectorT<int>* a = actual.ids.get();
EXPECT_EQ(e->getSize(), a->getSize());
for (size_t i = 0; i < e->getSize(); ++i) {
EXPECT_EQ(e->getData()[i], a->getData()[i]);
}
}
if (expect.strs) {
EXPECT_TRUE(actual.strs != nullptr);
std::vector<std::string>* e = expect.strs.get();
std::vector<std::string>* a = actual.strs.get();
EXPECT_EQ(e->size(), a->size());
for (size_t i = 0; i < e->size(); ++i) {
EXPECT_EQ((*e)[i], (*a)[i]);
}
}
}
void checkValue(std::vector<paddle::Argument>& arguments, void checkValue(std::vector<paddle::Argument>& arguments,
picojson::array& arr) { picojson::array& arr) {
// CHECK SLOT 0, Sparse Value. // CHECK SLOT 0, Sparse Value.
......
...@@ -1826,7 +1826,7 @@ class FCLayer(LayerBase): ...@@ -1826,7 +1826,7 @@ class FCLayer(LayerBase):
self.layer_type = 'mkldnn_fc' self.layer_type = 'mkldnn_fc'
config_assert( config_assert(
len(inputs) == 1, len(inputs) == 1,
"MkldnnFCLayer support one and only one input!") "MKLDNNFCLayer support one and only one input!")
super(FCLayer, self).__init__( super(FCLayer, self).__init__(
name, self.layer_type, size, inputs=inputs, **xargs) name, self.layer_type, size, inputs=inputs, **xargs)
for input_index in xrange(len(self.inputs)): for input_index in xrange(len(self.inputs)):
...@@ -1837,7 +1837,7 @@ class FCLayer(LayerBase): ...@@ -1837,7 +1837,7 @@ class FCLayer(LayerBase):
sparse = format == "csr" or format == "csc" sparse = format == "csr" or format == "csc"
if use_mkldnn: if use_mkldnn:
config_assert(not sparse, config_assert(not sparse,
"MkldnnFCLayer do not support sparse format yet") "MKLDNNFCLayer do not support sparse format yet")
if use_mkldnn_wgt: if use_mkldnn_wgt:
dims = [self.config.size, input_layer.size] dims = [self.config.size, input_layer.size]
if sparse: if sparse:
...@@ -1853,7 +1853,7 @@ class FCLayer(LayerBase): ...@@ -1853,7 +1853,7 @@ class FCLayer(LayerBase):
@config_layer('mkldnn_fc') @config_layer('mkldnn_fc')
class MkldnnFcLayer(FCLayer): class MKLDNNFcLayer(FCLayer):
layer_type = 'mkldnn_fc' layer_type = 'mkldnn_fc'
...@@ -3209,6 +3209,18 @@ class SubNestedSequenceLayer(LayerBase): ...@@ -3209,6 +3209,18 @@ class SubNestedSequenceLayer(LayerBase):
self.set_layer_size(size) self.set_layer_size(size)
@config_layer('dot_prod')
class DotProdLayer(LayerBase):
def __init__(self, name, inputs, device=None):
super(DotProdLayer, self).__init__(
name, 'dot_prod', 0, inputs, device=device)
config_assert(len(inputs) == 2, 'DotProdLayer must have 2 inputs.')
config_assert(
self.get_input_layer(0).size == self.get_input_layer(1).size,
"Two inputs should have the same size.")
self.set_layer_size(1)
@config_layer('out_prod') @config_layer('out_prod')
class OuterProdLayer(LayerBase): class OuterProdLayer(LayerBase):
def __init__(self, name, inputs, device=None): def __init__(self, name, inputs, device=None):
...@@ -3506,11 +3518,17 @@ def ExpressionLayer(name, inputs, **xargs): ...@@ -3506,11 +3518,17 @@ def ExpressionLayer(name, inputs, **xargs):
@config_layer('concat') @config_layer('concat')
class ConcatenateLayer(LayerBase): class ConcatenateLayer(LayerBase):
layer_type = 'concat'
def __init__(self, name, inputs, bias=False, **xargs): def __init__(self, name, inputs, bias=False, **xargs):
config_assert(inputs, 'inputs cannot be empty') config_assert(inputs, 'inputs cannot be empty')
config_assert(not bias, 'ConcatenateLayer cannot support bias.') config_assert(not bias, 'ConcatenateLayer cannot support bias.')
use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
if self.layer_type == "mkldnn_concat":
config_assert(use_mkldnn, "mkldnn_concat only support MKLDNN")
self.layer_type = 'mkldnn_concat' if use_mkldnn else 'concat'
super(ConcatenateLayer, self).__init__( super(ConcatenateLayer, self).__init__(
name, 'concat', 0, inputs=inputs, **xargs) name, self.layer_type, 0, inputs=inputs, **xargs)
size = 0 size = 0
for input_index in xrange(len(self.inputs)): for input_index in xrange(len(self.inputs)):
assert self.get_input_layer(0).height == self.get_input_layer( assert self.get_input_layer(0).height == self.get_input_layer(
...@@ -3530,6 +3548,11 @@ class ConcatenateLayer(LayerBase): ...@@ -3530,6 +3548,11 @@ class ConcatenateLayer(LayerBase):
self.set_layer_size(size) self.set_layer_size(size)
@config_layer('mkldnn_concat')
class MKLDNNConcatLayer(ConcatenateLayer):
layer_type = 'mkldnn_concat'
# like concat layer, but each input layer was processed by a Projection. # like concat layer, but each input layer was processed by a Projection.
@config_layer('concat2') @config_layer('concat2')
class ConcatenateLayer2(LayerBase): class ConcatenateLayer2(LayerBase):
......
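The dot_prod layer registered above accepts exactly two inputs of equal size and emits a single value per sample. Written out directly, the forward pass is a per-row dot product, as in this sketch where plain nested vectors stand in for the layer's argument matrices (the function name is illustrative):

#include <cassert>
#include <cstddef>
#include <vector>

// Per-sample dot product: both inputs must have the same width, and the
// layer's output width is 1 (one scalar per row).
std::vector<float> DotProdForward(const std::vector<std::vector<float>>& in1,
                                  const std::vector<std::vector<float>>& in2) {
  assert(in1.size() == in2.size());
  std::vector<float> out(in1.size(), 0.f);
  for (std::size_t i = 0; i < in1.size(); ++i) {
    assert(in1[i].size() == in2[i].size());  // "same size" config_assert
    for (std::size_t k = 0; k < in1[i].size(); ++k) {
      out[i] += in1[i][k] * in2[i][k];
    }
  }
  return out;
}

int main() {
  auto out = DotProdForward({{1.f, 2.f}}, {{3.f, 4.f}});
  assert(out.size() == 1 && out[0] == 11.f);
  return 0;
}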
...@@ -115,6 +115,7 @@ __all__ = [ ...@@ -115,6 +115,7 @@ __all__ = [
'huber_classification_cost', 'huber_classification_cost',
'block_expand_layer', 'block_expand_layer',
'maxout_layer', 'maxout_layer',
'dot_prod_layer',
'out_prod_layer', 'out_prod_layer',
'printer_layer', 'printer_layer',
'print_layer', 'print_layer',
...@@ -197,6 +198,7 @@ class LayerType(object): ...@@ -197,6 +198,7 @@ class LayerType(object):
SCALING_LAYER = 'scaling' SCALING_LAYER = 'scaling'
TRANS_LAYER = 'trans' TRANS_LAYER = 'trans'
ROTATE_LAYER = 'rotate' ROTATE_LAYER = 'rotate'
DOT_PROD_LAYER = 'dot_prod'
OUT_PROD_LAYER = 'out_prod' OUT_PROD_LAYER = 'out_prod'
FEATURE_MAP_EXPAND_LAYER = 'featmap_expand' FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
...@@ -4140,6 +4142,45 @@ def maxid_layer(input, name=None, layer_attr=None): ...@@ -4140,6 +4142,45 @@ def maxid_layer(input, name=None, layer_attr=None):
size=l.config.size) size=l.config.size)
@wrap_name_default()
def dot_prod_layer(input1, input2, name=None, layer_attr=None):
"""
A layer for computing the dot product of two vectors.
The example usage is:
.. code-block:: python
dot_prod = dot_prod_layer(input1=vec1, input2=vec2)
:param name: The name of this layer. It is optional.
:type name: basestring
:param input1: The first input layer.
    :type input1: LayerOutput
:param input2: The second input layer.
:type input2: LayerOutput
:param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object.
:rtype: LayerOutput
"""
assert isinstance(input1, LayerOutput)
assert isinstance(input2, LayerOutput)
assert input1.size == input2.size, ("Two inputs should have the same size.")
l = Layer(
name=name,
type=LayerType.DOT_PROD_LAYER,
inputs=[input1.name, input2.name],
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
name=name,
layer_type=LayerType.DOT_PROD_LAYER,
parents=[input1, input2],
size=l.config.size)
@wrap_name_default() @wrap_name_default()
def out_prod_layer(input1, input2, name=None, layer_attr=None): def out_prod_layer(input1, input2, name=None, layer_attr=None):
""" """
......
...@@ -10,6 +10,7 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la ...@@ -10,6 +10,7 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la
test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer) test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer
test_dot_prod_layer)
export whole_configs=(test_split_datasource) export whole_configs=(test_split_datasource)
type: "nn"
layers {
name: "vector1"
type: "data"
size: 10
active_type: ""
}
layers {
name: "vector2"
type: "data"
size: 10
active_type: ""
}
layers {
name: "__dot_prod_layer_0__"
type: "dot_prod"
size: 1
active_type: ""
inputs {
input_layer_name: "vector1"
}
inputs {
input_layer_name: "vector2"
}
}
input_layer_names: "vector1"
input_layer_names: "vector2"
output_layer_names: "__dot_prod_layer_0__"
sub_models {
name: "root"
layer_names: "vector1"
layer_names: "vector2"
layer_names: "__dot_prod_layer_0__"
input_layer_names: "vector1"
input_layer_names: "vector2"
output_layer_names: "__dot_prod_layer_0__"
is_recurrent_layer_group: false
}
from paddle.trainer_config_helpers import *
vec1 = data_layer(name='vector1', size=10)
vec2 = data_layer(name='vector2', size=10)
dot_product = dot_prod_layer(input1=vec1, input2=vec2)
outputs(dot_product)
...@@ -4,7 +4,10 @@ import collections ...@@ -4,7 +4,10 @@ import collections
import numpy as np import numpy as np
import copy import copy
__all__ = ['Block', 'Variable', 'Program', 'Operator', 'default_startup_program', 'default_main_program'] __all__ = [
'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
'default_main_program'
]
def unique_name(prefix): def unique_name(prefix):
...@@ -12,9 +15,9 @@ def unique_name(prefix): ...@@ -12,9 +15,9 @@ def unique_name(prefix):
return "_".join([prefix, str(uid)]) return "_".join([prefix, str(uid)])
def _debug_string_(proto, throw_on_error=True):
    error_fields = list()
    if not proto.IsInitialized(error_fields) and throw_on_error:
        raise ValueError("{0} are not initialized\nThe message is {1}".format(
            error_fields, proto))
    return proto.__str__()

@@ -101,9 +104,12 @@ class Variable(object):
        self.stop_gradient = stop_gradient

    def __str__(self):
        return self.to_string(True)

    def to_string(self, throw_on_error):
        protostr = self.desc.serialize_to_string()
        proto = framework_pb2.VarDesc.FromString(str(protostr))
        return _debug_string_(proto, throw_on_error)

    __repr__ = __str__

@@ -229,17 +235,17 @@ class Operator(object):
                    in_proto.name)
                if found:
                    in_args = inputs[in_proto.name]
                    if not isinstance(in_args, list):
                        in_args = [in_args]
                    if not in_proto.duplicable and len(in_args) > 1:
                        raise ValueError(
                            "Input %s expects only one input, but %d are given."
                            % (in_proto.name, len(in_args)))
                    in_arg_names = []
                    for arg in in_args:
                        in_arg_names.append(arg.name)
                    self.desc.set_input(in_proto.name, in_arg_names)
                else:
                    self.desc.set_input(in_proto.name, [])

@@ -257,18 +263,18 @@ class Operator(object):
                        str(e) for e in given)))

            for out_proto in proto.outputs:
                out_args = outputs[out_proto.name]
                if not isinstance(out_args, list):
                    out_args = [out_args]
                if not out_proto.duplicable and len(out_args) > 1:
                    raise ValueError(
                        "Output %s expects only one output, but %d are given." %
                        (out_proto.name, len(out_args)))
                out_arg_names = []
                for arg in out_args:
                    out_arg_names.append(arg.name)
                    arg.op = self
                self.desc.set_output(out_proto.name, out_arg_names)

        if attrs is not None:
            if not isinstance(attrs, dict):

@@ -291,10 +297,13 @@ class Operator(object):
        self.desc.infer_var_type(self.block.desc)
        self.desc.infer_shape(self.block.desc)

    def to_string(self, throw_on_error):
        protostr = self.desc.serialize_to_string()
        proto = framework_pb2.OpDesc.FromString(str(protostr))
        return _debug_string_(proto, throw_on_error)

    def __str__(self):
        return self.to_string(True)

    __repr__ = __str__

@@ -349,9 +358,12 @@ class Block(object):
        self.program = program

    def __str__(self):
        return self.to_string(True)

    def to_string(self, throw_on_error):
        protostr = self.desc.serialize_to_string()
        proto = framework_pb2.BlockDesc.FromString(str(protostr))
        return _debug_string_(proto, throw_on_error)

    __repr__ = __str__

@@ -454,9 +466,12 @@ class Program(object):
        self.current_block_idx = 0

    def __str__(self):
        return self.to_string(True)

    def to_string(self, throw_on_error):
        protostr = self.desc.serialize_to_string()
        proto = framework_pb2.ProgramDesc.FromString(str(protostr))
        return _debug_string_(proto, throw_on_error)

    def clone(self):
        p = Program()

@@ -512,7 +527,14 @@ class Program(object):
        assert isinstance(target, Variable)
        if no_grad_set is None:
            no_grad_set = set()
        try:
            param_to_grad_info = self.desc.append_backward(target.desc,
                                                           no_grad_set)
        except Exception as e:
            raise core.EnforceNotMet(
                str(e) + "\nCurrent protobuf is\n{0}".format(
                    self.to_string(False)))
        self.sync_with_cpp()
        return param_to_grad_info

@@ -563,8 +585,10 @@ class Parameter(Variable):
g_main_program = Program()
g_startup_program = Program()


def default_startup_program():
    return g_startup_program


def default_main_program():
    return g_main_program
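The `throw_on_error` switch introduced above makes it possible to dump a program even while it is only partially built, which is exactly what the new `append_backward` error path relies on. A minimal usage sketch (the layer calls and variable names below are illustrative, not part of this diff):

import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers

x = layers.data(name='x', shape=[13], data_type='float32')
y_predict = layers.fc(input=x, size=1, act=None)

prog = framework.g_main_program
print(prog)                   # same as prog.to_string(True); raises if a proto field is missing
print(prog.to_string(False))  # lenient dump, as embedded in the append_backward error message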
@@ -66,10 +66,13 @@ def parse_graph(program, graph, var_dict, **kwargs):
        if not var_dict.has_key(var):
            var_dict[var] = "Feed"

    temp_id = 0
    proto = framework_pb2.ProgramDesc.FromString(
        program.desc.serialize_to_string())
    for block in proto.blocks:
        for op in block.ops:
            op.type = op.type + "_" + str(temp_id)
            temp_id += 1
            graph.node(**draw_node(op))
            for o in op.outputs:
                for arg in o.arguments:
@@ -78,6 +81,7 @@ def parse_graph(program, graph, var_dict, **kwargs):
            for arg in e.arguments:
                if var_dict.has_key(arg):
                    graph.edge(**draw_edge(var_dict, op, e, arg))
        break  # only plot the first block


def draw_graph(startup_program, main_program, **kwargs):
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.io import save_persistables, load_persistables
from paddle.v2.fluid.optimizer import SGDOptimizer

x = layers.data(name='x', shape=[13], data_type='float32')

y_predict = layers.fc(input=x, size=1, act=None)

y = layers.data(name='y', shape=[1], data_type='float32')

cost = layers.square_error_cost(input=y_predict, label=y)
avg_cost = layers.mean(x=cost)

sgd_optimizer = SGDOptimizer(learning_rate=0.001)
opts = sgd_optimizer.minimize(avg_cost)

BATCH_SIZE = 20
......
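For context, the training loop that typically follows the graph above looks roughly like the sketch below; the reader pipeline and the LoDTensor feeding style are assumptions borrowed from other fluid book examples, not part of this diff.

place = core.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program(), feed={}, fetch_list=[])

train_reader = paddle.batch(
    paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
    batch_size=BATCH_SIZE)

for data in train_reader():
    x_data = np.array(map(lambda d: d[0], data)).astype("float32")
    y_data = np.array(map(lambda d: d[1], data)).astype("float32")
    y_data = y_data.reshape([len(y_data), 1])

    tensor_x = core.LoDTensor()
    tensor_x.set(x_data, place)
    tensor_y = core.LoDTensor()
    tensor_y.set(y_data, place)

    outs = exe.run(framework.default_main_program(),
                   feed={'x': tensor_x, 'y': tensor_y},
                   fetch_list=[avg_cost])
    print(outs[0])  # average squared error for this mini-batch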
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.initializer import XavierInitializer
from paddle.v2.fluid.optimizer import AdamOptimizer


def resnet_cifar10(input, depth=32):
    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
        tmp = layers.conv2d(
            input=input,
            filter_size=filter_size,
@@ -24,9 +19,7 @@ def resnet_cifar10(input, depth=32):
            padding=padding,
            act=None,
            bias_attr=False)
        return layers.batch_norm(input=tmp, act=act)

    def shortcut(input, ch_in, ch_out, stride, program, init_program):
        if ch_in != ch_out:
@@ -35,28 +28,11 @@ def resnet_cifar10(input, depth=32):
        else:
            return input

    def basicblock(input, ch_in, ch_out, stride):
        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
        short = shortcut(input, ch_in, ch_out, stride)
        return layers.elementwise_add(x=tmp, y=short, act='relu')

    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
        tmp = block_func(input, ch_in, ch_out, stride)
@@ -67,45 +43,17 @@ def resnet_cifar10(input, depth=32):
    assert (depth - 2) % 6 == 0
    n = (depth - 2) / 6
    conv1 = conv_bn_layer(
        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
    pool = layers.pool2d(
        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
    return pool


def vgg16_bn_drop(input):
    def conv_block(input, num_filter, groups, dropouts):
        return nets.img_conv_group(
            input=input,
            pool_size=2,
@@ -123,22 +71,14 @@ def vgg16_bn_drop(input):
    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])

    drop = layers.dropout(x=conv5, dropout_prob=0.5)
    fc1 = layers.fc(input=drop,
                    size=512,
                    act=None,
                    param_attr={"initializer": XavierInitializer()})
    reshape1 = layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1)))
    bn = layers.batch_norm(input=reshape1, act='relu')
    drop2 = layers.dropout(x=bn, dropout_prob=0.5)
    fc2 = layers.fc(input=drop2,
                    size=512,
                    act=None,
@@ -165,8 +105,8 @@ cost = layers.cross_entropy(input=predict, label=label)
avg_cost = layers.mean(x=cost)
accuracy = layers.accuracy(input=predict, label=label)

# optimizer = SGDOptimizer(learning_rate=0.001)
optimizer = AdamOptimizer(learning_rate=0.001)
opts = optimizer.minimize(avg_cost)

BATCH_SIZE = 128
......
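As a quick, illustrative check of the ResNet arithmetic used in `resnet_cifar10` above: `(depth - 2)` must be divisible by 6 because each of the three stages stacks `n` basic blocks of two conv layers.

depth = 32
assert (depth - 2) % 6 == 0
n = (depth - 2) // 6          # 5 basic blocks per stage
print(n, 6 * n + 2)           # 5 32 -- i.e. 32 conv layers in total, matching depth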
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid.evaluator as evaluator
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.optimizer import AdamOptimizer

images = layers.data(name='pixel', shape=[1, 28, 28], data_type='float32')
label = layers.data(name='label', shape=[1], data_type='int64')
conv_pool_1 = nets.simple_img_conv_pool(
    input=images,
    filter_size=5,
@@ -32,17 +25,13 @@ conv_pool_2 = nets.simple_img_conv_pool(
    pool_stride=2,
    act="relu")

predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
cost = layers.cross_entropy(input=predict, label=label)
avg_cost = layers.mean(x=cost)
optimizer = AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
opts = optimizer.minimize(avg_cost)

accuracy, acc_out = evaluator.accuracy(input=predict, label=label)

BATCH_SIZE = 50
PASS_NUM = 3
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.initializer import UniformInitializer
from paddle.v2.fluid.optimizer import MomentumOptimizer
from paddle.v2.fluid.regularizer import L2DecayRegularizer

BATCH_SIZE = 128
image = layers.data(name='x', shape=[784], data_type='float32')

param_attr = {
    'name': None,
@@ -22,32 +18,21 @@ param_attr = {
    'regularization': L2DecayRegularizer(0.0005 * BATCH_SIZE)
}

hidden1 = layers.fc(input=image, size=128, act='relu', param_attr=param_attr)
hidden2 = layers.fc(input=hidden1, size=64, act='relu', param_attr=param_attr)

predict = layers.fc(input=hidden2,
                    size=10,
                    act='softmax',
                    param_attr=param_attr)

label = layers.data(name='y', shape=[1], data_type='int64')

cost = layers.cross_entropy(input=predict, label=label)
avg_cost = layers.mean(x=cost)
accuracy = layers.accuracy(input=predict, label=label)

optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
opts = optimizer.minimize(avg_cost)

train_reader = paddle.batch(
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.optimizer import SGDOptimizer

IS_SPARSE = True
USE_GPU = False
@@ -19,10 +18,7 @@ def get_usr_combined_features():
    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1

    uid = layers.data(name='user_id', shape=[1], data_type='int64')

    usr_emb = layers.embedding(
        input=uid,
@@ -31,15 +27,11 @@ def get_usr_combined_features():
        param_attr={'name': 'user_table'},
        is_sparse=IS_SPARSE)

    usr_fc = layers.fc(input=usr_emb, size=32)

    USR_GENDER_DICT_SIZE = 2

    usr_gender_id = layers.data(name='gender_id', shape=[1], data_type='int64')

    usr_gender_emb = layers.embedding(
        input=usr_gender_id,
@@ -47,14 +39,10 @@ def get_usr_combined_features():
        param_attr={'name': 'gender_table'},
        is_sparse=IS_SPARSE)

    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)

    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
    usr_age_id = layers.data(name='age_id', shape=[1], data_type="int64")

    usr_age_emb = layers.embedding(
        input=usr_age_id,
@@ -62,14 +50,10 @@ def get_usr_combined_features():
        is_sparse=IS_SPARSE,
        param_attr={'name': 'age_table'})

    usr_age_fc = layers.fc(input=usr_age_emb, size=16)

    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
    usr_job_id = layers.data(name='job_id', shape=[1], data_type="int64")

    usr_job_emb = layers.embedding(
        input=usr_job_id,
@@ -77,16 +61,12 @@ def get_usr_combined_features():
        param_attr={'name': 'job_table'},
        is_sparse=IS_SPARSE)

    usr_job_fc = layers.fc(input=usr_job_emb, size=16)

    concat_embed = layers.concat(
        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)

    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")

    return usr_combined_features
@@ -95,10 +75,7 @@ def get_mov_combined_features():
    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1

    mov_id = layers.data(name='movie_id', shape=[1], data_type='int64')

    mov_emb = layers.embedding(
        input=mov_id,
@@ -107,36 +84,24 @@ def get_mov_combined_features():
        param_attr={'name': 'movie_table'},
        is_sparse=IS_SPARSE)

    mov_fc = layers.fc(input=mov_emb, size=32)

    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())

    category_id = layers.data(name='category_id', shape=[1], data_type='int64')

    mov_categories_emb = layers.embedding(
        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)

    mov_categories_hidden = layers.sequence_pool(
        input=mov_categories_emb, pool_type="sum")

    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())

    mov_title_id = layers.data(name='movie_title', shape=[1], data_type='int64')

    mov_title_emb = layers.embedding(
        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)

    mov_title_conv = nets.sequence_conv_pool(
        input=mov_title_emb,
@@ -146,13 +111,10 @@ def get_mov_combined_features():
        pool_type="sum")

    concat_embed = layers.concat(
        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)

    # FIXME(dzh) : need tanh operator
    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")

    return mov_combined_features
@@ -162,18 +124,11 @@ def model():
    mov_combined_features = get_mov_combined_features()

    # need cos sim
    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)

    label = layers.data(name='score', shape=[1], data_type='float32')

    square_cost = layers.square_error_cost(input=inference, label=label)

    avg_cost = layers.mean(x=square_cost)
@@ -182,7 +137,7 @@ def model():
def main():
    cost = model()
    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
    opts = sgd_optimizer.minimize(cost)

    if USE_GPU:
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.optimizer import AdamOptimizer


def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32):
@@ -31,7 +30,7 @@ def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32):
                           act="softmax")
    cost = layers.cross_entropy(input=prediction, label=label)
    avg_cost = layers.mean(x=cost)
    adam_optimizer = AdamOptimizer(learning_rate=0.002)
    opts = adam_optimizer.minimize(avg_cost)
    acc = layers.accuracy(input=prediction, label=label)
    return avg_cost, acc
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.optimizer import AdamOptimizer


def stacked_lstm_net(input_dim,
@@ -41,7 +39,7 @@ def stacked_lstm_net(input_dim,
                           act='softmax')
    cost = layers.cross_entropy(input=prediction, label=label)
    avg_cost = layers.mean(x=cost)
    adam_optimizer = AdamOptimizer(learning_rate=0.002)
    opts = adam_optimizer.minimize(avg_cost)
    acc = layers.accuracy(input=prediction, label=label)
    return avg_cost, acc
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.optimizer import AdamOptimizer


def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
@@ -33,7 +32,7 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
    cost = layers.cross_entropy(input=prediction, label=label)
    avg_cost = layers.mean(x=cost)
    adam_optimizer = AdamOptimizer(learning_rate=0.002)
    opts = adam_optimizer.minimize(avg_cost)
    acc = layers.accuracy(input=prediction, label=label)
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.optimizer import SGDOptimizer

PASS_NUM = 100
EMBED_SIZE = 32
@@ -17,26 +16,11 @@ IS_SPARSE = True
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)

first_word = layers.data(name='firstw', shape=[1], data_type='int64')
second_word = layers.data(name='secondw', shape=[1], data_type='int64')
third_word = layers.data(name='thirdw', shape=[1], data_type='int64')
forth_word = layers.data(name='forthw', shape=[1], data_type='int64')
next_word = layers.data(name='nextw', shape=[1], data_type='int64')

embed_first = layers.embedding(
    input=first_word,
@@ -64,19 +48,12 @@ embed_forth = layers.embedding(
    param_attr={'name': 'shared_w'})

concat_embed = layers.concat(
    input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
hidden1 = layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
predict_word = layers.fc(input=hidden1, size=dict_size, act='softmax')
cost = layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = layers.mean(x=cost)
sgd_optimizer = SGDOptimizer(learning_rate=0.001)
opts = sgd_optimizer.minimize(avg_cost)

train_reader = paddle.batch(
......
@@ -110,13 +110,30 @@ class TestConv2dOp(OpTest):
        self.op_type = "conv2d"


class TestWithPad(TestConv2dOp):
    def init_test_case(self):
        self.pad = [1, 1]
        self.stride = [1, 1]
        self.input_size = [2, 3, 5, 5]  # NCHW
        assert np.mod(self.input_size[1], self.groups) == 0
        f_c = self.input_size[1] / self.groups
        self.filter_size = [6, f_c, 3, 3]


class TestWithStride(TestConv2dOp):
    def init_test_case(self):
        self.pad = [1, 1]
        self.stride = [2, 2]
        self.input_size = [2, 3, 6, 6]  # NCHW
        assert np.mod(self.input_size[1], self.groups) == 0
        f_c = self.input_size[1] / self.groups
        self.filter_size = [6, f_c, 3, 3]


class TestWithGroup(TestConv2dOp):
    def init_group(self):
        self.groups = 3


class TestWith1x1(TestConv2dOp):
    def init_test_case(self):
@@ -127,15 +144,9 @@ class TestWith1x1(TestConv2dOp):
        f_c = self.input_size[1] / self.groups
        self.filter_size = [6, f_c, 1, 1]

    def init_group(self):
        self.groups = 3


class TestWithDilation(TestConv2dOp):
    def init_test_case(self):
@@ -152,14 +163,19 @@ class TestWithDilation(TestConv2dOp):
    def init_group(self):
        self.groups = 3


#----------------Conv2dCudnn----------------
class TestCudnn(TestConv2dOp):
    def init_op_type(self):
        self.op_type = "conv_cudnn"


class TestCudnnWithPad(TestWithPad):
    def init_op_type(self):
        self.op_type = "conv_cudnn"


class TestCudnnWithStride(TestWithStride):
    def init_op_type(self):
        self.op_type = "conv_cudnn"
......
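The two new cases above exercise padding and stride explicitly. Assuming the usual convolution size formula, the expected output sizes are easy to verify (this check is illustrative, not part of the test file):

def conv_out_size(in_size, kernel, pad, stride):
    # out = (in + 2 * pad - kernel) // stride + 1
    return (in_size + 2 * pad - kernel) // stride + 1

print(conv_out_size(5, 3, 1, 1))  # TestWithPad:    5x5 input -> 5x5 output
print(conv_out_size(6, 3, 1, 2))  # TestWithStride: 6x6 input -> 3x3 output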
@@ -4,9 +4,7 @@ from op_test import OpTest
def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
    in_n, in_c, in_h, in_w = input_.shape
    f_c, out_c, f_h, f_w = filter_.shape
    assert in_c == f_c
@@ -29,6 +27,7 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
            j1, j2 = j * stride[0], j * stride[0] + f_w
            out[n, k, i1:i2, j1:j2] += tmp_out

    out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
    return out
@@ -36,8 +35,6 @@ class TestConv2dTransposeOp(OpTest):
    def setUp(self):
        # init as conv transpose
        self.init_op_type()
        self.init_test_case()

        conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad}
@@ -55,7 +52,6 @@ class TestConv2dTransposeOp(OpTest):
        self.outputs = {'Output': output}

    def test_check_output(self):
        self.check_output()

    def test_check_grad_no_input(self):
@@ -88,6 +84,26 @@ class TestConv2dTransposeOp(OpTest):
        self.op_type = "conv2d_transpose"


class TestWithPad(TestConv2dTransposeOp):
    def init_test_case(self):
        self.pad = [1, 1]
        self.stride = [1, 1]
        self.dilations = [1, 1]
        self.input_size = [2, 3, 5, 5]  # NCHW
        f_c = self.input_size[1]
        self.filter_size = [f_c, 6, 3, 3]


class TestWithStride(TestConv2dTransposeOp):
    def init_test_case(self):
        self.pad = [1, 1]
        self.stride = [2, 2]
        self.dilations = [1, 1]
        self.input_size = [2, 3, 5, 5]  # NCHW
        f_c = self.input_size[1]
        self.filter_size = [f_c, 6, 3, 3]


# ------------ test_cudnn ------------
class TestCudnn(TestConv2dTransposeOp):
    def init_op_type(self):
......
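The cropping line added to the naive reference matches the standard transposed-convolution size relation; a small illustrative check (not part of the test):

def conv_transpose_out_size(in_size, kernel, pad, stride):
    # (in - 1) * stride + kernel gives the un-cropped result; the slice
    # out[:, :, pad:out_h - pad, pad:out_w - pad] then removes 2 * pad.
    return (in_size - 1) * stride + kernel - 2 * pad

print(conv_transpose_out_size(5, 3, 1, 1))  # TestWithPad:    5 -> 5
print(conv_transpose_out_size(5, 3, 1, 2))  # TestWithStride: 5 -> 9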
@@ -4,9 +4,7 @@ from op_test import OpTest
def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
    in_n, in_c, in_d, in_h, in_w = input_.shape
    f_c, out_c, f_d, f_h, f_w = filter_.shape
    assert in_c == f_c
@@ -14,7 +12,6 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
    out_d = (in_d - 1) * stride[0] + f_d
    out_h = (in_h - 1) * stride[1] + f_h
    out_w = (in_w - 1) * stride[2] + f_w
    out = np.zeros((in_n, out_c, out_d, out_h, out_w))

    for n in range(in_n):
@@ -33,6 +30,8 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
                    j1, j2 = j * stride[2], j * stride[2] + f_w
                    out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out

    out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
              pad[2]]
    return out
@@ -40,8 +39,6 @@ class TestConv3dTransposeOp(OpTest):
    def setUp(self):
        # init as conv transpose
        self.init_op_type()
        self.init_test_case()

        conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad}
@@ -49,7 +46,6 @@ class TestConv3dTransposeOp(OpTest):
        filter_ = np.random.random(self.filter_size).astype("float32")
        output = conv3dtranspose_forward_naive(
            input_, filter_, conv3dtranspose_param).astype("float32")

        self.inputs = {'Input': input_, 'Filter': filter_}
        self.attrs = {
@@ -60,7 +56,6 @@ class TestConv3dTransposeOp(OpTest):
        self.outputs = {'Output': output}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
@@ -85,7 +80,7 @@ class TestConv3dTransposeOp(OpTest):
        self.pad = [0, 0, 0]
        self.stride = [1, 1, 1]
        self.dilations = [1, 1, 1]
        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
        f_c = self.input_size[1]
        self.filter_size = [f_c, 6, 3, 3, 3]
@@ -93,5 +88,31 @@ class TestConv3dTransposeOp(OpTest):
        self.op_type = "conv3d_transpose"


class TestWithPad(TestConv3dTransposeOp):
    def init_test_case(self):
        self.pad = [1, 1, 1]
        self.stride = [1, 1, 1]
        self.dilations = [1, 1, 1]
        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
        f_c = self.input_size[1]
        self.filter_size = [f_c, 6, 3, 3, 3]


class TestWithStride(TestConv3dTransposeOp):
    def init_test_case(self):
        self.pad = [1, 1, 1]
        self.stride = [2, 2, 2]
        self.dilations = [1, 1, 1]
        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
        f_c = self.input_size[1]
        self.filter_size = [f_c, 6, 3, 3, 3]


# ------------ test_cudnn ------------
class TestCudnn(TestConv3dTransposeOp):
    def init_op_type(self):
        self.op_type = "conv3d_transpose_cudnn"


if __name__ == '__main__':
    unittest.main()
@@ -6,7 +6,8 @@ from test_lstm_op import identity, sigmoid, tanh, relu
class TestGRUOp(OpTest):
    lod = [[0, 2, 6, 9]]
    batch_size = lod[0][-1]
    frame_size = 5
    activate = {
        'identity': identity,
@@ -35,7 +36,7 @@ class TestGRUOp(OpTest):
                          seq_starts[sorted_seqs[i]] + batch_idx)
                idx_in_seq.append(idx)
            idx_in_seq_list.append(idx_in_seq)
        return idx_in_seq_list, sorted_seqs

    def gru_step(self, x, h_p, w, b):
        batch_size = x.shape[0]
@@ -66,8 +67,8 @@ class TestGRUOp(OpTest):
        batch_hidden = self.outputs['BatchHidden']
        hidden = self.outputs['Hidden']
        idx_in_seq_list = self.idx_in_seq_list
        h_p = self.inputs['H0'][self.sorted_seqs] if self.inputs.has_key(
            'H0') else np.zeros((len(idx_in_seq_list[0]), self.frame_size))
        num_batch = len(idx_in_seq_list)
        end_idx = 0
        for batch_idx in range(num_batch):
@@ -84,8 +85,9 @@ class TestGRUOp(OpTest):
        return batch_gate, batch_reset_hidden_prev, hidden

    def set_data(self):
        lod = self.lod
        self.idx_in_seq_list, self.sorted_seqs = self.seq_to_batch(
            lod, self.is_reverse)
        batch_size = self.batch_size
        frame_size = self.frame_size
        input = np.random.rand(batch_size, frame_size * 3).astype('float64')
@@ -146,7 +148,7 @@ class TestGRUOpReverse(TestGRUOp):
    def set_confs(self):
        self.is_reverse = True
        self.attrs = {
            'activation': 'tanh',
            'gate_activation': 'sigmoid',
            'is_reverse': self.is_reverse
        }
......
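Deriving `batch_size` from the LoD keeps the test consistent with the sequence layout it feeds. A small illustration of the level-0 LoD convention assumed here:

lod = [[0, 2, 6, 9]]
seq_lens = [lod[0][i + 1] - lod[0][i] for i in range(len(lod[0]) - 1)]
print(seq_lens)    # [2, 4, 3] -- three sequences packed into one batch
print(lod[0][-1])  # 9 -- total number of time steps, which is what batch_size now uses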
import unittest
import numpy as np
from paddle.v2.fluid.op import Operator
import paddle.v2.fluid.core as core


def create_tensor(scope, name, np_data):
    tensor = scope.var(name).get_tensor()
    tensor.set_dims(np_data.shape)
    tensor.set(np_data, core.CPUPlace())
    return tensor


class TestIsEmptyOp(unittest.TestCase):
    def setUp(self):
        self.scope = core.Scope()
        # create input variables
        np_data0 = np.array([0, 1, 2])
        create_tensor(self.scope, "X0", np_data0)

        np_data1 = np.array([1])
        t = create_tensor(self.scope, "X1", np_data1)
        t.set_dims([0])

        # create output variables
        self.scope.var("out")

    def test_no_empty(self):
        self.one_case("X0", False)

    def test_empty(self):
        self.one_case("X1", True)

    def one_case(self, input, target):
        op = Operator(type="is_empty", X=input, Out="out")
        ctx = core.DeviceContext.create(core.CPUPlace())
        op.run(self.scope, ctx)
        out = self.scope.var("out").get_tensor()
        self.assertEqual(np.array(out)[0], target)


if __name__ == "__main__":
    unittest.main()
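In plain numpy terms, the semantics this new test encodes are simply that a tensor is empty when one of its dimensions is zero (illustrative comparison only):

import numpy as np

print(np.array([0, 1, 2]).size == 0)  # False -> "X0" is reported as not empty
print(np.empty([0]).size == 0)        # True  -> "X1", whose dims are reset to [0], is empty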
@@ -2,6 +2,7 @@ import unittest
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.core as core
from paddle.v2.fluid.backward import append_backward_ops
import numpy
@@ -16,7 +17,7 @@ class TestWhileOp(unittest.TestCase):
        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True
        init = layers.zeros(shape=[10], dtype='float32')
        mem_array = layers.array_write(x=init, i=i)
        data_array = layers.array_write(x=d0, i=i)

        i = layers.increment(i)
@@ -29,17 +30,23 @@ class TestWhileOp(unittest.TestCase):
        i.stop_gradient = True

        array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
        array_len.stop_gradient = True
        cond = layers.less_than(x=i, y=array_len)

        while_op = layers.While(cond=cond)
        with while_op.block():
            d = layers.array_read(array=data_array, i=i)
            prev = layers.array_read(array=mem_array, i=i)
            result = layers.sums(input=[d, prev])

            i = layers.increment(x=i, in_place=True)
            layers.array_write(result, i=i, array=mem_array)
            layers.less_than(x=i, y=array_len, cond=cond)

        sum_result = layers.array_read(array=mem_array, i=i)
        loss = layers.mean(x=sum_result)

        append_backward_ops(loss)

        cpu = core.CPUPlace()
        exe = Executor(cpu)
......
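With the loss and `append_backward_ops(loss)` now in place, the test can both run the loop and differentiate through it. A hedged sketch of how such a program is typically executed (the random feed data below is illustrative; the real test constructs `d0`, `d1`, `d2` earlier in the file):

d = []
for i in range(3):
    d.append(numpy.random.random(size=[10]).astype('float32'))

outs = exe.run(feed={'d0': d[0], 'd1': d[1], 'd2': d[2]},
               fetch_list=[sum_result])
print(outs[0])  # elementwise sum of the three fed arrays, accumulated by the while loop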