Merge remote-tracking branch 'upstream/develop' into factorization_machine_layer

5ee63bb6 · wangmeng28 · 571ef90c · 2e7ffbd1 · 5ee63bb6 · 5ee63bb6
89 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,8 +36,7 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
+option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -82,10 +81,8 @@ if(ANDROID OR IOS)
        "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
    set(WITH_RDMA OFF CACHE STRING
        "Disable RDMA when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKLDNN OFF CACHE STRING
+    set(WITH_MKL OFF CACHE STRING
-        "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
+        "Disable MKL when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKLML OFF CACHE STRING
-        "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
    # Compile PaddlePaddle mobile inference library
    if (NOT WITH_C_API)
@@ -111,6 +108,14 @@ else()
    set(THIRD_PARTY_BUILD_TYPE Release)
 endif()
+set(WITH_MKLML ${WITH_MKL})
+if (WITH_MKL AND ${AVX2_FOUND})
+    set(WITH_MKLDNN ON)
+else()
+    message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
+    set(WITH_MKLDNN OFF)
+endif()
 ########################################################################################
 include(external/mklml)     # download mklml package
@@ -158,14 +163,15 @@ set(EXTERNAL_LIBS
 )
 if(WITH_GPU)
-    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+  include(cuda)
-    if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
-    endif(NOT WITH_DSO)
 endif(WITH_GPU)
+if(WITH_MKLML)
+    list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
+endif()
 if(WITH_MKLDNN)
-    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
 endif()
 if(USE_NNPACK)

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -76,27 +76,14 @@ else()
    include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
-if(WITH_MKLDNN)
+if (WITH_MKLML AND MKLML_IOMP_LIB)
-    add_definitions(-DPADDLE_USE_MKLDNN)
+    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
-    if (WITH_MKLML AND MKLDNN_IOMP_DIR)
-        message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
    set(OPENMP_FLAGS "-fopenmp")
    set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
    set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
-    else()
+endif()
-        find_package(OpenMP)
-        if(OPENMP_FOUND)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-        else()
-            message(WARNING "Can not find OpenMP."
-                 "Some performance features in MKLDNN may not be available")
-        endif()
-    endif()
-endif(WITH_MKLDNN)
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")

--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
 # Set the architecture for iOS
 if(NOT DEFINED IOS_ARCH)
  if(IOS_PLATFORM STREQUAL "OS")
-    # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
+    set(IOS_ARCH "armv7;armv7s;arm64")
-    set(IOS_ARCH "arm64")
  elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
-    # FIXME(liuyiqun): support "i386;x86_64" future
+    set(IOS_ARCH "i386;x86_64")
-    set(IOS_ARCH "x86_64")
  endif()
 endif()
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string  "Build architecture for iOS")
@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_
 # Hidden visibilty is required for cxx on iOS 
 set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
-set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
+set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
 set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")

--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
+if(NOT WITH_GPU)
+    return()
+endif()
+set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
+set(paddle_known_gpu_archs7 "30 35 50 52")
+set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
+######################################################################################
+# A function for automatic detection of GPUs installed  (if autodetection is enabled)
+# Usage:
+#   detect_installed_gpus(out_variable)
+function(detect_installed_gpus out_variable)
+  if(NOT CUDA_gpu_detect_output)
+    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
+    file(WRITE ${cufile} ""
+      "#include <cstdio>\n"
+      "int main() {\n"
+      "  int count = 0;\n"
+      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+      "  if (count == 0) return -1;\n"
+      "  for (int device = 0; device < count; ++device) {\n"
+      "    cudaDeviceProp prop;\n"
+      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
+      "  }\n"
+      "  return 0;\n"
+      "}\n")
+    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
+                    "--run" "${cufile}"
+                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
+                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if(nvcc_res EQUAL 0)
+      # only keep the last line of nvcc_out
+      STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
+      STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
+      list(GET nvcc_out -1 nvcc_out)
+      string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
+      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
+    endif()
+  endif()
+  if(NOT CUDA_gpu_detect_output)
+    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
+    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+  else()
+    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
+  endif()
+endfunction()
+########################################################################
+# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
+# Usage:
+#   select_nvcc_arch_flags(out_variable)
+function(select_nvcc_arch_flags out_variable)
+  # List of arch names
+  set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual")
+  set(archs_name_default "All")
+  if(NOT CMAKE_CROSSCOMPILING)
+    list(APPEND archs_names "Auto")
+  endif()
+  # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
+  set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
+  set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
+  mark_as_advanced(CUDA_ARCH_NAME)
+  # verify CUDA_ARCH_NAME value
+  if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
+    string(REPLACE ";" ", " archs_names "${archs_names}")
+    message(FATAL_ERROR "Only ${archs_names} architeture names are supported.")
+  endif()
+  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
+    set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
+    set(CUDA_ARCH_PTX "50"                     CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
+    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
+  else()
+    unset(CUDA_ARCH_BIN CACHE)
+    unset(CUDA_ARCH_PTX CACHE)
+  endif()
+  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
+    set(cuda_arch_bin "30 35")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
+    set(cuda_arch_bin "50")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    set(cuda_arch_bin "60 61")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
+    set(cuda_arch_bin "70")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
+    set(cuda_arch_bin ${paddle_known_gpu_archs})
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
+    detect_installed_gpus(cuda_arch_bin)
+  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
+    set(cuda_arch_bin ${CUDA_ARCH_BIN})
+  endif()
+  # remove dots and convert to lists
+  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
+  string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")
+  list(REMOVE_DUPLICATES cuda_arch_bin)
+  list(REMOVE_DUPLICATES cuda_arch_ptx)
+  set(nvcc_flags "")
+  set(nvcc_archs_readable "")
+  # Tell NVCC to add binaries for the specified GPUs
+  foreach(arch ${cuda_arch_bin})
+    if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
+      # User explicitly specified PTX for the concrete BIN
+      list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
+      list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
+    else()
+      # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
+      list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
+      list(APPEND nvcc_archs_readable sm_${arch})
+    endif()
+  endforeach()
+  # Tell NVCC to add PTX intermediate code for the specified architectures
+  foreach(arch ${cuda_arch_ptx})
+    list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
+    list(APPEND nvcc_archs_readable compute_${arch})
+  endforeach()
+  string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
+  set(${out_variable}          ${nvcc_flags}          PARENT_SCOPE)
+  set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
+endfunction()
+message(STATUS "CUDA detected: " ${CUDA_VERSION})
+if (${CUDA_VERSION} LESS 7.0)
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
+elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
+  # warning for now.
+  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
+endif()
+include_directories(${CUDA_INCLUDE_DIRS})
+list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+if(NOT WITH_DSO)
+    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+endif(NOT WITH_DSO)
+# setting nvcc arch flags
+select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
+list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
+message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
+# Set C++11 support
+set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
+# So, don't set these flags here.
+list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
+list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+# Set :expt-relaxed-constexpr to suppress Eigen warnings
+list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
+if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+endif()
+mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
+mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
-    SET(MKLDNN_MKLROOT   ${MKLML_ROOT})
+    MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
-    SET(MKLDNN_IOMP_LIB  ${MKLML_IOMP_LIB})
+ELSE()
-    SET(MKLDNN_IOMP_DIR  ${MKLML_LIB_DIR})
+    MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
-    MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
 ENDIF()
 SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
@@ -57,15 +56,16 @@ ExternalProject_Add(
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
-    CMAKE_ARGS          -DMKLROOT=${MKLDNN_MKLROOT}
+    CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
    CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-                        -DMKLROOT:PATH=${MKLDNN_MKLROOT}
+                        -DMKLROOT:PATH=${MKLML_ROOT}
 )
 ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
 ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
-MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}")
+MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
+add_definitions(-DPADDLE_USE_MKLDNN)
 LIST(APPEND external_project_dependencies mkldnn)
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
            ENDIF()
        ELSEIF(IOS)
-            # FIXME(liuyiqun): support multiple architectures
+            IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
                SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
                SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
-            IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
-                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
-                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
-            ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
+            ELSE()
+                MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
+                       "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
            ENDIF()
        ELSEIF(RPI)
            # use hardfp

--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
 INCLUDE(ExternalProject)
 SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -149,58 +149,3 @@ endforeach()
 foreach(flag ${GPU_COMMON_FLAGS})
    safe_set_nvflag(${flag})
 endforeach()
-set(CUDA_PROPAGATE_HOST_FLAGS OFF)
-# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
-# So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
-LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
-if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
-endif()
-function(specify_cuda_arch cuda_version cuda_arch)
-    if(${cuda_version} VERSION_GREATER "8.0")
-        foreach(capability 61 62)
-          if(${cuda_arch} STREQUAL ${capability})
-            list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-          endif()
-        endforeach()
-    elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53")
-        list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-    endif()
-endfunction()
-# Common gpu architectures: Kepler, Maxwell
-foreach(capability 30 35 50)
-      list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
-endforeach()
-if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
-      list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
-endif()
-# Modern gpu architectures: Pascal
-if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
-      list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
-      list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
-endif()
-# Custom gpu architecture
-set(CUDA_ARCH)
-if(CUDA_ARCH)
-  specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
-endif()
-set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME)
        target_link_libraries(${TARGET_NAME} log)
    endif(ANDROID)
-    if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
+    if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
-      target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+      target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
    endif()
    add_dependencies(${TARGET_NAME} ${external_project_dependencies})

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -335,6 +335,16 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
    :noindex:
+dot_prod
+---------
+.. autoclass:: paddle.v2.layer.dot_prod
+    :noindex:
+out_prod
+--------
+.. autoclass:: paddle.v2.layer.out_prod
+    :noindex:
 power
 -----
 ..  autoclass:: paddle.v2.layer.power

--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA.
 我们把集成方案大致分为了如下几个方面。
 ### CMake
-我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项，当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。
+我们会在`CMakeLists.txt`中会给用户添加一个`WITH_MKL`的开关，他是负责`WITH_MKLML`和`WITH_MKLDNN`的总开关。
-同时，我们会引入`WITH_MKLML`选项，用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用，但是建议在开启MKL-DNN的同时也打开MKLML的开关，这样才能发挥最好的性能。
+当打开`WITH_MKL`时，会开启MKLML的功能，作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上，同时会开启MKL-DNN功能。
-所以，我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件，它们会在编译PaddlePaddle的时候下载对应的软件包，并放到PaddlePaddle的third party目录中。
+当关闭`WITH_MKL`时，MKLML和MKL-DNN功能会同时关闭。
-**备注**：当`WITH_MKLML=ON`的时候，会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库，所以会稍微改动`cmake/cblas.cmake`中的逻辑。
+所以，我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件，它们会在编译PaddlePaddle的时候下载对应的软件包，并放到PaddlePaddle的third party目录中。
 ### Layers
 所有MKL-DNN相关的C++ layers，都会按照PaddlePaddle的目录结构存放在

--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -34,7 +34,7 @@ PaddlePaddle的文档构建有两种方式。
    cd TO_YOUR_PADDLE_CLONE_PATH
    mkdir -p build
    cd build
-    cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+    cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
    make gen_proto_py
    make paddle_docs paddle_docs_cn

--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
-# 构建Android平台上的PaddlePaddle库
+# Android平台编译指南
 用户可通过如下两种方式，交叉编译Android平台上适用的PaddlePaddle库：
 - 基于Docker容器的编译方式

--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
-# 构建iOS平台上的PaddlePaddle库
+# iOS平台编译指南
 交叉编译iOS平台上适用的PaddlePaddle库，需要在MacOS系统上进行。本文的将介绍在MacOS上，从源码交叉编译iOS平台上适用的PaddlePaddle库。
 ## 准备交叉编译环境
@@ -25,7 +25,7 @@ iOS平台可选配置参数：
 - `IOS_PLATFORM`，可设置为`OS/SIMULATOR`，默认值为`OS`。
  - `OS`，构建目标为`arm`架构的iPhone或者iPad等物理设备。
  - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
- `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示：
+- `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示，默认编译所有架构：
    <table class="docutils">
    <colgroup>
@@ -41,11 +41,11 @@ iOS平台可选配置参数：
    <tbody valign="top">
      <tr class="row-even">
      <td>OS</td>
-      <td>armv7, armv7s, arm64 (默认)</td>
+      <td>armv7, armv7s, arm64 </td>
    </tr>
    <tr class="row-odd">
      <td>SIMULATOR</td>
-      <td>i386, x86_64 (默认)</td>
+      <td>i386, x86_64 </td>
    </tr>
    </tbody>
    </table>
@@ -66,7 +66,7 @@ iOS平台可选配置参数：
 ```bash
 cmake -DCMAKE_SYSTEM_NAME=iOS \
      -DIOS_PLATFORM=OS \
-      -DIOS_ARCH="arm64" \
+      -DIOS_ARCH="armv7;arm64" \
      -DIOS_ENABLE_BITCODE=ON \
      -DIOS_USE_VECLIB_FOR_BLAS=ON \
      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
@@ -112,6 +112,6 @@ $ make install
 - `lib`目录，其中包含PaddlePaddle的C-API静态库
 - `third_party`目录，其中包含所依赖的所有第三方库
-注意，不同架构的PaddlePaddle库建议安装到不同的目录下，然后使用`lipo`工具将多个静态库合并成一个支持多个架构的fat库。
+注意，如果PaddlePaddle库需要同时支持真机和模拟器，则需要分别编译真机和模拟器版本，然后使用`lipo`工具合并fat库。
 自此，PaddlePaddle库已经安装完成，用户可将合成的fat库用于深度学习相关的iOS App中，调用方法见C-API文档。
--- a/doc/mobile/cross_compiling_for_raspberry_cn.md
+++ b/doc/mobile/cross_compiling_for_raspberry_cn.md
-# 构建Raspberry Pi平台上的PaddlePaddle库
+# Raspberry Pi平台编译指南
 通常有两个方法来构建基于 Rasspberry Pi 的版本：

--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -25,7 +25,9 @@ limitations under the License. */
 #include "hl_matrix.h"
 #include "hl_sequence.h"
 #include "hl_sparse.h"
+#ifndef PADDLE_MOBILE_INFERENCE
 #include "hl_warpctc_wrap.h"
+#endif
 #ifdef HPPL_STUB_FUNC
 #include "stub/hl_aggregate_stub.h"

--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -73,7 +73,6 @@ if(MOBILE_INFERENCE)
    list(REMOVE_ITEM GSERVER_SOURCES
         dataproviders/DataProvider.cpp
         dataproviders/MultiDataProvider.cpp
-         dataproviders/ProtoDataProvider.cpp
         dataproviders/PyDataProvider2.cpp
         dataproviders/PyDataProvider.cpp)

--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <unistd.h>
 #include <algorithm>
-#include "ProtoDataProvider.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
 #include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
@@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config,
 REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
 REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
-REGISTER_DATA_PROVIDER(proto, ProtoDataProvider);
-REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider);
 int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
  int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)

--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <vector>
-#include "DataFormat.pb.h"
-#include "paddle/utils/Stat.h"
-#include "DataProvider.h"
-#include "ProtoReader.h"
-namespace paddle {
-/**
- * @brief Provider data from protobuf data file with each sample
- * specified by proto message
- *
- * DataSample defined in DataFormat.proto.
- *
- * The file format is
- *
- *    header
- *
- *    sample1
- *
- *    sample2
- *
- *    ...
- *
- *    sampleN
- *
- * @note: In the data file, each message is prefixed with its length.
- * The read/write of the protbuf are implemented in ProtoReader.h
- */
-class ProtoDataProvider : public DataProvider {
-public:
-  ProtoDataProvider(const DataConfig& config,
-                    bool useGpu,
-                    bool loadDataAll = true);
-  virtual void reset();
-  /**
-   * @note this size includes the sequences which are skipped because they
-   * are longer than the batch size.
-   */
-  virtual int64_t getSize() {
-    int64_t size = sampleNums_;
-    if (usageRatio_ < 1.0f) {
-      size = static_cast<int64_t>(size * usageRatio_);
-    }
-    return size;
-  }
-  virtual void shuffle();
-  void loadData(const std::vector<std::string>& fileList);
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-protected:
-  /**
-   * @brief load protobuf data from a list of file
-   * @param[in]  fileName  file name of a file which contains
-   * a list of file names
-   */
-  void loadData(const std::string& fileName);
-  /**
-   * @brief load protobuf data from file
-   * @param[in]  fileName   data file name
-   */
-  void loadDataFile(const std::string& fileName);
-  /** @brief check data header of each data sample
-   *  @param[in] header     data header read from protobuf data
-   */
-  void checkDataHeader(const DataHeader& header);
-  /**
-   * @brief fill protobuf data into slot_,
-   * slot_ is a vector of ProtoSlot in memory.
-   * @param[in]  sample     data sample read from protobuf data
-   */
-  void fillSlots(const DataSample& sample);
-  /**
-   * @brief return true if each sample is one sequence, i.e., independent
-   * of other samples.
-   */
-  inline bool iidData() const { return sequenceStartPositions_.empty(); }
-  /**
-   * @brief check that sample is consistent with header_
-   */
-  void checkSample(const DataSample& sample);
-  template <class Op>
-  int64_t sequenceLoop(Op op, int64_t size);
-  template <class Op>
-  int64_t sampleLoop(Op op, int64_t size);
-  template <class Op>
-  int64_t subSampleLoop(Op op, int64_t size, int slot);
-  void showDataStats();
-protected:
-  struct ProtoVarSlot {
-    std::vector<real> data;
-    std::vector<int> dims;
-  };
-  struct ProtoSlot {
-    SlotDef::SlotType type;
-    int dim;
-    std::vector<int> indexData;
-    std::vector<real> denseData;
-    std::vector<sparse_non_value_t> sparseNonValueData;
-    std::vector<sparse_float_value_t> sparseFloatValueData;
-    std::vector<int64_t> indices;
-    std::vector<int64_t> subIndices;
-    std::vector<ProtoVarSlot> varDenseData;
-    std::vector<std::vector<int>> varIndices;
-    std::vector<std::string> strData;
-  };
-  DataHeader header_;
-  int numVecSlots_;
-  std::vector<ProtoSlot> slots_;
-  size_t sampleNums_;
-  /**
-   * The starting position of each sequence in samples.
-   * The last element should be num of samples.
-   * If empty, each sample is one sequence.
-   */
-  std::vector<size_t> sequenceStartPositions_;
-  int64_t currentSequenceIndex_;
-  // The size should be the number of sequences.
-  std::vector<size_t> shuffledSequenceIds_;
-  ThreadLocalD<DataBatch> cpuBatch_;
-  ThreadLocalD<DataBatch> gpuBatch_;
-  RWLock lock_;
-  std::vector<StatPtr> nnzStats_;  // stats for number of none-zeros entries
-};
-/**
- * @brief Special use for Proto data: instances should contain sparse-non-value
- * slots
- * and label.
- *
- * @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
- */
-class ProtoSequenceDataProvider : public ProtoDataProvider {
-public:
-  ProtoSequenceDataProvider(const DataConfig& config,
-                            bool useGpu,
-                            bool loadDataAll = true);
-  ~ProtoSequenceDataProvider() {}
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-};
-}  // namespace paddle
--- a/paddle/gserver/layers/DotProdLayer.cpp
+++ b/paddle/gserver/layers/DotProdLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+/**
+ * @brief A layer for computing the dot product of two vectors.
+ * Input1: vector (batchSize * dim)
+ * Input2: vector (batchSize * dim)
+ * Output: a matrix: (batchSize * 1)
+ */
+class DotProdLayer : public Layer {
+public:
+  explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
+  ~DotProdLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+REGISTER_LAYER(dot_prod, DotProdLayer);
+bool DotProdLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(inputLayers_.size(), 2U);
+  CHECK_EQ(1UL, getSize())
+      << "The output dimensionality of this layer should be fixed to 1.";
+  return true;
+}
+void DotProdLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  size_t batchSize = inV0->getHeight();
+  CHECK_EQ(inV1->getHeight(), batchSize);
+  CHECK_EQ(inV0->getWidth(), inV1->getWidth());
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, 1);
+  }
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
+    outV->sumOfProducts(*inV0, *inV1, 1, 0);
+  }
+}
+void DotProdLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+  {
+    REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
+    if (inG0) {
+      inG0->addRowScale(0, *inV1, *outG);
+    }
+    if (inG1) {
+      inG1->addRowScale(0, *inV0, *outG);
+    }
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNConcatLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "MKLDNNConcatLayer.h"
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+namespace paddle {
+REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
+bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  CHECK_GT(inputLayers_.size(), 1UL);
+  CHECK(!biasParameter_);
+  return true;
+}
+void MKLDNNConcatLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+  ic = inputLayers_[0]->getSize() / ih / iw;
+  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
+  CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
+  CHECK_GT(inputLayers_.size(), 1UL);
+  channels_.resize(inputLayers_.size());
+  channels_[0] = ic;
+  // need change the output channel, so use oc_ instead
+  // TODO(TJ): change API, use &oc
+  oc_ = ic;
+  for (size_t i = 1; i < inputLayers_.size(); i++) {
+    int batchsize, height, witdh;
+    reshapeInput(batchsize, height, witdh, i);
+    CHECK_EQ(bs, batchsize);
+    CHECK_EQ(ih, height);
+    CHECK_EQ(iw, witdh);
+    channels_[i] = inputLayers_[i]->getSize() / height / witdh;
+    CHECK_EQ((size_t)channels_[i] * height * witdh, inputLayers_[i]->getSize());
+    oc_ += channels_[i];
+  }
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc_ * oh * ow);
+}
+void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
+                                 MKLDNNMatrixPtr& in,
+                                 MKLDNNMatrixPtr& wgt,
+                                 MKLDNNMatrixPtr& bias,
+                                 MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inVals_, out);
+  in = inVals_[0];
+  std::shared_ptr<concat::primitive_desc> fwdPD;
+  resetFwdPD(fwdPD, inVals_, out);
+  resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+}
+void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
+                                 MKLDNNMatrixPtr& in,
+                                 MKLDNNMatrixPtr& wgt,
+                                 MKLDNNMatrixPtr& bias,
+                                 MKLDNNMatrixPtr& out) {
+  resetBwdBuffers(inGrads_, out);
+  in = inGrads_[0];
+  resetBwdPipeline(pipeline, bwds_, inGrads_, out);
+}
+void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                        MKLDNNMatrixPtr& out) {
+  inputs.resize(inputLayers_.size());
+  bool has8c = false, has16c = false, hasnc = false;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    // resetInValue will use ic_ so temporary change as current input's channel
+    // TODO(TJ): change ic_ as vector then can remove channels_
+    ic_ = channels_[i];
+    resetInValue(inputs[i], nullptr, i);
+    CHECK(inputs[i]);
+    auto dm = inputs[i]->getDims();
+    // inputs format can be different, but ndims must equal
+    CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
+    CHECK_EQ(bs_, dm[0]);
+    CHECK_EQ(channels_[i], dm[1]);
+    if (dm.size() > 2) {
+      CHECK_EQ(ih_, dm[2]);
+      CHECK_EQ(iw_, dm[3]);
+    }
+    if (inputs[i]->getFormat() == format::nc) {
+      hasnc = true;
+    }
+    if (inputs[i]->getFormat() == format::nChw8c) {
+      has8c = true;
+    }
+    if (inputs[i]->getFormat() == format::nChw16c) {
+      has16c = true;
+    }
+  }
+  // change back, ic_ always save the input 0 size
+  ic_ = channels_[0];
+  format outFmt;
+  if (has16c && oc_ % 16 == 0) {
+    outFmt = format::nChw16c;
+  } else if (has8c && oc_ % 8 == 0) {
+    outFmt = format::nChw8c;
+  } else if (hasnc) {
+    CHECK(oh_ == 1 && ow_ == 1);
+    outFmt = format::nc;
+  } else {
+    outFmt = format::nchw;
+  }
+  memory::dims outDims =
+      hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_};
+  auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
+  resetOutValue(out, outPD);
+}
+void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr<concat::primitive_desc>& pd,
+                                   std::vector<MKLDNNMatrixPtr>& inputs,
+                                   MKLDNNMatrixPtr out) {
+  std::vector<memory::primitive_desc> srcPDs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
+  }
+  CHECK(out);
+  pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
+  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+}
+void MKLDNNConcatLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<concat::primitive_desc>& pd,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& out) {
+  std::vector<primitive::at> srcs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcs.push_back(*(inputs[i]));
+  }
+  fwd_.reset(new concat(*pd, srcs, *out));
+  pipeline.push_back(*fwd_);
+}
+void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                        MKLDNNMatrixPtr& out) {
+  CHECK(outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  CHECK(out);
+  inputs.resize(inputLayers_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    CHECK(inVals_[i]);
+    // resetInGrad will use inVal_
+    // TODO(TJ): change move inVals_ to MKLDNNLayer ans remove inVal_
+    inVal_ = inVals_[i];
+    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
+    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
+  }
+  // change back, inVal_ always save the input 0
+  inVal_ = inVals_[0];
+}
+void MKLDNNConcatLayer::resetBwdPipeline(
+    std::vector<mkldnn::primitive>& pipeline,
+    std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& out) {
+  // reset the backward primitives
+  memory::dims offsets = {0, 0, 0, 0};
+  prims.resize(inputs.size());
+  CHECK_EQ(inputs.size(), channels_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    auto viewPD = view::primitive_desc(
+        out->getPrimitiveDesc(), inputs[i]->getDims(), offsets);
+    auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(),
+                                         inputs[i]->getPrimitiveDesc());
+    prims[i].reset(new reorder(bwdPD, *out, *(inputs[i])));
+    offsets[axis_] += channels_[i];
+    // push to pipeline
+    pipeline.push_back(*prims[i]);
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNConcatLayer.h
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+namespace paddle {
+/**
+ * @brief A subclass of MKLDNNLayer Concatenate layer.
+ *
+ * The config file api is mkldnn_concat
+ */
+class MKLDNNConcatLayer : public MKLDNNLayer {
+protected:
+  std::vector<MKLDNNMatrixPtr> inVals_;
+  std::vector<MKLDNNMatrixPtr> inGrads_;
+  std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
+  // input channel numbers
+  std::vector<int> channels_;
+  // concat_dimension in MKLDNN
+  // if axis_ == 0, concat batchsize
+  // if axis_ == 1, concat channel (default)
+  int axis_;
+public:
+  explicit MKLDNNConcatLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), axis_(1) {}
+  ~MKLDNNConcatLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+  void printSizeInfo() override {
+    CHECK_EQ(channels_.size(), inputLayers_.size());
+    for (size_t i = 0; i < channels_.size(); ++i) {
+      VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
+                         << ": " << bs_ << ", " << channels_[i] << ", " << ih_
+                         << ", " << iw_;
+    }
+    VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
+                       << ", " << ow_;
+  }
+  void printValueFormat() override {
+    for (size_t i = 0; i < inVals_.size(); ++i) {
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << inVals_[i]->getFormat() << " >>>";
+    }
+    if (outVal_) {
+      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
+    }
+    if (extOutVal_) {
+      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
+    }
+  }
+  void printGradFormat() override {
+    if (extOutGrad_) {
+      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
+    }
+    if (outGrad_) {
+      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
+    }
+    for (size_t i = 0; i < inGrads_.size(); ++i) {
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << inGrads_[i]->getFormat() << "<<<";
+    }
+  }
+protected:
+  /**
+   * Forward functions: reset buffers(inputs, output, bias),
+   *                    reset primitive descriptor,
+   *                    reset pipeline.
+   */
+  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
+                  std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+  /**
+   * Backward functions: reset buffers(inputs, output, bias)
+   *                     reset primitives and pipeline
+   */
+  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -21,8 +21,8 @@ namespace paddle {
 bool MKLDNNLayer::init(const LayerMap& layerMap,
                       const ParameterMap& parameterMap) {
-  CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
+  CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn."
-                          << "Please set WITH_MKLDNN=ON "
+                          << "Please set WITH_MKL=ON "
                          << "and set use_mkldnn=True";
  CHECK(!useGpu_) << "Do not support GPU yet";
@@ -138,8 +138,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
  }
 }
-void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) {
+void MKLDNNLayer::reshapeInput(int& batchsize,
-  const Argument& input = inputLayers_[0]->getOutput();
+                               int& height,
+                               int& width,
+                               size_t inputIdx) {
+  const Argument& input = inputLayers_[inputIdx]->getOutput();
  batchsize = input.getBatchSize();
  int h = input.getFrameHeight();
  int w = input.getFrameWidth();

--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -178,7 +178,10 @@ protected:
  /**
   * reshape the input image sizes and input batchsize
   */
-  void reshapeInput(int& batchsize, int& height, int& width);
+  void reshapeInput(int& batchsize,
+                    int& height,
+                    int& width,
+                    size_t inputIdx = 0);
  /**
   * reshape output image sizes

--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -29,7 +29,7 @@ gserver_test(test_KmaxSeqScore)
 gserver_test(test_Expand)
 gserver_test(test_MaxPoolingWithMaskOutput)
-########## test_Mkldnn layers and activations ##########
+########## test_MKLDNN layers and activations ##########
 if(WITH_MKLDNN)
    add_unittest_without_exec(test_MKLDNN
        test_MKLDNN.cpp
@@ -62,17 +62,6 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
 endif()
 if(NOT MOBILE_INFERENCE)
-################### test_ProtoDataProvider ############
-    add_unittest_without_exec(test_ProtoDataProvider
-        test_ProtoDataProvider.cpp)
-    # test_ProtoDataProvider will mkdir as same name,
-    # so if WORKING_DIRECTORY is default directory, then
-    # mkdir will get error.
-    add_test(NAME test_ProtoDataProvider
-        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 ################## test_Evaluator #######################
    add_unittest(test_Evaluator
        test_Evaluator.cpp)
@@ -110,3 +99,24 @@ add_test(NAME test_PyDataProvider2
   COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
 )
+################# test_CompareSparse ##################
+add_unittest_without_exec(test_CompareSparse
+    test_CompareSparse.cpp)
+if(NOT ON_TRAVIS)
+  add_test(NAME test_CompareSparse
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+          ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
+              ./.set_port.sh -p port -n 6
+                  ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endif()
+################ test_CompareTwoNets ######################
+add_unittest_without_exec(test_CompareTwoNets
+    test_CompareTwoNets.cpp)
+add_test(NAME test_CompareTwoNets
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+        ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
+        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -23,7 +23,7 @@ limitations under the License. */
 namespace paddle {
 /**
- * @brief test the functionality of Mkldnnlayers
+ * @brief test the functionality of MKLDNNlayers and MKLDNNActivations
 * refer to paddle original function
 */
 class MKLDNNTester {

--- a/paddle/gserver/tests/proto_files.txt
+++ b/paddle/gserver/tests/proto_files.txt
-./test_ProtoDataProvider/data1.bin
-./test_ProtoDataProvider/data2.bin
--- a/paddle/gserver/tests/proto_files_compressed.txt
+++ b/paddle/gserver/tests/proto_files_compressed.txt
-./test_ProtoDataProvider/data1.bin.gz
-./test_ProtoDataProvider/data2.bin.gz
--- a/paddle/trainer/tests/sample_trainer_config_opt_a.conf
+++ b/paddle/trainer/tests/sample_trainer_config_opt_a.conf
+#!/usr/bin/env python
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,27 +15,50 @@
 from paddle.trainer_config_helpers import *
-################################### Data Configuration ###################################
+######################## data source ################################
-TrainData(ProtoData(files = "trainer/tests/mnist.list"))
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
-################################### Algorithm Configuration ###################################
+dict_file = dict()
-settings(batch_size = 1000,
+for line_count, line in enumerate(open(dict_path, "r")):
-         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
+    dict_file[line.strip()] = line_count
-################################### Network Configuration ###################################
-data = data_layer(name ="input", size=784)
-fc1 = fc_layer(input=data, size=800,
+define_py_data_sources2(
-               bias_attr=True,
+    train_list='gserver/tests/Sequence/train.list',
-               act=SigmoidActivation())
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
-fc2 = fc_layer(input=fc1, size=800,
+settings(batch_size=5)
-               bias_attr=True,
+######################## network configure ################################
-               act=SigmoidActivation())
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 256
+label_dim = 3
+sparse_update = get_config_arg("sparse_update", bool, False)
-output = fc_layer(input=[fc1, fc2], size=10,
+data = data_layer(name="word", size=dict_dim)
-                  bias_attr=True,
-                  act=SoftmaxActivation())
-lbl = data_layer(name ="label", size=1)
+emb = embedding_layer(
+    input=data,
+    size=word_dim,
+    param_attr=ParamAttr(sparse_update=sparse_update))
-cost = classification_cost(input=output, label=lbl)
+with mixed_layer(size=hidden_dim * 4) as lstm_input:
-outputs(cost)
+    lstm_input += full_matrix_projection(input=emb)
+lstm = lstmemory(
+    input=lstm_input,
+    act=TanhActivation(),
+    gate_act=SigmoidActivation(),
+    state_act=TanhActivation())
+lstm_last = last_seq(input=lstm)
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=lstm_last)
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
--- a/paddle/trainer/tests/sample_trainer_config_opt_b.conf
+++ b/paddle/trainer/tests/sample_trainer_config_opt_b.conf
+#!/usr/bin/env python
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,27 +15,42 @@
 from paddle.trainer_config_helpers import *
-################################### Data Configuration ###################################
+######################## data source ################################
-TrainData(ProtoData(files = "trainer/tests/mnist.list"))
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
-################################### Algorithm Configuration ###################################
+dict_file = dict()
-settings(batch_size = 1000,
+for line_count, line in enumerate(open(dict_path, "r")):
-         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
+    dict_file[line.strip()] = line_count
-################################### Network Configuration ###################################
-data = data_layer(name ="input", size=784)
-fc1 = fc_layer(input=data, size=800,
+define_py_data_sources2(
-               bias_attr=True,
+    train_list='gserver/tests/Sequence/train.list',
-               act=SigmoidActivation())
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
-fc2 = fc_layer(input=fc1, size=800,
+settings(batch_size=5)
-               bias_attr=True,
+######################## network configure ################################
-               act=SigmoidActivation())
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
-output = fc_layer(input=[fc1, fc2], size=10,
+# This config is designed to be equivalent with sequence_recurrent_group.py
-                  bias_attr=True,
-                  act=SoftmaxActivation())
-lbl = data_layer(name ="label", size=1)
+data = data_layer(name="word", size=dict_dim)
-cost = classification_cost(input=output, label=lbl)
+emb = embedding_layer(
-outputs(cost)
+    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
+recurrent_last = last_seq(input=recurrent)
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=recurrent_last)
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
--- a/paddle/gserver/tests/sequence_recurrent_group.py
+++ b/paddle/gserver/tests/sequence_recurrent_group.py
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
+# This config is designed to be equivalent with sequence_recurrent.py
+data = data_layer(name="word", size=dict_dim)
+emb = embedding_layer(
+    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+def step(y):
+    mem = memory(name="rnn_state", size=hidden_dim)
+    with mixed_layer(
+            name="rnn_state",
+            size=hidden_dim,
+            bias_attr=False,
+            act=SoftmaxActivation()) as out:
+        out += identity_projection(input=y)
+        out += full_matrix_projection(
+            input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
+    return out
+recurrent = recurrent_group(name="rnn", step=step, input=emb)
+recurrent_last = last_seq(input=recurrent)
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=recurrent_last)
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
@@ -22,8 +22,7 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
-static const string& configFile1 =
+static const string& configFile1 = "gserver/tests/sequence_lstm.conf";
-    "trainer/tests/sample_trainer_config_compare_sparse.conf";
 DECLARE_bool(use_gpu);
 DECLARE_string(config);

--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/trainer/tests/test_CompareTwoNets.cpp
@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu);
 DECLARE_string(config);
 DECLARE_string(nics);
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
 DEFINE_bool(need_high_accuracy,
            false,
            "whether need to run in double accuracy");
@@ -42,6 +40,10 @@ DEFINE_double(
 DECLARE_bool(thread_local_rand_use_global_seed);
 DECLARE_int32(seed);
+static const string& config_file_a = "gserver/tests/sequence_recurrent.py";
+static const string& config_file_b =
+    "gserver/tests/sequence_recurrent_group.py";
 struct ComData {
  vector<Argument> outArgs;
  vector<ParameterPtr> parameters;
@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) {
  DataBatch dataBatch;
  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
+  trainer.getDataProvider()->reset();
  trainer.getDataProvider()->setSkipShuffle();
  trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
 TEST(Trainer, create) {
  ComData dataA;
-  calcGradient(dataA, FLAGS_config_file_a);
+  calcGradient(dataA, config_file_a);
  LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
  ComData dataB;
-  calcGradient(dataB, FLAGS_config_file_b);
+  calcGradient(dataB, config_file_b);
  LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
  compareGradient(dataA, dataB);

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1081,6 +1081,21 @@ TEST(Layer, InterpolationLayer) {
  }
 }
+TEST(Layer, DotProdLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("dot_prod");
+  config.layerConfig.set_size(1);
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "dot_prod", 10, false, useGpu);
+  }
+}
 TEST(Layer, OuterProdLayer) {
  TestConfig config;
  config.layerConfig.set_type("out_prod");

--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) {
  testAddtoLayer({4, 12, 1, 1}, 3);
 }
+static void getMKLDNNConcatConfig(TestConfig& cfg,
+                                  const std::vector<testImageDesc>& inputs) {
+  CHECK_GE(inputs.size(), 2) << "at least two inputs";
+  int oc = inputs[0].ic;
+  for (size_t i = 1; i < inputs.size(); ++i) {
+    CHECK_EQ(inputs[i].bs, inputs[0].bs);
+    CHECK_EQ(inputs[i].ih, inputs[0].ih);
+    CHECK_EQ(inputs[i].iw, inputs[0].iw);
+    oc += inputs[i].ic;
+  }
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("mkldnn_concat");
+  cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
+  cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    std::stringstream ss;
+    ss << "layer_" << i;
+    cfg.inputDefs.push_back(
+        {INPUT_DATA,
+         ss.str(),
+         (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
+         0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(inputs[i].ic);
+    img_conf->set_img_size_y(inputs[i].ih);
+    img_conf->set_img_size(inputs[i].iw);
+  }
+}
+void testConcatLayer(const std::vector<testImageDesc>& inputs) {
+  TestConfig dnnConfig;
+  getMKLDNNConcatConfig(dnnConfig, inputs);
+  RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
+}
+TEST(MKLDNNLayer, ConcatLayer) {
+  testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
+  testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
+}
 void testActivation(std::string actType, const testImageDesc& pm) {
  // TODO(TJ): remove me when paddle support elu activation
  if (actType == "mkldnn_elu") {

--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -17,9 +17,13 @@ limitations under the License. */
 #include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
+#ifndef PADDLE_MOBILE_INFERENCE
 DEFINE_int32(pool_limit_size,
             536870912,
             "maximum memory size managed by a memory pool, default is 512M");
+#else
+DEFINE_int32(pool_limit_size, 0, "default is 0");
+#endif
 namespace paddle {

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -61,6 +61,18 @@ function(op_library TARGET)
        set(pybind_flag 1)
    endif()
+    if ("${TARGET}" STREQUAL "compare_op")
+        set(pybind_flag 1)
+        file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
+    endif()
+    # conv_op contains several operators
+    if ("${TARGET}" STREQUAL "conv_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
+    endif()
    # pool_op contains several operators
    if ("${TARGET}" STREQUAL "pool_op")
        set(pybind_flag 1)
@@ -68,9 +80,11 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
    endif()
-    if ("${TARGET}" STREQUAL "compare_op")
+    # pool_cudnn_op contains several operators
+    if ("${TARGET}" STREQUAL "pool_cudnn_op")
        set(pybind_flag 1)
-        file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
    endif()
    # pool_with_index_op contains several operators
@@ -80,13 +94,6 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
    endif()
-    # conv_op contains several operators
-    if ("${TARGET}" STREQUAL "conv_op")
-        set(pybind_flag 1)
-        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
-    endif()
    # conv_transpose_op contains several operators
    if ("${TARGET}" STREQUAL "conv_transpose_op")
        set(pybind_flag 1)
@@ -94,11 +101,11 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
    endif()
-    # pool_cudnn_op contains several operators
+    # conv_transpose_cudnn_op contains two operators
-    if ("${TARGET}" STREQUAL "pool_cudnn_op")
+    if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
        set(pybind_flag 1)
        # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
+        file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
    endif()
    # save_restore_op contains several operators

--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -226,9 +226,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
    T alpha = 1.0f, beta = 0.0f;
    if (input_grad) {
      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
+      // Because beta is zero, it is unnecessary to reset input_grad.
-      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
      for (int i = 0; i < groups; i++) {
        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
            handle, &alpha, cudnn_filter_desc,
@@ -241,9 +240,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
    // ------------------- cudnn conv backward filter ---------------------
    if (filter_grad) {
      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*filter_grad);
+      // Because beta is zero, it is unnecessary to reset filter_grad.
-      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
      for (int i = 0; i < groups; i++) {
        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
            handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,

--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -225,11 +225,15 @@ REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
            ops::ConvOpGrad);
 REGISTER_OP_CPU_KERNEL(conv2d,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(conv3d,
-                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -17,11 +17,15 @@
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(conv2d,
-                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
-    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(conv3d,
-                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
-    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/conv2d_transpose_cudnn_op.cc
+++ b/paddle/operators/conv2d_transpose_cudnn_op.cc
@@ -23,7 +23,24 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
                              framework::OpAttrChecker* op_checker)
      : Conv2DTransposeOpMaker(proto, op_checker) {
    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
-        .SetDefault(std::vector<int>{1, 1});
+        .SetDefault({1, 1});
+    AddAttr<int>("workspace_size_MB",
+                 "workspace size for cudnn, in MB, "
+                 "workspace is a section of GPU memory which will be "
+                 "allocated/freed each time the operator runs, larger "
+                 "workspace size can increase performance but also requires "
+                 "better hardward. This size should be carefully setted.")
+        .SetDefault(4096);
+  }
+};
+class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker {
+ public:
+  CudnnConv3DTransposeOpMaker(framework::OpProto* proto,
+                              framework::OpAttrChecker* op_checker)
+      : Conv3DTransposeOpMaker(proto, op_checker) {
+    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
+        .SetDefault({1, 1, 1});
    AddAttr<int>("workspace_size_MB",
                 "workspace size for cudnn, in MB, "
                 "workspace is a section of GPU memory which will be "
@@ -48,3 +65,14 @@ REGISTER_OP_CPU_KERNEL(
 REGISTER_OP_CPU_KERNEL(
    conv2d_transpose_cudnn_grad,
    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp,
+            ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad,
+            ops::ConvTransposeOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose_cudnn,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose_cudnn_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/conv2d_transpose_cudnn_op.cu.cc
+++ b/paddle/operators/conv2d_transpose_cudnn_op.cu.cc
@@ -54,15 +54,21 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
    ScopedTensorDescriptor output_desc;
    ScopedFilterDescriptor filter_desc;
    ScopedConvolutionDescriptor conv_desc;
-    DataLayout layout = DataLayout::kNCHW;
+    DataLayout layout;
+    if (strides.size() == 2U) {
+      layout = DataLayout::kNCHW;
+    } else {
+      layout = DataLayout::kNCDHW;
+    }
-    // N, M, H, W
+    // (N, M, H, W) or (N, M, D, H, W)
    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
        layout, framework::vectorize2int(input->dims()));
-    // N, C, O_h, O_w
+    // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
        layout, framework::vectorize2int(output->dims()));
-    // M, C, K_h, K_w
+    // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
        layout, framework::vectorize2int(filter->dims()));
    cudnnConvolutionDescriptor_t cudnn_conv_desc =
@@ -136,13 +142,13 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
    ScopedConvolutionDescriptor conv_desc;
    DataLayout layout = DataLayout::kNCHW;
-    // Input: (N, M, H, W)
+    // Input: (N, M, H, W) or (N, M, D, H, W)
    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
        layout, framework::vectorize2int(input->dims()));
-    // Output: (N, C, O_H, O_W)
+    // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
        layout, framework::vectorize2int(output_grad->dims()));
-    // Filter (M, C, K_H, K_W)
+    // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w)
    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
        layout, framework::vectorize2int(filter->dims()));
@@ -200,8 +206,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
    T alpha = 1.0f, beta = 0.0f;
    if (input_grad) {
      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      math::set_constant(ctx.device_context(), input_grad, 0);
+      // Because beta is zero, it is unnecessary to reset input_grad.
      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
          handle, &alpha, cudnn_output_desc, output_grad_data,
          cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
@@ -212,8 +217,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
    // ------------------- cudnn conv backward filter ---------------------
    if (filter_grad) {
      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      math::set_constant(ctx.device_context(), filter_grad, 0);
+      // Because beta is zero, it is unnecessary to reset filter_grad.
      // Gradient with respect to the filter
      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
          handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
@@ -234,3 +238,8 @@ REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn,
                       ops::CudnnConvTransposeOpKernel<float>);
 REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad,
                       ops::CudnnConvTransposeGradOpKernel<float>);
+REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn,
+                       ops::CudnnConvTransposeOpKernel<float>);
+REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad,
+                       ops::CudnnConvTransposeGradOpKernel<float>);
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
@@ -30,11 +30,6 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-  for (size_t i = 0; i < paddings.size(); ++i) {
-    PADDLE_ENFORCE_EQ(paddings[i], 0,
-                      "No Padding allowed in conv transpose op.");
-  }
  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
                 "ConvTransposeOp intput should be 4-D or 5-D tensor.");
  PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
@@ -52,7 +47,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] +
+    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
                           filter_dims[i + 2]);
  }
  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
@@ -190,17 +185,21 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
 REGISTER_OP_CPU_KERNEL(
    conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
    conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
            conv3d_transpose_grad, ops::ConvTransposeOpGrad);
 REGISTER_OP_CPU_KERNEL(
    conv3d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
    conv3d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/conv_transpose_op.cu.cc
+++ b/paddle/operators/conv_transpose_op.cu.cc
@@ -18,14 +18,18 @@ namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
    conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
    conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
    conv3d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
    conv3d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
@@ -62,7 +62,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
    Tensor* output = context.Output<Tensor>("Output");
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    // Actually, no paddings and groups allowed in conv transpose.
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    // TODO(Zhuoyuan): Paddings can be added in future.
    // groups will alway be disabled in conv2dtranspose.
@@ -148,8 +147,8 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
      } else if (filter_shape_vec.size() == 3) {
        // col2vol: col_matrix -> dy
        // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
-        col2vol(context.device_context(), col, dilations, strides,
+        col2vol(context.device_context(), col, dilations, strides, paddings,
-                std::vector<int>{0, 0, 0}, &output_batch);
+                &output_batch);
      }
    }
  }
@@ -173,7 +172,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
    if ((!input_grad) && (!filter_grad)) return;
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    // Actually, no paddings and groups allowed in conv transpose.
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    const int batch_size = static_cast<int>(input->dims()[0]);

--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -24,8 +24,17 @@
 namespace paddle {
 namespace operators {
-using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+template <typename Place, typename T>
+inline void ReorderInitState(const platform::DeviceContext& ctx,
+                             const framework::Tensor& src, const size_t* index,
+                             framework::Tensor* dst, bool indexed_src) {
+  math::CopyMatrixRowsFunctor<Place, T> row_shuffle;
+  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+  row_shuffle(ctx, src, index, *dst, indexed_src);
+}
 template <typename Place, typename T>
 class GRUKernel : public framework::OpKernel<T> {
@@ -33,7 +42,6 @@ class GRUKernel : public framework::OpKernel<T> {
  void BatchCompute(const framework::ExecutionContext& context) const {
    auto* input = context.Input<LoDTensor>("Input");
    auto* h0 = context.Input<Tensor>("H0");
-    const T* h0_data = h0 ? h0->data<T>() : nullptr;
    auto* weight = context.Input<Tensor>("Weight");
    const T* weight_data = weight->data<T>();
    auto* bias = context.Input<Tensor>("Bias");
@@ -66,7 +74,18 @@ class GRUKernel : public framework::OpKernel<T> {
    gru_value.gateWeight = const_cast<T*>(weight_data);
    gru_value.stateWeight =
        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
-    gru_value.prevOutValue = const_cast<T*>(h0_data);
+    Tensor ordered_h0;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (h0) {
+      // Since the batch computing for GRU reorders the input sequences
+      // according to their length. The initialized cell state also needs
+      // to reorder.
+      ReorderInitState<Place, T>(context.device_context(), *h0, order,
+                                 &ordered_h0, true);
+      gru_value.prevOutValue = ordered_h0.data<T>();
+    } else {
+      gru_value.prevOutValue = nullptr;
+    }
    auto batch_starts = batch_gate->lod()[0];
    size_t num_batch = batch_starts.size() - 1;
    for (size_t n = 0; n < num_batch; n++) {
@@ -102,7 +121,6 @@ class GRUGradKernel : public framework::OpKernel<T> {
 public:
  void BatchCompute(const framework::ExecutionContext& context) const {
    auto* h0 = context.Input<Tensor>("H0");
-    const T* h0_data = h0 ? h0->data<T>() : nullptr;
    auto* weight = context.Input<Tensor>("Weight");
    const T* weight_data = weight->data<T>();
    auto* batch_gate = context.Input<LoDTensor>("BatchGate");
@@ -135,6 +153,17 @@ class GRUGradKernel : public framework::OpKernel<T> {
    zero(dev_ctx, &batch_gate_grad, static_cast<T>(0.0));
    zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
+    Tensor ordered_h0, ordered_h0_grad;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (h0) {
+      ReorderInitState<Place, T>(context.device_context(), *h0, order,
+                                 &ordered_h0, true);
+    }
+    if (h0_grad) {
+      ordered_h0_grad.mutable_data<T>(h0_grad->dims(), context.GetPlace());
+      zero(context.device_context(), &ordered_h0_grad, static_cast<T>(0.0));
+    }
    bool is_reverse = context.Attr<bool>("is_reverse");
    batch_hidden_grad.set_lod(batch_hidden->lod());
    to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
@@ -176,14 +205,9 @@ class GRUGradKernel : public framework::OpKernel<T> {
          batch_reset_hidden_prev_grad.Slice(bstart, bend);
      gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
      if (n == 0) {
-        gru_value.prevOutValue = const_cast<T*>(h0_data);
+        gru_value.prevOutValue = h0 ? ordered_h0.data<T>() : nullptr;
-        if (h0_grad) {
+        gru_grad.prevOutGrad =
-          T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace());
+            h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr;
-          zero(dev_ctx, h0_grad, static_cast<T>(0.0));
-          gru_grad.prevOutGrad = h0_grad_data;
-        } else {
-          gru_grad.prevOutGrad = nullptr;
-        }
      } else {
        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
        Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
@@ -208,6 +232,10 @@ class GRUGradKernel : public framework::OpKernel<T> {
      math::ColwiseSum<Place, T> col_sum;
      col_sum(dev_ctx, batch_gate_grad, bias_grad);
    }
+    if (h0 && h0_grad) {
+      ReorderInitState<Place, T>(context.device_context(), ordered_h0_grad,
+                                 order, h0_grad, false);
+    }
  }
  void Compute(const framework::ExecutionContext& context) const override {

--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
 add_subdirectory(detail)
 if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context)
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto)
    nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
    nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function)
    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
@@ -15,7 +15,7 @@ if(WITH_GPU)
    nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
 else()
-    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context)
+    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
    cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
    cc_library(softmax SRCS softmax.cc DEPS device_context)
    cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)

--- a/paddle/operators/math/im2col.cu
+++ b/paddle/operators/math/im2col.cu
@@ -119,8 +119,8 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width,
  if (index < n) {
    T val = 0;
-    int w = index % im_width;
+    int w = index % im_width + padding_width;
-    int h = (index / im_width) % im_height;
+    int h = (index / im_width) % im_height + padding_height;
    int c = index / (im_width * im_height);
    // compute the start and end of the output

--- a/paddle/operators/pool_cudnn_op.cu.cc
+++ b/paddle/operators/pool_cudnn_op.cu.cc
@@ -135,8 +135,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
    if (input_grad) {
      T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      math::SetConstant<paddle::platform::GPUPlace, T> set_zero;
+      // Because beta is zero, it is unnecessary to reset input_grad.
-      set_zero(ctx.device_context(), input_grad, static_cast<T>(0));
      PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward(
          handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,

--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
@@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate,
                  const real* grad,
                  real* momentumVec) {
  decayRate *= learningRate;
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_USE_MKLML
 #pragma omp parallel for
 #endif
  for (size_t i = 0; i < size; ++i) {

--- a/paddle/platform/cudnn_helper.h
+++ b/paddle/platform/cudnn_helper.h
@@ -63,9 +63,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
    }                                                             \
  } while (false)
-enum class DataLayout {
+enum class DataLayout {  // Not use
  kNHWC,
  kNCHW,
+  kNCDHW,
  kNCHW_VECT_C,
 };
@@ -107,12 +108,15 @@ class CudnnDataType<double> {
  }
 };
-inline cudnnTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) {
+inline cudnnTensorFormat_t GetCudnnTensorFormat(
+    const DataLayout& order) {  // Not use
  switch (order) {
    case DataLayout::kNHWC:
      return CUDNN_TENSOR_NHWC;
    case DataLayout::kNCHW:
      return CUDNN_TENSOR_NCHW;
+    case DataLayout::kNCDHW:
+      return CUDNN_TENSOR_NCHW;  // TODO(chengduoZH) : add CUDNN_TENSOR_NCDHW
    default:
      PADDLE_THROW("Unknown cudnn equivalent for order");
  }
@@ -139,7 +143,7 @@ class ScopedTensorDescriptor {
      strides[i] = dims[i + 1] * strides[i + 1];
    }
    // Update tensor descriptor dims setting if groups > 1
-    // FIXME(typhoonzero): Assume using NCHW order
+    // FIXME(typhoonzero): Assume using NCHW or NCDHW order
    std::vector<int> dims_with_group(dims.begin(), dims.end());  // copy
    if (groups > 1) {
      dims_with_group[1] = dims_with_group[1] / groups;
@@ -176,9 +180,10 @@ class ScopedFilterDescriptor {
                                            const cudnnDataType_t type,
                                            const std::vector<int>& kernel,
                                            const int groups = 1) {
-    // filter layout: MCHW, where M is the number of
+    // filter layout: MCHW(MCDHW), where M is the number of
    // output image channels, C is the number of input image channels,
-    // H and W is height and width of filter.
+    // D is the depth of the filter, H is the height of the filter, and W is the
+    // width of the filter.
    std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
    if (groups > 1) {
      // M /= groups

--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -57,8 +57,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 | `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
 | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
 | `WITH_TESTING` | ON | Build unit tests binaries. |
-| `WITH_MKLDNN` | ON | Build with [Intel® MKL DNN](https://github.com/01org/mkl-dnn) support. |
+| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
-| `WITH_MKLML` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) support. |
 | `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
 | `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
 | `WITH_C_API` | OFF | Build capi libraries for inference. |

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -34,8 +34,7 @@ function cmake_gen() {
        ${PYTHON_FLAGS}
        -DWITH_DOC=OFF
        -DWITH_GPU=${WITH_GPU:-OFF}
-        -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
+        -DWITH_MKL=${WITH_MKL:-ON}
-        -DWITH_MKLML=${WITH_MKLML:-ON}
        -DWITH_AVX=${WITH_AVX:-OFF}
        -DWITH_GOLANG=${WITH_GOLANG:-ON}
        -DWITH_SWIG_PY=ON
@@ -56,8 +55,7 @@ EOF
        ${PYTHON_FLAGS} \
        -DWITH_DOC=OFF \
        -DWITH_GPU=${WITH_GPU:-OFF} \
-        -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
+        -DWITH_MKL=${WITH_MKL:-ON} \
-        -DWITH_MKLML=${WITH_MKLML:-ON} \
        -DWITH_AVX=${WITH_AVX:-OFF} \
        -DWITH_GOLANG=${WITH_GOLANG:-ON} \
        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \

--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -18,8 +18,8 @@ function version(){
        echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
        echo "    with_avx: @WITH_AVX@"
        echo "    with_gpu: @WITH_GPU@"
+        echo "    with_mkl: @WITH_MKL@"
        echo "    with_mkldnn: @WITH_MKLDNN@"
-        echo "    with_mklml: @WITH_MKLML@"
        echo "    with_double: @WITH_DOUBLE@"
        echo "    with_python: @WITH_PYTHON@"
        echo "    with_rdma: @WITH_RDMA@"
@@ -45,8 +45,8 @@ function ver2num() {
 function cpu_config() {
  # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status
-  # only when MKLDNN or MKLML enabled
+  # only when MKL enabled
-  if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF"]; then
+  if [ "@WITH_MKL@" == "OFF" ]; then
    return 0
  fi
  ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
@@ -70,8 +70,8 @@ function cpu_config() {
 function threads_config() {
  # auto set OMP_NUM_THREADS and MKL_NUM_THREADS
  # according to trainer_count and total processors
-  # only when MKLDNN or MKLML enabled
+  # only when MKL enabled
-  if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF"]; then
+  if [ "@WITH_MKL@" == "OFF" ]; then
    return 0
  fi
  processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`

--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -6,7 +6,7 @@ mkdir -p $TRAVIS_BUILD_DIR/build
 cd $TRAVIS_BUILD_DIR/build
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 make -j `nproc` gen_proto_py
 make -j `nproc` paddle_docs paddle_docs_cn

--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -137,6 +137,10 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
    }
  }
+  if (FLAGS_use_mkldnn) {
+    CHECK_EQ(FLAGS_trainer_count, 1UL) << "MKLDNN only need 1 trainer";
+  }
  if (testing) {
    LOG(INFO) << "trainer: in testing mode";
    if (config_->getOptConfig().use_sparse_remote_updater() ||

--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -28,35 +28,7 @@ if(WITH_PYTHON)
          ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
-################ test_CompareTwoNets ######################
-add_unittest_without_exec(test_CompareTwoNets
-    test_CompareTwoNets.cpp)
-add_test(NAME test_CompareTwoNets
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
-            --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-############### test_CompareTwoOpts ###################
-add_unittest_without_exec(test_CompareTwoOpts
-    test_CompareTwoOpts.cpp)
-add_test(NAME test_CompareTwoOpts
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts
-            --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf
-            --num_passes=1 --need_high_accuracy=0
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-################# test_CompareSparse ##################
-add_unittest_without_exec(test_CompareSparse
-    test_CompareSparse.cpp)
-if(NOT ON_TRAVIS)
-  add_test(NAME test_CompareSparse
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-              ./.set_port.sh -p port -n 6
-                  ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-endif()
 ################# test_recurrent_machine_generation ###############
 add_unittest_without_exec(test_recurrent_machine_generation
    test_recurrent_machine_generation.cpp)

--- a/paddle/trainer/tests/mnist.list
+++ b/paddle/trainer/tests/mnist.list
-trainer/tests/mnist_bin_part
--- a/paddle/trainer/tests/mnist_bin_part
+++ b/paddle/trainer/tests/mnist_bin_part
--- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
+++ b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
--- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist
+++ b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist
-./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
--- a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf
+++ b/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-# Note: when making change to this file, please make sure
-# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-default_initial_std(0.1)
-default_device(0)
-word_dim = 999
-l1 = 0
-l2 = 0
-model_type("nn")
-sparse_update = get_config_arg("sparse_update", bool, False)
-TrainData(ProtoData(        
-            type = "proto_sequence",
-            files = ('trainer/tests/train_sparse.list'), 
-            ))
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-wordvec_dim = 32
-layer2_dim = 16
-layer3_dim = 16
-hidden_dim = 32
-slot_names = ["qb", "qw", "tb", "tw"]
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        Layer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            type = "recurrent",
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            inputs = Input(slot_names[i] + "_embedding_" + network_name,
-                           parameter_name = "rnn1.w0")
-        )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
--- a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
+++ b/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-# Note: when making change to this file, please make sure
-# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-default_initial_std(0.1)
-default_device(0)
-word_dim = 1451594
-l1 = 0
-l2 = 0
-model_type("nn")
-sparse_update = get_config_arg("sparse_update", bool, False)
-TrainData(ProtoData(        
-            type = "proto_sequence",
-            files = ('trainer/tests/train.list'), 
-            ))
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-wordvec_dim = 128
-layer2_dim = 96
-layer3_dim = 96
-hidden_dim = 128
-slot_names = ["qb", "qw", "tb", "tw"]
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        Layer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            type = "recurrent",
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            inputs = Input(slot_names[i] + "_embedding_" + network_name,
-                           parameter_name = "rnn1.w0")
-        )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
--- a/paddle/trainer/tests/sample_trainer_config_rnn.conf
+++ b/paddle/trainer/tests/sample_trainer_config_rnn.conf
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-# Note: when making change to this file, please make sure
-# sample_trainer_config_qb_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-default_initial_std(0.1)
-default_device(0)
-word_dim = 1451594
-l1 = 0
-l2 = 0
-model_type("recurrent_nn")
-sparse_update = get_config_arg("sparse_update", bool, False)
-TrainData(ProtoData(
-            type = "proto_sequence",
-            files = ('trainer/tests/train.list'), 
-            ))
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-wordvec_dim = 128
-layer2_dim = 96
-layer3_dim = 96
-hidden_dim = 128
-slot_names = ["qb", "qw", "tb", "tw"]
-def SimpleRecurrentLayer(name, 
-                         size, 
-                         active_type, 
-                         bias, 
-                         input_layer_name, 
-                         parameter_name,
-                         seq_reversed = False):
-    RecurrentLayerGroupBegin(name + "_layer_group", 
-                             in_links=[input_layer_name], 
-                             out_links=[name],
-                             seq_reversed=seq_reversed)
-    memory_name = Memory(name=name, size=size)
-    Layer(
-        name = name,
-        type = "mixed",
-        size = size,
-        active_type = active_type,
-        bias = bias,
-        inputs = [IdentityProjection(input_layer_name),
-                  FullMatrixProjection(memory_name,
-                                       parameter_name = parameter_name,
-                                       ),
-                  ]
-        )
-    RecurrentLayerGroupEnd(name + "_layer_group")
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        SimpleRecurrentLayer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            size = hidden_dim,
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            input_layer_name = slot_names[i] + "_embedding_" + network_name,
-            parameter_name = "rnn1.w0",
-            )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
--- a/paddle/trainer/tests/testPyDataWrapper.py
+++ b/paddle/trainer/tests/testPyDataWrapper.py
@@ -20,28 +20,6 @@ import random
 import json
 import string
-@provider(slots=[
-    SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1),
-    IndexSlot(3)
-])
-def processNonSequenceData(obj, filename):
-    with open(filename, "rb") as f:
-        for line in f:
-            slots_str = line.split(';')
-            index = int(slots_str[0])
-            non_values = map(int, slots_str[1].split()[1:])
-            dense = map(float, slots_str[2].split()[1:])
-            strs = slots_str[4].strip().split(' ', 1)[1]
-            def __values_mapper__(s):
-                s = s.split(":")
-                return int(s[0]), float(s[1])
-            values = map(__values_mapper__, slots_str[3].split()[1:])
-            yield [non_values, dense, values, strs, index]
 SPARSE_ID_LIMIT = 1000
 SPARSE_ID_COUNT = 100
 SEQUENCE_LIMIT = 50
@@ -146,8 +124,6 @@ def processSubSeqAndGenerateData(obj, name):
 if __name__ == "__main__":
-    pvd = processNonSequenceData("test.txt")
-    print pvd.getNextBatch(100)
    pvd = processSeqAndGenerateData("_")
    print pvd.getNextBatch(100)
    pvd = processSubSeqAndGenerateData("_")

--- a/paddle/trainer/tests/test_CompareTwoOpts.cpp
+++ b/paddle/trainer/tests/test_CompareTwoOpts.cpp
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <gtest/gtest.h>
-#include <paddle/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-#include "paddle/trainer/Trainer.h"
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-DECLARE_int32(gpu_id);
-DECLARE_bool(local);
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_string(nics);
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
-DEFINE_bool(need_high_accuracy,
-            true,
-            "whether need to run in double accuracy (recommended)");
-DEFINE_double(
-    max_diff_ratio,
-    0.0f,
-    "max diff ratio allowed for outputs and parameters (value/gradient)");
-struct ComData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-void calcGradient(ComData& data, const string configFile) {
-  FLAGS_config = configFile;
-  FLAGS_local = true;
-  FLAGS_use_gpu = false;
-  FLAGS_nics = "";
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig(), false);
-  data.parameters = trainer.getGradientMachine()->getParameters();
-  trainer.getDataProvider()->setSkipShuffle();
-  trainer.train();
-}
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 size_t width = 1) {
-  int nNum = 0;
-  for (size_t i = 0; i < len; ++i) {
-    real diff = fabs(A[i] - B[i]);
-    if (diff > 0.0f &&
-        diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_max_diff_ratio) {
-      nNum++;
-      LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i]
-                << "    " << desB << " : " << B[i];
-    }
-  }
-  EXPECT_EQ(0, nNum);
-  LOG(INFO) << "\n\n";
-}
-void compareGradient(ComData& comDataA, ComData& comDataB) {
-  vector<Argument> outArgsA = comDataA.outArgs;
-  vector<Argument> outArgsB = comDataB.outArgs;
-  for (size_t i = 0; i < outArgsA.size(); ++i) {
-    CpuMatrix matA(outArgsA[i].value->getHeight(),
-                   outArgsA[i].value->getWidth());
-    CpuMatrix matB(outArgsB[i].value->getHeight(),
-                   outArgsB[i].value->getWidth());
-    matA.copyFrom(*outArgsA[i].value);
-    matB.copyFrom(*outArgsB[i].value);
-    LOG(INFO) << "\n--------------------------------"
-              << " Check Network Output_" << i << ":"
-              << " -------------------------------------\n";
-    checkBuffer(matA.getData(),
-                "network A output",
-                matB.getData(),
-                "network B output",
-                matA.getElementCnt(),
-                matA.getWidth());
-  }
-  vector<ParameterPtr>& parametersA = comDataA.parameters;
-  vector<ParameterPtr>& parametersB = comDataB.parameters;
-  LOG(INFO) << "\n\n--------------------------------"
-            << " Check Gradient Machine Parameters:"
-            << " -------------------------------------\n";
-  for (size_t i = 0; i < parametersA.size(); ++i) {
-    ParameterPtr parameterA, parameterB;
-    parameterA = parametersA[i];
-    parameterB = parametersB[i];
-    CpuVector paraA(parameterA->getSize());
-    CpuVector paraB(parameterB->getSize());
-    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
-    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
-    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
-              << " ; size : " << paraA.getSize() << " ------------";
-    checkBuffer(paraA.getData(),
-                "Network A",
-                paraB.getData(),
-                "Network B",
-                paraA.getSize());
-    CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT));
-    CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT));
-    LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName()
-              << " ; size : " << gradA.getSize() << " -----------";
-    checkBuffer(gradA.getData(),
-                "Network A",
-                gradB.getData(),
-                "Network B",
-                gradA.getSize());
-  }
-}
-TEST(Trainer, create) {
-  ComData dataA;
-  calcGradient(dataA, FLAGS_config_file_a);
-  LOG(INFO) << "\n\ntraining of Network A is finished\n\n";
-  ComData dataB;
-  calcGradient(dataB, FLAGS_config_file_b);
-  LOG(INFO) << "\n\ntraining of the Network B is finished\n\n";
-  compareGradient(dataA, dataB);
-}
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-#ifndef PADDLE_TYPE_DOUBLE
-  if (FLAGS_need_high_accuracy) {
-    LOG(INFO) << "skip test due to it's need high accuracy";
-    return 0;
-  }
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 2e-4;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in low accuracy mode";
-  }
-#else
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 2e-7;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in high accuracy mode";
-  }
-#endif
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
--- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
+++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
@@ -25,45 +25,9 @@ limitations under the License. */
 #include <unordered_set>
 #include "picojson.h"
-void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual);
 void checkValue(std::vector<paddle::Argument>& arguments, picojson::array& arr);
 const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/";
-TEST(PyDataProviderWrapper, NoSequenceData) {
-  paddle::DataConfig conf;
-  conf.set_type("py");
-  conf.set_load_data_module(std::string("testPyDataWrapper"));
-  conf.set_load_data_object(std::string("processNonSequenceData"));
-  conf.set_async_load_data(false);
-  conf.clear_files();
-  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
-  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromPy;
-  provider->getNextBatch(100, &batchFromPy);
-  paddle::DataConfig conf2;
-  conf2.set_type("proto");
-  conf2.set_async_load_data(false);
-  conf2.clear_files();
-  conf2.set_files(kDir + "test_pydata_provider_wrapper.protolist");
-  provider.reset(paddle::DataProvider::create(conf2, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromProto;
-  provider->getNextBatch(100, &batchFromProto);
-  std::vector<paddle::Argument>& pyArguments = batchFromPy.getStreams();
-  std::vector<paddle::Argument>& protoArguments = batchFromProto.getStreams();
-  EXPECT_EQ(pyArguments.size(), protoArguments.size());
-  for (size_t i = 0; i < pyArguments.size(); ++i) {
-    checkEqual(protoArguments[i], pyArguments[i]);
-  }
-}
 TEST(PyDataProviderWrapper, SequenceData) {
  paddle::DataConfig conf;
  conf.set_type("py");
@@ -148,66 +112,6 @@ int main(int argc, char** argv) {
  return RUN_ALL_TESTS();
 }
-void checkEqual(const paddle::Argument& expect,
-                const paddle::Argument& actual) {
-  if (expect.value) {
-    EXPECT_TRUE(actual.value != nullptr);
-    paddle::Matrix* e = expect.value.get();
-    paddle::Matrix* a = actual.value.get();
-    EXPECT_EQ(e->getWidth(), a->getWidth());
-    EXPECT_EQ(e->getHeight(), a->getHeight());
-    if (dynamic_cast<paddle::CpuSparseMatrix*>(e)) {
-      paddle::CpuSparseMatrix* se = dynamic_cast<paddle::CpuSparseMatrix*>(e);
-      paddle::CpuSparseMatrix* sa = dynamic_cast<paddle::CpuSparseMatrix*>(a);
-      EXPECT_EQ(se->getFormat(), sa->getFormat());
-      EXPECT_EQ(se->getElementCnt(), sa->getElementCnt());
-      size_t rowSize = se->getFormat() == paddle::SPARSE_CSC
-                           ? se->getElementCnt()
-                           : se->getHeight() + 1;
-      size_t colSize = se->getFormat() == paddle::SPARSE_CSC
-                           ? se->getWidth() + 1
-                           : se->getElementCnt();
-      for (size_t i = 0; i < rowSize; ++i) {
-        EXPECT_EQ(se->getRows()[i], sa->getRows()[i]);
-      }
-      for (size_t i = 0; i < colSize; ++i) {
-        EXPECT_EQ(se->getCols()[i], sa->getCols()[i]);
-      }
-      if (se->getValueType() == paddle::FLOAT_VALUE) {
-        EXPECT_EQ(paddle::FLOAT_VALUE, sa->getValueType());
-        for (size_t i = 0; i < se->getElementCnt(); ++i) {
-          EXPECT_EQ(se->getValue()[i], sa->getValue()[i]);
-        }
-      }
-    } else if (dynamic_cast<paddle::CpuMatrix*>(e)) {
-      EXPECT_EQ(e->getElementCnt(), a->getElementCnt());
-      for (size_t i = 0; i < e->getElementCnt(); ++i) {
-        EXPECT_EQ(e->getData()[i], a->getData()[i]);
-      }
-    }
-  }
-  if (expect.ids) {
-    EXPECT_TRUE(actual.ids != nullptr);
-    paddle::VectorT<int>* e = expect.ids.get();
-    paddle::VectorT<int>* a = actual.ids.get();
-    EXPECT_EQ(e->getSize(), a->getSize());
-    for (size_t i = 0; i < e->getSize(); ++i) {
-      EXPECT_EQ(e->getData()[i], a->getData()[i]);
-    }
-  }
-  if (expect.strs) {
-    EXPECT_TRUE(actual.strs != nullptr);
-    std::vector<std::string>* e = expect.strs.get();
-    std::vector<std::string>* a = actual.strs.get();
-    EXPECT_EQ(e->size(), a->size());
-    for (size_t i = 0; i < e->size(); ++i) {
-      EXPECT_EQ((*e)[i], (*a)[i]);
-    }
-  }
-}
 void checkValue(std::vector<paddle::Argument>& arguments,
                picojson::array& arr) {
  // CHECK SLOT 0, Sparse Value.

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1826,7 +1826,7 @@ class FCLayer(LayerBase):
            self.layer_type = 'mkldnn_fc'
            config_assert(
                len(inputs) == 1,
-                "MkldnnFCLayer support one and only one input!")
+                "MKLDNNFCLayer support one and only one input!")
        super(FCLayer, self).__init__(
            name, self.layer_type, size, inputs=inputs, **xargs)
        for input_index in xrange(len(self.inputs)):
@@ -1837,7 +1837,7 @@ class FCLayer(LayerBase):
            sparse = format == "csr" or format == "csc"
            if use_mkldnn:
                config_assert(not sparse,
-                              "MkldnnFCLayer do not support sparse format yet")
+                              "MKLDNNFCLayer do not support sparse format yet")
                if use_mkldnn_wgt:
                    dims = [self.config.size, input_layer.size]
            if sparse:
@@ -1853,7 +1853,7 @@ class FCLayer(LayerBase):
 @config_layer('mkldnn_fc')
-class MkldnnFcLayer(FCLayer):
+class MKLDNNFcLayer(FCLayer):
    layer_type = 'mkldnn_fc'
@@ -3209,6 +3209,18 @@ class SubNestedSequenceLayer(LayerBase):
        self.set_layer_size(size)
+@config_layer('dot_prod')
+class DotProdLayer(LayerBase):
+    def __init__(self, name, inputs, device=None):
+        super(DotProdLayer, self).__init__(
+            name, 'dot_prod', 0, inputs, device=device)
+        config_assert(len(inputs) == 2, 'DotProdLayer must have 2 inputs.')
+        config_assert(
+            self.get_input_layer(0).size == self.get_input_layer(1).size,
+            "Two inputs should have the same size.")
+        self.set_layer_size(1)
 @config_layer('out_prod')
 class OuterProdLayer(LayerBase):
    def __init__(self, name, inputs, device=None):
@@ -3506,11 +3518,17 @@ def ExpressionLayer(name, inputs, **xargs):
 @config_layer('concat')
 class ConcatenateLayer(LayerBase):
+    layer_type = 'concat'
    def __init__(self, name, inputs, bias=False, **xargs):
        config_assert(inputs, 'inputs cannot be empty')
        config_assert(not bias, 'ConcatenateLayer cannot support bias.')
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        if self.layer_type == "mkldnn_concat":
+            config_assert(use_mkldnn, "mkldnn_concat only support MKLDNN")
+        self.layer_type = 'mkldnn_concat' if use_mkldnn else 'concat'
        super(ConcatenateLayer, self).__init__(
-            name, 'concat', 0, inputs=inputs, **xargs)
+            name, self.layer_type, 0, inputs=inputs, **xargs)
        size = 0
        for input_index in xrange(len(self.inputs)):
            assert self.get_input_layer(0).height == self.get_input_layer(
@@ -3530,6 +3548,11 @@ class ConcatenateLayer(LayerBase):
        self.set_layer_size(size)
+@config_layer('mkldnn_concat')
+class MKLDNNConcatLayer(ConcatenateLayer):
+    layer_type = 'mkldnn_concat'
 # like concat layer, but each input layer was processed by a Projection.
 @config_layer('concat2')
 class ConcatenateLayer2(LayerBase):

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -115,6 +115,7 @@ __all__ = [
    'huber_classification_cost',
    'block_expand_layer',
    'maxout_layer',
+    'dot_prod_layer',
    'out_prod_layer',
    'printer_layer',
    'print_layer',
@@ -198,6 +199,7 @@ class LayerType(object):
    SCALING_LAYER = 'scaling'
    TRANS_LAYER = 'trans'
    ROTATE_LAYER = 'rotate'
+    DOT_PROD_LAYER = 'dot_prod'
    OUT_PROD_LAYER = 'out_prod'
    FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
@@ -4143,6 +4145,45 @@ def maxid_layer(input, name=None, layer_attr=None):
        size=l.config.size)
+@wrap_name_default()
+def dot_prod_layer(input1, input2, name=None, layer_attr=None):
+    """
+    A layer for computing the dot product of two vectors.
+    The example usage is:
+    .. code-block:: python
+        dot_prod = dot_prod_layer(input1=vec1, input2=vec2)
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input1: The first input layer.
+    :type input: LayerOutput
+    :param input2: The second input layer.
+    :type input2: LayerOutput
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute.
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input1, LayerOutput)
+    assert isinstance(input2, LayerOutput)
+    assert input1.size == input2.size, ("Two inputs should have the same size.")
+    l = Layer(
+        name=name,
+        type=LayerType.DOT_PROD_LAYER,
+        inputs=[input1.name, input2.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.DOT_PROD_LAYER,
+        parents=[input1, input2],
+        size=l.config.size)
 @wrap_name_default()
 def out_prod_layer(input1, input2, name=None, layer_attr=None):
    """

--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -11,6 +11,6 @@ test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_l
 test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
 test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
 test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer
-test_factorization_machine)
+test_dot_prod_layer test_factorization_machine)
 export whole_configs=(test_split_datasource)
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
+type: "nn"
+layers {
+  name: "vector1"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "vector2"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "__dot_prod_layer_0__"
+  type: "dot_prod"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "vector1"
+  }
+  inputs {
+    input_layer_name: "vector2"
+  }
+}
+input_layer_names: "vector1"
+input_layer_names: "vector2"
+output_layer_names: "__dot_prod_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "vector1"
+  layer_names: "vector2"
+  layer_names: "__dot_prod_layer_0__"
+  input_layer_names: "vector1"
+  input_layer_names: "vector2"
+  output_layer_names: "__dot_prod_layer_0__"
+  is_recurrent_layer_group: false
+}
--- a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
+from paddle.trainer_config_helpers import *
+vec1 = data_layer(name='vector1', size=10)
+vec2 = data_layer(name='vector2', size=10)
+dot_product = dot_prod_layer(input1=vec1, input2=vec2)
+outputs(dot_product)
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -4,7 +4,10 @@ import collections
 import numpy as np
 import copy
-__all__ = ['Block', 'Variable', 'Program', 'Operator', 'default_startup_program', 'default_main_program']
+__all__ = [
+    'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
+    'default_main_program'
+]
 def unique_name(prefix):
@@ -232,17 +235,17 @@ class Operator(object):
                    in_proto.name)
                if found:
-                    in_argus = inputs[in_proto.name]
+                    in_args = inputs[in_proto.name]
-                    if not isinstance(in_argus, list):
+                    if not isinstance(in_args, list):
-                        in_argus = [in_argus]
+                        in_args = [in_args]
-                    if not in_proto.duplicable and len(in_argus) > 1:
+                    if not in_proto.duplicable and len(in_args) > 1:
                        raise ValueError(
                            "Input %s expects only one input, but %d are given."
-                            % (in_proto.name, len(in_argus)))
+                            % (in_proto.name, len(in_args)))
-                    in_argu_names = []
+                    in_arg_names = []
-                    for argu in in_argus:
+                    for arg in in_args:
-                        in_argu_names.append(argu.name)
+                        in_arg_names.append(arg.name)
-                    self.desc.set_input(in_proto.name, in_argu_names)
+                    self.desc.set_input(in_proto.name, in_arg_names)
                else:
                    self.desc.set_input(in_proto.name, [])
@@ -260,18 +263,18 @@ class Operator(object):
                        str(e) for e in given)))
            for out_proto in proto.outputs:
-                out_argus = outputs[out_proto.name]
+                out_args = outputs[out_proto.name]
-                if not isinstance(out_argus, list):
+                if not isinstance(out_args, list):
-                    out_argus = [out_argus]
+                    out_args = [out_args]
-                if not out_proto.duplicable and len(out_argus) > 1:
+                if not out_proto.duplicable and len(out_args) > 1:
                    raise ValueError(
                        "Output %s expects only one output, but %d are given." %
-                        (out_proto.name, len(out_argus)))
+                        (out_proto.name, len(out_args)))
-                out_argu_names = []
+                out_arg_names = []
-                for argu in out_argus:
+                for arg in out_args:
-                    out_argu_names.append(argu.name)
+                    out_arg_names.append(arg.name)
-                    argu.op = self
+                    arg.op = self
-                self.desc.set_output(out_proto.name, out_argu_names)
+                self.desc.set_output(out_proto.name, out_arg_names)
        if attrs is not None:
            if not isinstance(attrs, dict):
@@ -582,8 +585,10 @@ class Parameter(Variable):
 g_main_program = Program()
 g_startup_program = Program()
 def default_startup_program():
    return g_startup_program
 def default_main_program():
    return g_main_program
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
-import paddle.v2.fluid.optimizer as optimizer
 import paddle.v2.fluid.framework as framework
-from paddle.v2.fluid.io import save_persistables, load_persistables
+import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.io import save_persistables, load_persistables
+from paddle.v2.fluid.optimizer import SGDOptimizer
-import numpy as np
+x = layers.data(name='x', shape=[13], data_type='float32')
-x = layers.data(
-    name='x',
-    shape=[13],
-    data_type='float32')
-y_predict = layers.fc(input=x,
+y_predict = layers.fc(input=x, size=1, act=None)
-                      size=1,
-                      act=None)
-y = layers.data(
+y = layers.data(name='y', shape=[1], data_type='float32')
-    name='y',
-    shape=[1],
-    data_type='float32')
-cost = layers.square_error_cost(
+cost = layers.square_error_cost(input=y_predict, label=y)
-    input=y_predict,
-    label=y)
 avg_cost = layers.mean(x=cost)
-sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+sgd_optimizer = SGDOptimizer(learning_rate=0.001)
 opts = sgd_optimizer.minimize(avg_cost)
 BATCH_SIZE = 20

--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
--- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
--- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
--- a/python/paddle/v2/fluid/tests/test_gru_op.py
+++ b/python/paddle/v2/fluid/tests/test_gru_op.py
--- a/python/paddle/v2/framework/tests/test_is_empty_op.py
+++ b/python/paddle/v2/framework/tests/test_is_empty_op.py
 import unittest
 import numpy as np
-from paddle.v2.framework.op import Operator
+from paddle.v2.fluid.op import Operator
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 def create_tensor(scope, name, np_data):