Merge branch 'develop' of upstream into rank_loss_op_dev

f2cfa324 · Yibing Liu · 26b393f9 · d865b047 · f2cfa324 · f2cfa324
94 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ CMakeFiles
 cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
+paddle/pybind/pybind.h
--- a/.travis.yml
+++ b/.travis.yml
@@ -36,10 +36,6 @@ before_install:
  # protobuf version.
  - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
-  - curl https://glide.sh/get | bash
-  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
-  - go get -u github.com/alecthomas/gometalinter
-  - gometalinter --install
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
-if(NOT ANDROID)
+if(NOT ANDROID AND NOT IOS)
    find_package(Boost QUIET)
 endif()
@@ -64,27 +64,29 @@ if(NOT CMAKE_BUILD_TYPE)
      FORCE)
 endif()
-if(ANDROID)
+if(ANDROID OR IOS)
-    if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
+    if(ANDROID)
-        message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
+        if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
-    elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
+            message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
-        # TODO: support glog for Android api 16 ~ 19 in the future
+        elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
-        message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
+            # TODO: support glog for Android api 16 ~ 19 in the future
+            message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
+        endif()
    endif()
    set(WITH_GPU OFF CACHE STRING
-        "Disable GPU when cross-compiling for Android" FORCE)
+        "Disable GPU when cross-compiling for Android and iOS" FORCE)
    set(WITH_AVX OFF CACHE STRING
-        "Disable AVX when cross-compiling for Android" FORCE)
+        "Disable AVX when cross-compiling for Android and iOS" FORCE)
    set(WITH_PYTHON OFF CACHE STRING
-        "Disable PYTHON when cross-compiling for Android" FORCE)
+        "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
    set(WITH_RDMA OFF CACHE STRING
-        "Disable RDMA when cross-compiling for Android" FORCE)
+        "Disable RDMA when cross-compiling for Android and iOS" FORCE)
    set(WITH_MKLDNN OFF CACHE STRING
-        "Disable MKLDNN when cross-compiling for Android" FORCE)
+        "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
    set(WITH_MKLML OFF CACHE STRING
-        "Disable MKLML package when cross-compiling for Android" FORCE)
+        "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
-endif(ANDROID)
+endif()
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
  "A path setting third party libraries download & build directories.")

--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -171,3 +171,10 @@ if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
  add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
  message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
 endif()
+if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)
+  set(CBLAS_FOUND ON)
+  set(CBLAS_PROVIDER vecLib)
+  set(CBLAS_INC_DIR ${VECLIB_INC_DIR})
+  add_definitions(-DPADDLE_USE_VECLIB)
+endif()
--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This is a toolchain file for cross-compiling for iOS, and the
+# configuration largely refers to public toolchain file:
+#    https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake
+# and
+#    https://github.com/cristeab/ios-cmake
+#
+# Supports options:
+# IOS_PLATFORM = OS (default), SIMULATOR or WATCHOS
+#   This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
+#   OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
+#   SIMULATOR - used to build for the Simulator platforms, which have an x86 arch.
+# IOS_ARCH
+#   The architectures to build for, such as "arm64" or "armv7;arm64"
+# IOS_DEPLOYMENT_TARGET
+#   The minimum iOS deployment version, such as "7.0"
+# IOS_ENABLE_BITCODE = ON (default) or OFF
+# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON
+# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
+#   By default this location is automatically chosen based on the IOS_PLATFORM value above.
+#   If set manually, it will override the default location and force the use of a particular Developer Platform
+# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
+#   By default this location is automatically chosen based on the IOS_DEVELOPER_ROOT value.
+#   In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path.
+#   If set manually, this will force the use of a specific SDK version
+# Macros:
+# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
+#  A convenience macro for setting xcode specific properties on targets
+#  example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
+# find_host_package (PROGRAM ARGS)
+#  A macro used to find executable programs on the host system, not within the iOS environment.
+#  Thanks to the android-cmake project for providing the command
+if(NOT IOS)
+  return()
+endif()
+set(CMAKE_SYSTEM_NAME Darwin)
+# Get the Xcode version being used.
+execute_process(COMMAND xcodebuild -version
+                OUTPUT_VARIABLE XCODE_VERSION
+                RESULT_VARIABLE XCODE_VERSION_RESULT
+                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(NOT ${XCODE_VERSION_RESULT})
+  string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}")
+  string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}")
+  message(STATUS "Building with Xcode version: ${XCODE_VERSION}")
+else()
+  message(FATAL_ERROR "Cannot execute xcodebuild, please check whether xcode is installed.")
+endif()
+# Required as of cmake 2.8.10
+set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
+# Setup iOS platform unless specified manually with IOS_PLATFORM
+if(NOT DEFINED IOS_PLATFORM)
+  set(IOS_PLATFORM "OS")
+endif()
+set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
+# Set the architecture for iOS
+if(NOT DEFINED IOS_ARCH)
+  if(IOS_PLATFORM STREQUAL "OS")
+    # FIXME(liuyiqun): support "armv7;armv7s;arm64" in the future
+    set(IOS_ARCH "arm64")
+  elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
+    set(IOS_ARCH "i386;x86_64")
+  elseif(IOS_PLATFORM STREQUAL "WATCHOS")
+    set(IOS_ARCH armv7k)
+  endif()
+endif()
+set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")
+# Specify minimum iOS deployment version
+if(NOT DEFINED IOS_DEPLOYMENT_TARGET)
+  set(IOS_DEPLOYMENT_TARGET "7.0")
+endif()
+set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version")
+# Whether to enable bitcode
+if(NOT DEFINED IOS_ENABLE_BITCODE)
+  set(IOS_ENABLE_BITCODE ON)
+endif()
+set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode")
+if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS)
+  set(IOS_USE_VECLIB_FOR_BLAS OFF)
+endif()
+set(IOS_USE_VECLIB_FOR_BLAS ${IOS_USE_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib")
+# Check the platform selection and setup for developer root
+if(${IOS_PLATFORM} STREQUAL "OS")
+  set(IOS_PLATFORM_LOCATION "iPhoneOS.platform")
+  set(XCODE_IOS_PLATFORM iphoneos)
+  # This causes the installers to properly locate the output libraries
+  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
+elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR")
+  set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
+  set(XCODE_IOS_PLATFORM iphonesimulator)
+  # This causes the installers to properly locate the output libraries
+  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
+elseif(${IOS_PLATFORM} STREQUAL "WATCHOS")
+  set(IOS_PLATFORM_LOCATION "WatchOS.platform")
+  set(XCODE_IOS_PLATFORM watchos)
+  # This causes the installers to properly locate the output libraries
+  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos")
+else(${IOS_PLATFORM} STREQUAL "OS")
+  message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please set to\n"
+          "\t OS, SIMULATOR, or WATCHOS.")
+endif()
+# Check iOS developer toolchain
+if(NOT DEFINED IOS_DEVELOPER_ROOT)
+  # Setup iOS developer location
+  execute_process(COMMAND xcode-select -print-path
+                  OUTPUT_VARIABLE XCODE_DEVELOPER_DIR
+                  RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT
+                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  # Xcode 4.3 changed the installation location, choose the most recent one available
+  if(${XCODE_VERSION} VERSION_LESS "4.3.0")
+    set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
+  else()
+    set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
+  endif()
+endif()
+if(EXISTS ${IOS_DEVELOPER_ROOT})
+  set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")
+else()
+  message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.")
+endif()
+# Check iOS SDK
+if(NOT DEFINED IOS_SDK_ROOT)
+  # Find and use the most recent iOS sdk
+  file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*")
+  if(IOS_SDK_LISTS)
+    list(SORT IOS_SDK_LISTS)
+    list(REVERSE IOS_SDK_LISTS)
+    list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT)
+  else(IOS_SDK_LISTS)
+    message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}."
+            " Please manually set IOS_SDK_ROOT or install the iOS SDK.")
+  endif(IOS_SDK_LISTS)
+endif()
+if(EXISTS ${IOS_SDK_ROOT})
+  set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")
+  message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}")
+else()
+  message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.")
+endif()
+# Set the sysroot default to the most recent SDK
+set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
+# Get version of iOS SDK
+execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion
+                OUTPUT_VARIABLE IOS_SDK_VERSION
+                RESULT_VARIABLE IOS_SDK_VERSION_RESULT
+                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(${IOS_SDK_VERSION_RESULT})
+  string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}")
+endif()
+if(NOT IOS_SDK_VERSION)
+  message(WARNING "Cannot get SDK's version.")
+  set(IOS_SDK_VERSION 1)
+endif()
+set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION})
+# Find the C & C++ compilers for the specified SDK.
+if(NOT CMAKE_C_COMPILER)
+  # Default to use clang
+  execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang
+                  OUTPUT_VARIABLE IOS_C_COMPILER
+                  RESULT_VARIABLE IOS_C_COMPILER_RESULT
+                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(${IOS_C_COMPILER_RESULT})
+    get_filename_component(IOS_C_COMPILER clang PROGRAM)
+  endif()
+else(NOT CMAKE_C_COMPILER)
+  # User can set it in cmake command
+  get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
+endif(NOT CMAKE_C_COMPILER)
+if(NOT EXISTS ${IOS_C_COMPILER})
+  message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}")
+endif()
+if(NOT CMAKE_CXX_COMPILER)
+  # Default to use clang++
+  execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++
+                  OUTPUT_VARIABLE IOS_CXX_COMPILER
+                  RESULT_VARIABLE IOS_CXX_COMPILER_RESULT
+                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(${IOS_CXX_COMPILER_RESULT})
+    get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM)
+  endif()
+else(NOT CMAKE_CXX_COMPILER)
+  # User can set it in cmake command
+  get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
+endif(NOT CMAKE_CXX_COMPILER)
+if(NOT EXISTS ${IOS_CXX_COMPILER})
+  message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}")
+endif()
+set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE)
+set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
+set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
+set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
+set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
+set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
+# Set iOS specific C/C++ flags
+if(IOS_PLATFORM STREQUAL "OS")
+  if(XCODE_VERSION VERSION_LESS "7.0")
+    set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}")
+  else()
+    # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM.
+    set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}")
+  endif()
+else()
+  set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
+endif()
+if(IOS_ENABLE_BITCODE)
+  set(XCODE_IOS_BITCODE_FLAGS "-fembed-bitcode")
+else()
+  set(XCODE_IOS_BITCODE_FLAGS "")
+endif()
+set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}")
+# Hidden visibility is required for cxx on iOS
+set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
+set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
+set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
+if(IOS_USE_VECLIB_FOR_BLAS)
+  # Find vecLib for iOS
+  set(VECLIB_SEARCH_DIRS
+      ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks
+      ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks
+      )
+  find_path(VECLIB_INC_DIR vecLib.h PATHS ${VECLIB_SEARCH_DIRS}/vecLib.framework/Headers)
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR)
+  if(VECLIB_FOUND)
+    if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*")
+      set(IOS_LINK_FLAGS "${IOS_LINK_FLAGS} -lcblas -framework vecLib")
+      message(STATUS "Found standalone vecLib.framework")
+    else()
+      set(IOS_LINK_FLAGS "${IOS_LINK_FLAGS} -lcblas -framework Accelerate")
+      message(STATUS "Found vecLib as part of Accelerate.framework")
+    endif()
+  endif()
+endif()
+set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}")
+set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}")
+set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
+if(NOT IOS_ENABLE_BITCODE)
+  set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
+  set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
+else()
+  set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib")
+  set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle")
+endif()
+set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
+set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
+set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
+# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree
+# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
+# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
+# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
+if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
+  find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
+endif()
+# Set the find root to the iOS developer roots and to user defined paths
+set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH}
+    CACHE STRING "iOS find search path root")
+# default to searching for frameworks first
+set(CMAKE_FIND_FRAMEWORK FIRST)
+# set up the default search directories for frameworks
+set(CMAKE_SYSTEM_FRAMEWORK_PATH
+    ${IOS_SDK_ROOT}/System/Library/Frameworks
+    ${IOS_SDK_ROOT}/System/Library/PrivateFrameworks
+    ${IOS_SDK_ROOT}/Developer/Library/Frameworks
+    )
+# only search the iOS sdks, not the remainder of the host filesystem
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', "
+        "building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'")
+message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
+message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+# Used in ExternalProject command
+string(REPLACE ";" "\\$<SEMICOLON>" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+set(EXTERNAL_OPTIONAL_ARGS
+    -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT}
+    -DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES})
+# This little macro lets you set any XCode specific property
+macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
+  set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
+endmacro(set_xcode_property)
+# This macro runs find_package() against the host system (find-root modes relaxed, IOS unset), then restores the toolchain's settings
+macro(find_host_package)
+  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
+  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
+  set(IOS FALSE)
+  find_package(${ARGN})
+  set(IOS TRUE)
+  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+endmacro(find_host_package)
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -39,13 +39,14 @@ ExternalProject_Add(
    PREFIX          ${GFLAGS_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
+                    -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DBUILD_TESTING=OFF
+                    -DBUILD_TESTING=OFF
-    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=Release
+                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_BUILD_TYPE:STRING=Release

--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -34,16 +34,17 @@ ExternalProject_Add(
    PREFIX          ${GLOG_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
+                    -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
-    CMAKE_ARGS      -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
+                    -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
-    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DWITH_GFLAGS=ON
+                    -DWITH_GFLAGS=ON
-    CMAKE_ARGS      -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
+                    -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
-    CMAKE_ARGS      -DBUILD_TESTING=OFF
+                    -DBUILD_TESTING=OFF
-    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=Release
+                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
                     -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON

--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -48,15 +48,16 @@ IF(WITH_TESTING)
        PREFIX          ${GTEST_SOURCES_DIR}
        UPDATE_COMMAND  ""
        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-        CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-        CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-        CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
+                        -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
-        CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-        CMAKE_ARGS      -DBUILD_GMOCK=ON
+                        -DBUILD_GMOCK=ON
-        CMAKE_ARGS      -Dgtest_disable_pthreads=ON
+                        -Dgtest_disable_pthreads=ON
-        CMAKE_ARGS      -Dgtest_force_shared_crt=ON
+                        -Dgtest_force_shared_crt=ON
-        CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                        -DCMAKE_BUILD_TYPE=Release
+                        ${EXTERNAL_OPTIONAL_ARGS}
        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                         -DCMAKE_BUILD_TYPE:STRING=Release

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -29,30 +29,41 @@ IF(NOT ${CBLAS_FOUND})
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
        CACHE FILEPATH "openblas library." FORCE)
-    IF(APPLE)
+    SET(OPENBLAS_CC "${CMAKE_C_COMPILER}")
-        SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
-        SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
-    ELSE()
-        SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
-    ENDIF()
    IF(CMAKE_CROSSCOMPILING)
+        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
+        GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
+        SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
        IF(ANDROID)
            # arm_soft_fp_abi branch of OpenBLAS to support softfp
            #   https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
            IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
-                SET(TARGET "ARMV7")
+                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
            ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
-                SET(TARGET "ARMV8")
+                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
+            ENDIF()
+        ELSEIF(IOS)
+            # FIXME(liuyiqun): support multiple architectures
+            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
+            SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
+            IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
+                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
+                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
+            ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
+                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
            ENDIF()
-            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0)
        ELSEIF(RPI)
            # use hardfp
            SET(OPENBLAS_COMMIT "v0.2.20")
-            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0)
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
        ENDIF()
    ELSE()
+        IF(APPLE)
+            SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
+        ENDIF()
        SET(OPENBLAS_COMMIT "v0.2.20")
        SET(OPTIONAL_ARGS "")
        IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
@@ -60,6 +71,8 @@ IF(NOT ${CBLAS_FOUND})
        ENDIF()
    ENDIF()
+    SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
    ExternalProject_Add(
        extern_openblas
        ${EXTERNAL_PROJECT_LOG_ARGS}

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -173,7 +173,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
            "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
            "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
            "-Dprotobuf_WITH_ZLIB=ON"
-            "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}")
+            "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}"
+            ${EXTERNAL_OPTIONAL_ARGS})
        SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
    ENDIF()

--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -12,16 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-INCLUDE(ExternalProject)
+IF(NOT WITH_PYTHON)
+    return()
+ENDIF()
 INCLUDE(python_module)
 FIND_PACKAGE(PythonInterp 2.7)
-IF(WITH_PYTHON)
+FIND_PACKAGE(PythonLibs 2.7)
-    FIND_PACKAGE(PythonLibs 2.7)
+# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
-    # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
+ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
-    ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
-    SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
-ENDIF(WITH_PYTHON)
 SET(py_env "")
 IF(PYTHONINTERP_FOUND)
@@ -36,9 +37,5 @@ IF(PYTHONINTERP_FOUND)
    ENDIF()
 ENDIF(PYTHONINTERP_FOUND)
-IF(WITH_PYTHON)
+INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
-    INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
-    INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
-ELSE()
-    SET(PYTHON_LIBRARIES "")
-ENDIF()
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+IF(NOT WITH_SWIG_PY)
+    return()
+ENDIF()
 FIND_PACKAGE(SWIG)
 IF(NOT SWIG_FOUND)

--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -16,25 +16,14 @@ INCLUDE(ExternalProject)
 SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
-SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE)
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
+    CACHE PATH "Warp-ctc Directory" FORCE)
-SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE)
+# Used in unit test test_WarpCTCLayer
+SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
-IF(WIN32)
+    CACHE PATH "Warp-ctc Library Directory" FORCE)
-    SET(WARPCTC_LIBRARIES
+SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
-        "${WARPCTC_INSTALL_DIR}/lib/warpctc.dll" CACHE FILEPATH "Warp-ctc Library" FORCE)
+    CACHE FILEPATH "Warp-ctc Library" FORCE)
-ELSE(WIN32)
-    IF(APPLE)
-        SET(_warpctc_SHARED_SUFFIX dylib)
-    ELSE(APPLE)
-        SET(_warpctc_SHARED_SUFFIX so)
-    ENDIF(APPLE)
-    SET(WARPCTC_LIBRARIES
-        "${WARPCTC_INSTALL_DIR}/lib/libwarpctc.${_warpctc_SHARED_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE)
-ENDIF(WIN32)
 IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
    SET(USE_OMP OFF)
@@ -49,22 +38,26 @@ ExternalProject_Add(
    PREFIX          ${WARPCTC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+                    -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-    CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
+                    -DWITH_GPU=${WITH_GPU}
-    CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
+                    -DWITH_OMP=${USE_OMP}
-    CMAKE_ARGS      -DWITH_TORCH=OFF
+                    -DWITH_TORCH=OFF
-    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
+                    -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-    CMAKE_ARGS      -DBUILD_SHARED=ON
+                    -DBUILD_SHARED=ON
-    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=Release
+                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
+MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
 ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
 ADD_DEPENDENCIES(warpctc extern_warpctc)

--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -34,15 +34,16 @@ ExternalProject_Add(
    GIT_TAG         "v1.2.8"
    PREFIX          ${ZLIB_SOURCES_DIR}
    UPDATE_COMMAND  ""
-    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    CMAKE_ARGS      -DBUILD_SHARED_LIBS=OFF
+                    -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
-    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DBUILD_SHARED_LIBS=OFF
-    CMAKE_ARGS      -DCMAKE_MACOSX_RPATH=ON
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_MACOSX_RPATH=ON
+                    -DCMAKE_BUILD_TYPE=Release
+                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_BUILD_TYPE:STRING=Release

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -128,8 +128,10 @@ set(GPU_COMMON_FLAGS
 )
 if (APPLE)
-    # On Mac OS X build fat binaries with x86_64 architectures by default.
+    if(NOT CMAKE_CROSSCOMPILING)
-    set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
+        # On Mac OS X build fat binaries with x86_64 architectures by default.
+        set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
+    endif()
 else()
    set(GPU_COMMON_FLAGS
        -Wall

--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -24,11 +24,10 @@ IF(WIN32)
    SET(HOST_SYSTEM "win32")
 ELSE(WIN32)
    IF(APPLE)
-        EXEC_PROGRAM (sw_vers ARGS -productVersion OUTPUT_VARIABLE MACOSX_VERSION)
-        STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}")
-        SET(MACOS_VERSION ${VERSION})
        SET(HOST_SYSTEM "macosx")
-        IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
+        EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION)
+        STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}")
+        # Test the environment entry itself (ENV{...}), not its expanded value:
+        # `DEFINED $ENV{X}` would check a variable named after X's contents.
+        IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
            # Set cache variable - end user may change this during ccmake or cmake-gui configure.
            SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
                "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
@@ -49,6 +48,8 @@ ELSE(WIN32)
            ELSEIF(LINUX_ISSUE MATCHES "Fedora")
                SET(HOST_SYSTEM "fedora")
            ENDIF()
+            STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}")
        ENDIF(EXISTS "/etc/issue")
        IF(EXISTS "/etc/redhat-release")
@@ -70,7 +71,7 @@ CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
 MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
-MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
+MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}")
 MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
 # configuration for cross-compiling
@@ -82,6 +83,9 @@ IF(DEFINED CMAKE_SYSTEM_NAME)
    ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi")
        SET(RPI TRUE)
        INCLUDE(cross_compiling/raspberry_pi)
+    ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
+        SET(IOS TRUE)
+        INCLUDE(cross_compiling/ios)
    ENDIF()
 ENDIF()

--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -25,7 +25,9 @@ function(target_circle_link_libraries TARGET_NAME)
            endif()
        endforeach()
        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-            list(APPEND LIBS "-undefined dynamic_lookup")
+            if(IOS AND NOT IOS_ENABLE_BITCODE)
+                list(APPEND LIBS "-undefined dynamic_lookup")
+            endif()
        endif()
        list(REVERSE libsInArgn)
        target_link_libraries(${TARGET_NAME}

--- a/doc/design/refactorization.md
+++ b/doc/design/refactorization.md
+# Design Doc: Refactorization Overview
+The goals of the refactorization include:
+1. Make it easy for external contributors to write new elementary computation operations.
+1. Make the codebase clean and readable.
+1. Introduce a new design of computation representation -- a computation graph of operators and variables.
+1. The graph representation helps implementing auto-scalable and auto fault recoverable distributed computing.
+## Computation Graphs
+1. PaddlePaddle represents the computation, training, and inference of DL models by computation graphs.
+  1. Please dig into [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a solid example.
+1. Users write Python programs to describe the graphs and run them (locally or remotely).
+1. A graph is composed of *variables* and *operators*.
+1. The description of graphs must be able to be serialized/deserialized, so it
+   1. can be sent to the cloud for distributed execution, and
+   1. be sent to clients for mobile or enterprise deployment.
+1. The Python program does
+   1. *compilation*: runs a Python program to generate a protobuf message representation of the graph and send it to
+      1. the C++ library `libpaddle.so` for local execution,
+      1. the master process of a distributed training job for training, or
+      1. the server process of a Kubernetes serving job for distributed serving.
+   1. *execution*: according to the protobuf message, constructs instances of class `Variable` and `OperatorBase`, and runs them.
+## Description and Realization
+At compile time, the Python program generates protobuf message representation of the graph, or the description of the graph.
+At runtime, the C++ program realizes the graph and runs it.
+| | Representation (protobuf messages) | Realization (C++ class objects) |
+|---|---|---|
+|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
+|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
+|Block|BlockDesc|Block|
+The word *graph* is interchangeable with *block* in this document.  A graph represents computation steps and local variables, similar to a C++/Java program block, or a pair of { and }.
+## Compilation and Execution
+1. Run an application Python program to describe the graph.  In particular,
+   1. create VarDesc to represent local/intermediate variables,
+   1. create operators and set attributes,
+   1. validate attribute values,
+   1. infer the type and the shape of variables,
+   1. plan for memory-reuse for variables,
+   1. generate the backward and optimization parts of the graph,
+   1. possibly split the graph for distributed training.
+1. The invocation of `train` or `infer` in the application Python program:
+   1. create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
+      1. realize local variables defined in the BlockDesc message in the new scope,
+      1. a scope is similar to the stack frame in programming languages,
+   1. create an instance of class `Block`, in which,
+      1. realize operators in the BlockDesc message,
+   1. run the Block by calling
+      1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
+      1. `Block::Eval(vector<Operator>* targets)` for optimization.
+## Intermediate Representation (IR)
+```text
+Compile Time -> IR -> Runtime
+```
+### Benefit
+- Optimization
+  ```text
+  Compile Time -> IR -> Optimized IR -> Runtime
+  ```
+- Send automatically partitioned IR to different nodes.
+  - Automatic data parallel
+    ```text
+    Compile Time
+    |-> Single GPU IR
+        |-> [trainer-IR-0, trainer-IR-1, pserver-IR]
+            |-> Node-0 (runs trainer-IR-0)
+            |-> Node-1 (runs trainer-IR-1)
+            |-> Node-2 (runs pserver-IR)
+    ```
+  - Automatic model parallel (planned for future)
+---
+# Operator/OpWithKernel/OpKernel
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
+---
+# Operator
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
+* `Operator` is the fundamental building block as the user interface.
+    * Operator stores input/output variable name, and attributes.
+    * The `InferShape` interface is used to infer output variable shapes by its input shapes.
+    * Use `Run` to compute `input variables` to `output variables`.
+---
+# OpWithKernel/Kernel
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
+* `OpWithKernel` inherits `Operator`.
+* `OpWithKernel` contains a Kernel map.
+    * `OpWithKernel::Run` gets the device's kernel and invokes `OpKernel::Compute`.
+    * `OpKernelKey` is the map key. Only device place now, but may be data type later.
+---
+# Why separate Kernel and Operator
+* Separate GPU and CPU code.
+    * Make Paddle able to run without a GPU.
+* Allow one operator (which is the user interface) to contain many implementations.
+    * Same mul op, different FP16 and FP32 kernels; different MKL and Eigen kernels.
+---
+# Libraries for Kernel development
+* `Eigen::Tensor` contains basic math and element-wise functions.
+    * Note that `Eigen::Tensor` has broadcast implementation.
+    * Limit number of `tensor.device(dev) = ` in your code.
+* `thrust::transform` and `std::transform`.
+    * `thrust` has the same API as C++ standard library. Using `transform` can quickly implement a customized elementwise kernel.
+    * `thrust` has more complex API, like `scan`, `reduce`, `reduce_by_key`.
+* Hand-writing `GPUKernel` and `CPU` code
+    * Do not write `.h`. CPU kernel should be in `.cc`. GPU kernel should be in `.cu`. (`GCC` cannot compile GPU code.)
+---
+# Operator Register
+## Why register is necessary?
+We need a method to build mappings between Op type names and Op classes.
+## How to do the register?
+Maintain a map, whose key is the type name and value is corresponding Op constructor.
+---
+# The Registry Map
+### `OpInfoMap`
+`op_type(string)` -> `OpInfo`
+`OpInfo`:
+- **`creator`**: The Op constructor.
+- **`grad_op_type`**: The type of the gradient Op.
+- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes.
+- **`checker`**: Used to check attributes.
+---
+# Related Concepts
+### Op_Maker
+Its constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
+### Register Macros
+```cpp
+REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
+REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
+```
+### `USE` Macros
+Make sure the registration process is executed and linked.
+---
+# Register Process
+1. Write the Op class, as well as its gradient Op class if there is one.
+2. Write Op maker class. In the constructor, describe its inputs, outputs, and attributes.
+3. Invoke macro `REGISTER_OP`. The macro will
+	1. call maker class to complete `proto` and `checker`
+	2. with the completed `proto` and `checker`, build a new key-value pair in the `OpInfoMap`
+4. Invoke `USE` macro in where the Op is used to make sure it is linked.
+---
+# Backward Module (1/2)
+### Create Backward Operator
+- Mapping from forwarding Op to backward Op
+![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
+---
+# Backward Module (2/2)
+### Build Backward Network
+- **Input** graph of forwarding operators
+- **Output** graph of backward operators
+- **corner case in construction**
+	- shared variable => insert `Add` operator
+	- no gradient => insert `fill_zero_grad` operator
+	- recursive netOp => call `Backward` recursively
+	- RNN Op => recursively call `Backward` on stepnet
+---
+# Scope, Variable, Tensor
+* `Tensor` is an n-dimension array with type.
+	* Only dims and data pointers are stored in `Tensor`.
+	* All operations on `Tensor` are written in `Operator` or global functions.
+	* variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+* `Variable` is the inputs and outputs of an operator. Not just `Tensor`.
+	* step_scopes in RNN is a variable and not a tensor.
+* `Scope` is where variables are stored.
+	* map<string/*var name */, Variable>
+	* `Scope` has a hierarchical structure. The local scope can get variable from its parent scope.
+---
+# Block (in design)
+## the difference with original RNNOp
+- as an operator is more intuitive than `RNNOp`,
+- offers new interface `Eval(targets)` to deduce the minimal block to `Run`,
+- fits the compile-time/ runtime separation design.
+  - during the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc`
+  - when graph executes, a Block with `BlockDesc` passed in creates `Op` and `Var` then `Run`
+---
+# Milestone
+- take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
+- model migration
+  - framework development gives **priority support** to model migration, for example,
+    - the MNIST demo needs a Python interface,
+    - the RNN models require the framework to support `LoDTensor`.
+  - determine some timelines,
+  - heavily-relied Ops need to be migrated first,
+  - different models can be migrated in parallel.
+- improve the framework at the same time
+- accept imperfection, concentrate on solving the specific problem at the right price.
+---
+# Control the migration quality
+- compare the performance of migrated models with old ones.
+- follow google C style
+- build the automatic workflow of generating Python/C++ documentations
+  - the documentation of layers and ops should be written inside the code
+  - take the documentation quality into account when doing PR
+  - preview the documentations, read and improve them from users' perspective
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -19,7 +19,7 @@ if(Boost_FOUND)
 endif()
 if(WITH_C_API)
-    add_subdirectory(capi)
+  add_subdirectory(capi)
 endif()
 if(WITH_SWIG_PY)

--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -28,42 +28,38 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
 add_dependencies(paddle_capi paddle_proto)
 # combine all paddle static libraries together, into libpaddle_capi_whole.a
 # user should use PaddleCAPI as -lpaddle_capi_whole
-set(capi_whole_library libpaddle_capi_whole.a)
+set(PADDLE_CAPI_INFER_LIBS
-add_custom_target(paddle_capi_whole ALL
+    paddle_utils
-        COMMAND mkdir -p o_files/capi && cd o_files/capi/ && ar -x $<TARGET_FILE:paddle_capi>
+    paddle_parameter
-        COMMAND mkdir -p o_files/utils && cd o_files/utils/ && ar -x $<TARGET_FILE:paddle_utils>
+    paddle_math
-        COMMAND mkdir -p o_files/parameter && cd o_files/parameter/ && ar -x $<TARGET_FILE:paddle_parameter>
+    paddle_cuda
-        COMMAND mkdir -p o_files/math && cd o_files/math/  && ar -x $<TARGET_FILE:paddle_math>
+    paddle_function
-        COMMAND mkdir -p o_files/cuda && cd o_files/cuda/ && ar -x $<TARGET_FILE:paddle_cuda>
+    paddle_gserver
-        COMMAND mkdir -p o_files/function && cd o_files/function/ && ar -x $<TARGET_FILE:paddle_function>
+    paddle_proto
-        COMMAND mkdir -p o_files/gserver && cd o_files/gserver/ && ar -x $<TARGET_FILE:paddle_gserver>
+    paddle_pserver
-        COMMAND mkdir -p o_files/proto && cd o_files/proto/ && ar -x $<TARGET_FILE:paddle_proto>
+    paddle_network)
-        COMMAND mkdir -p o_files/network && cd o_files/network/ && ar -x $<TARGET_FILE:paddle_network>
-        COMMAND mkdir -p o_files/pserver && cd o_files/pserver/ && ar -x $<TARGET_FILE:paddle_pserver>
+cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
-        COMMAND ar crs ${capi_whole_library} `find ./o_files -name '*.o'`
-        COMMAND rm -rf o_files
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-        DEPENDS paddle_capi paddle_utils paddle_parameter paddle_math
-                paddle_cuda paddle_function paddle_gserver
-                paddle_proto paddle_pserver paddle_network
-        )
-set_target_properties(paddle_capi_whole
-  PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library})
-set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
+# No shared library for iOS
-# TODO: merge mkl into paddle_capi_shared
+if(NOT IOS)
-add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
+  set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
-set_target_properties(paddle_capi_shared	PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  # TODO: merge mkl into paddle_capi_shared
-target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+  add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
-link_paddle_exe(paddle_capi_shared)
+  set_target_properties(paddle_capi_shared	PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+  link_paddle_exe(paddle_capi_shared)
+endif()
 # install library & headers.
 install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
 if(ANDROID)
+  install(TARGETS paddle_capi_whole paddle_capi_shared
+          ARCHIVE DESTINATION lib/${ANDROID_ABI}
+          LIBRARY DESTINATION lib/${ANDROID_ABI})
  execute_process(
    COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1
    OUTPUT_VARIABLE GIT_COMMITS_LIST
@@ -72,9 +68,6 @@ if(ANDROID)
  if(${GIT_COMMITS_LIST_RESULT})
    set(GIT_COMMITS_LIST "No commits.")
  endif()
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library}
-          DESTINATION lib/${ANDROID_ABI})
-  install(TARGETS paddle_capi_shared DESTINATION lib/${ANDROID_ABI})
  install(CODE "FILE(WRITE ${CMAKE_INSTALL_PREFIX}/lib/${ANDROID_ABI}/BUILD.txt
          \"Compiler:\n\"
          \"\\t${CMAKE_C_COMPILER}\\n\"
@@ -88,8 +81,11 @@ if(ANDROID)
      )"
  )
 else(ANDROID)
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib)
+  install(TARGETS paddle_capi_whole
-  install(TARGETS paddle_capi_shared DESTINATION lib)
+          ARCHIVE DESTINATION lib)
+  if(NOT IOS)
+    install(TARGETS paddle_capi_shared DESTINATION lib)
+  endif()
 endif(ANDROID)
 # this variable used for unittest

--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@@ -22,10 +22,10 @@ limitations under the License. */
 */
 typedef enum {
  HL_POOLING_MAX = 0,
-  // average includes padded values
-  HL_POOLING_AVERAGE = 1,
  // average does not include padded values
-  HL_POOLING_AVERAGE_EXCLUDE_PADDING = 2,
+  HL_POOLING_AVERAGE = 1,
+  // average includes padded values
+  HL_POOLING_AVERAGE_INCLUDE_PADDING = 2,
  HL_POOLING_END
 } hl_pooling_mode_t;

--- a/paddle/cuda/include/hl_tensor_ops.h
+++ b/paddle/cuda/include/hl_tensor_ops.h
@@ -461,7 +461,7 @@ class add<float32x4_t> {
 public:
  INLINE float32x4_t operator()(const float32x4_t a,
                                const float32x4_t b) const {
-    return vmulq_f32(a, b);
+    return vaddq_f32(a, b);
  }
 };

--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -211,13 +211,11 @@ __global__ void KeAvgPoolForward(const int nthreads,
    int hstart = ph * strideH - padH;
    int wstart = pw * strideW - padW;
-    int hend = min(hstart + sizeY, height + padH);
+    int hend = min(hstart + sizeY, height);
-    int wend = min(wstart + sizeX, width + padW);
+    int wend = min(wstart + sizeX, width);
-    int pool_size = (hend - hstart) * (wend - wstart);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
-    hend = min(hend, height);
+    int pool_size = (hend - hstart) * (wend - wstart);
-    wend = min(wend, width);
    real aveval = 0;
    inputData += (frameNum * channels + c) * height * width;
@@ -299,12 +297,14 @@ __global__ void KeAvgPoolBackward(const int nthreads,
    outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
    for (int ph = phstart; ph < phend; ++ph) {
+      int hstart = ph * strideH - padH;
+      int hend = min(hstart + sizeY, height);
+      hstart = max(hstart, 0);
      for (int pw = pwstart; pw < pwend; ++pw) {
        // figure out the pooling size
-        int hstart = ph * strideH - padH;
        int wstart = pw * strideW - padW;
-        int hend = min(hstart + sizeY, height + padH);
+        int wend = min(wstart + sizeX, width);
-        int wend = min(wstart + sizeX, width + padW);
+        wstart = max(wstart, 0);
        int poolsize = (hend - hstart) * (wend - wstart);
        gradient += outGrad[ph * pooledW + pw] / poolsize;
      }
@@ -600,16 +600,13 @@ __global__ void KeAvgPool3DForward(const int nthreads,
    int dstart = pd * strideD - padD;
    int hstart = ph * strideH - padH;
    int wstart = pw * strideW - padW;
-    int dend = min(dstart + sizeZ, depth + padD);
+    int dend = min(dstart + sizeZ, depth);
-    int hend = min(hstart + sizeY, height + padH);
+    int hend = min(hstart + sizeY, height);
-    int wend = min(wstart + sizeX, width + padW);
+    int wend = min(wstart + sizeX, width);
-    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
    dstart = max(dstart, 0);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
-    dend = min(dend, depth);
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
-    hend = min(hend, height);
-    wend = min(wend, width);
    real aveval = 0;
    inputData += (frameNum * channels + c) * depth * height * width;
@@ -712,15 +709,18 @@ __global__ void KeAvgPool3DBackward(const int nthreads,
    outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
    for (int pd = pdstart; pd < pdend; ++pd) {
+      int dstart = pd * strideD - padD;
+      int dend = min(dstart + sizeZ, depth);
+      dstart = max(dstart, 0);
      for (int ph = phstart; ph < phend; ++ph) {
+        int hstart = ph * strideH - padH;
+        int hend = min(hstart + sizeY, height);
+        hstart = max(hstart, 0);
        for (int pw = pwstart; pw < pwend; ++pw) {
          // figure out the pooling size
-          int dstart = pd * strideD - padD;
-          int hstart = ph * strideH - padH;
          int wstart = pw * strideW - padW;
-          int dend = min(dstart + sizeZ, depth + padD);
+          int wend = min(wstart + sizeX, width);
-          int hend = min(hstart + sizeY, height + padH);
+          wstart = max(wstart, 0);
-          int wend = min(wstart + sizeX, width + padW);
          int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
          gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
        }

--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -432,11 +432,11 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
      cudnn_mode = CUDNN_POOLING_MAX;
      break;
    case HL_POOLING_AVERAGE:
-      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-      break;
-    case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
      break;
+    case HL_POOLING_AVERAGE_INCLUDE_PADDING:
+      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+      break;
    default:
      LOG(FATAL) << "parameter mode error";
  }

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,14 +22,14 @@ namespace framework {
 template <>
 Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return *device_context_->get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
 }
 #ifndef PADDLE_ONLY_CPU
 template <>
 Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return *device_context_->get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
 }
 #endif

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -366,7 +366,7 @@ struct EigenDeviceConverter<platform::GPUPlace> {
 class ExecutionContext : public InferShapeContext {
 public:
  ExecutionContext(const OperatorBase& op, const Scope& scope,
-                   const platform::DeviceContext* device_context)
+                   const platform::DeviceContext& device_context)
      : InferShapeContext(op, scope), device_context_(device_context) {}
  template <typename PlaceType,
@@ -374,9 +374,9 @@ class ExecutionContext : public InferShapeContext {
                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
  DeviceType& GetEigenDevice() const;
-  platform::Place GetPlace() const { return device_context_->GetPlace(); }
+  platform::Place GetPlace() const { return device_context_.GetPlace(); }
-  const platform::DeviceContext* device_context() const {
+  const platform::DeviceContext& device_context() const {
    return device_context_;
  }
@@ -401,7 +401,8 @@ class ExecutionContext : public InferShapeContext {
    return res;
  }
-  const platform::DeviceContext* device_context_;
+ private:
+  const platform::DeviceContext& device_context_;
 };
 template <>
@@ -461,7 +462,7 @@ class OperatorWithKernel : public OperatorBase {
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const final {
    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(ExecutionContext(*this, scope, &dev_ctx));
+    opKernel->Compute(ExecutionContext(*this, scope, dev_ctx));
  }
  static std::unordered_map<std::string /* op_type */, OpKernelMap>&

--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
@@ -52,7 +52,7 @@ public:
    int outputHeight = output[2];
    int outputWidth = output[3];
    int filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
+    CHECK_EQ(static_cast<size_t>(inputChannels), groups_);
    // only support strideH() == strideW() and filterHeight == filterWidth.
    CHECK_EQ(strideH(), strideW());

--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -22,9 +22,12 @@ limitations under the License. */
 #include <type_traits>
 #include "paddle/parameter/Argument.h"
 #include "paddle/utils/ClassRegistrar.h"
 #include "paddle/utils/Logging.h"
+#ifdef PADDLE_USE_MKLDNN
+#include "MKLDNNActivation.h"
+#endif
 namespace paddle {
 static ClassRegistrar<ActivationFunction> gActivationRegistrar;
@@ -456,6 +459,12 @@ Error __must_check backward(Argument& act) {
 END_DEFINE_ACTIVATION(log)
 ActivationFunction* ActivationFunction::create(const std::string& type) {
+#ifdef PADDLE_USE_MKLDNN
+  if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) {
+    return MKLDNNActivation::create(type);
+  }
+#endif
  return gActivationRegistrar.createByType(type);
 }

--- a/paddle/gserver/activations/MKLDNNActivation.cpp
+++ b/paddle/gserver/activations/MKLDNNActivation.cpp
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "MKLDNNActivation.h"
+#include "mkldnn.hpp"
+#include "paddle/utils/ClassRegistrar.h"
+namespace paddle {
+static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
+/**
+ * @def MKLDNN_ACTIVATION_CLASS_NAME
+ * @brief Build the class identifier of an MKLDNN activation by pasting
+ * ACT_TYPE between the "mkldnn_" prefix and the "Activation" suffix.
+ * @note MKLDNN_ACTIVATION_CLASS_NAME(relu) relu_;
+ * means mkldnn_reluActivation relu_;
+ */
+#define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation
+/**
+ * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION
+ * @brief Define and register one eltwise activation class.
+ *
+ * Expands to a class derived from MKLDNNEltwiseActivation whose getName(),
+ * getAlpha() and getBwdAlpha() return class-level constants, followed by a
+ * static InitFunction that registers the class in gMKLDNNActivationRegistrar
+ * under the name "mkldnn_<ACT_TYPE>".
+ *
+ * @param ACT_TYPE  bare activation name (e.g. relu); it is pasted into the
+ *                  class name and stringized into the registered name
+ * @param ALPHA     value returned by getAlpha()
+ * @param BWD_ALPHA value returned by getBwdAlpha()
+ */
+#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(ACT_TYPE, ALPHA, BWD_ALPHA)        \
+  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)                              \
+      : public MKLDNNEltwiseActivation {                                    \
+  private:                                                                  \
+    static const std::string name;                                          \
+    static const float alpha;                                               \
+    static const float bwdAlpha;                                            \
+                                                                            \
+  public:                                                                   \
+    const std::string& getName() const { return name; }                     \
+    float getAlpha() const { return alpha; }                                \
+    float getBwdAlpha() const { return bwdAlpha; }                          \
+  };                                                                        \
+  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name =          \
+      "mkldnn_" #ACT_TYPE;                                                  \
+  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA;        \
+  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA; \
+  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {              \
+    gMKLDNNActivationRegistrar                                              \
+        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(             \
+            "mkldnn_" #ACT_TYPE);                                           \
+  });
+/**
+ * @brief MKLDNN Relu Activation.
+ * Actually mkldnn_relu is Leaky Relu.
+ *  f(x) = x                   (x >= 0)
+ *  f(x) = negative_slope * x  (x <  0)
+ * @note the negative_slope should be -0.f in forward
+ * @note macro arguments are (type, forward alpha, backward alpha), so the
+ * forward pass uses -0.f and the backward pass uses 0.f
+ */
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, -0.f, 0.f)
+/**
+ * @brief MKLDNN Tanh Activation.
+ * Registered as "mkldnn_tanh" with alpha 0.f in both passes.
+ */
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, 0.f, 0.f)
+/**
+ * @brief MKLDNN ELU(Exponential Linear Unit) Activation.
+ *  f(x) = x                              (x >= 0)
+ *  f(x) = negative_slope * (exp(x) - 1)  (x <  0)
+ * NOTE(review): alpha is 0.f in both passes; if alpha is the negative_slope
+ * of the formula above, the negative branch evaluates to 0 - confirm that
+ * this is intended.
+ */
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, 0.f, 0.f)
+/**
+ * Look up @p type in the MKLDNN activation registrar and return whatever
+ * the registrar creates for it.
+ */
+ActivationFunction* MKLDNNActivation::create(const std::string& type) {
+  ActivationFunction* activation =
+      gMKLDNNActivationRegistrar.createByType(type);
+  return activation;
+}
+/**
+ * Collect the names of all types known to the MKLDNN activation registrar,
+ * in the order in which the registrar enumerates them.
+ */
+std::vector<std::string> MKLDNNActivation::getAllRegisteredTypes() {
+  std::vector<std::string> registered;
+  auto collect = [&registered](const std::string& name) {
+    registered.push_back(name);
+  };
+  gMKLDNNActivationRegistrar.forEachType(collect);
+  return registered;
+}
+}  // namespace paddle
--- a/paddle/gserver/activations/MKLDNNActivation.h
+++ b/paddle/gserver/activations/MKLDNNActivation.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "ActivationFunction.h"
+#include "mkldnn.hpp"
+#include "paddle/gserver/layers/MKLDNNBase.h"
+#include "paddle/math/MKLDNNMatrix.h"
+#include "paddle/parameter/Argument.h"
+namespace paddle {
+/**
+ * @brief Base class of MKLDNN Activation.
+ * Common activation functions are provided,
+ * including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax
+ *
+ * Holds the state shared by MKLDNN activations (cached element count,
+ * value/grad matrices, stream, primitives and pipelines) and declares the
+ * registrar lookups create() and getAllRegisteredTypes().
+ */
+class MKLDNNActivation : public ActivationFunction {
+protected:
+  // input value element count
+  size_t cnt_;
+  // should not merge the resetBwd into resetFwd,
+  // because the grad data would be changing before backward.
+  bool needResetBwd_;
+  // mkldnn matrix, primitive, stream and pipeline
+  MKLDNNMatrixPtr val_;
+  MKLDNNMatrixPtr grad_;
+  std::shared_ptr<MKLDNNStream> stream_;
+  std::shared_ptr<mkldnn::primitive> fwd_;
+  std::shared_ptr<mkldnn::primitive> bwd_;
+  std::vector<mkldnn::primitive> pipelineFwd_;
+  std::vector<mkldnn::primitive> pipelineBwd_;
+public:
+  // starts with no cached element count and a pending backward reset
+  MKLDNNActivation() : cnt_(0), needResetBwd_(true) {}
+  ~MKLDNNActivation() {}
+  // create the activation registered under the given type name
+  static ActivationFunction* create(const std::string& type);
+  // list the type names of all registered MKLDNN activations
+  static std::vector<std::string> getAllRegisteredTypes();
+  // name of the concrete activation
+  virtual const std::string& getName() const = 0;
+  // forward and backward passes, to be implemented by subclasses
+  virtual Error __must_check forward(Argument& act) = 0;
+  virtual Error __must_check backward(Argument& act) = 0;
+};
+/**
+ * @brief Base class of MKLDNN Eltwise Activation,
+ * includes mkldnn_relu, mkldnn_elu and mkldnn_tanh.
+ *
+ * forward() submits an eltwise primitive that works in place on act.value;
+ * backward() submits the matching backward primitive in place on act.grad.
+ * The primitives are (re)built lazily by resetFwd() and resetBwd().
+ */
+class MKLDNNEltwiseActivation : public MKLDNNActivation {
+  typedef mkldnn::eltwise_forward eltwise_fwd;
+  typedef mkldnn::eltwise_backward eltwise_bwd;
+protected:
+  // save the forward primitive desc, which can be used backward
+  std::shared_ptr<eltwise_fwd::primitive_desc> fwdPD_;
+  // eltwise_bwd need src input value
+  MKLDNNMatrixPtr inVal_;
+  // use for copy data
+  std::shared_ptr<mkldnn::reorder> copyInVal_;
+public:
+  MKLDNNEltwiseActivation() {}
+  ~MKLDNNEltwiseActivation() {}
+  virtual const std::string& getName() const = 0;
+  // in common, the alpha of forward and backward should be equal.
+  // but for relu, to avoid negative value, they should be opposite
+  virtual float getAlpha() const = 0;
+  virtual float getBwdAlpha() const = 0;
+  virtual float getBeta() const { return 0.f; }
+  /**
+   * map an activation type name to the mkldnn eltwise algorithm,
+   * logs a fatal error for any other name
+   */
+  virtual mkldnn::algorithm getAlgo(const std::string& type) const {
+    if (type == "mkldnn_relu") {
+      return mkldnn::algorithm::eltwise_relu;
+    } else if (type == "mkldnn_tanh") {
+      return mkldnn::algorithm::eltwise_tanh;
+    } else if (type == "mkldnn_elu") {
+      return mkldnn::algorithm::eltwise_elu;
+    } else {
+      LOG(FATAL) << "Unknown eltwise activation type: " << type;
+    }
+    // not expected to be reached after LOG(FATAL); keeps a return on all paths
+    return static_cast<mkldnn::algorithm>(0);
+  }
+  /**
+   * reshape and reset the forward primitives,
+   * does nothing when the element count equals the cached one
+   */
+  void resetFwd(Argument& act) {
+    if (cnt_ == act.value->getElementCnt()) {
+      return;
+    }
+    cnt_ = act.value->getElementCnt();
+    stream_.reset(new MKLDNNStream());
+    auto eng = CPUEngine::Instance().getEngine();
+    // get algo setting
+    mkldnn::algorithm algo = getAlgo(this->getName());
+    // note: alpha represents the NegativeSlope when used in relu.
+    float alpha = getAlpha();
+    float beta = getBeta();
+    /// forward
+    pipelineFwd_.clear();
+    val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
+    if (val_ == nullptr) {
+      // not an MKLDNNMatrix yet: wrap the value as nchw, deriving the
+      // channel count from the element count and the other dimensions
+      int bs = act.getBatchSize();
+      int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
+      int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
+      int ic = cnt_ / bs / ih / iw;
+      CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
+      val_ = MKLDNNMatrix::create(
+          act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, eng);
+      CHECK(val_);
+    }
+    auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
+                                     algo,
+                                     val_->getMemoryDesc(),
+                                     alpha,
+                                     beta);
+    fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, eng));
+    // use inplace for forward but save input value before submit
+    inVal_ = val_;
+    if (act.grad) {
+      // only copy when need do backward
+      inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
+      copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
+      CHECK(copyInVal_) << "should not be empty";
+      pipelineFwd_.push_back(*copyInVal_);
+    }
+    fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
+    pipelineFwd_.push_back(*fwd_);
+    needResetBwd_ = true;
+  }
+  /**
+   * reset the backward primitives, can not merge into resetFwd as the grad data
+   * would be changing before backward.
+   */
+  void resetBwd(Argument& act) {
+    if (!needResetBwd_) {
+      return;
+    }
+    // val_ and fwdPD_ are only created by resetFwd(); fail with a clear
+    // message instead of dereferencing null pointers below
+    CHECK(val_ && fwdPD_) << "forward should be called before backward";
+    needResetBwd_ = false;
+    mkldnn::algorithm algo = getAlgo(this->getName());
+    float alpha = getBwdAlpha();
+    float beta = getBeta();
+    grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
+    auto eng = CPUEngine::Instance().getEngine();
+    auto bwdDesc = eltwise_bwd::desc(
+        algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
+    auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
+    CHECK(inVal_);
+    bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
+    pipelineBwd_.clear();
+    pipelineBwd_.push_back(*bwd_);
+  }
+  /**
+   * run the forward pipeline on act.value, rebuilding it first if needed
+   */
+  Error __must_check forward(Argument& act) {
+    resetFwd(act);
+    // resetFwd() returns early, leaving stream_ unset, when the element count
+    // equals the cached cnt_ (e.g. a first call with an empty value)
+    CHECK(stream_) << "forward primitives have not been created";
+    stream_->submit(pipelineFwd_);
+    return Error();
+  }
+  /**
+   * run the backward pipeline on act.grad, rebuilding it first if needed
+   */
+  Error __must_check backward(Argument& act) {
+    resetBwd(act);
+    // stream_ is only created by resetFwd()
+    CHECK(stream_) << "forward should be called before backward";
+    stream_->submit(pipelineBwd_);
+    return Error();
+  }
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/CudnnPoolLayer.cpp
+++ b/paddle/gserver/layers/CudnnPoolLayer.cpp
@@ -29,9 +29,9 @@ bool CudnnPoolLayer::typeCheck(const std::string &poolType,
    if (mode) {
      *mode = HL_POOLING_AVERAGE;
    }
-  } else if (poolType == "cudnn-avg-excl-pad-pool") {
+  } else if (poolType == "cudnn-avg-incl-pad-pool") {
    if (mode) {
-      *mode = HL_POOLING_AVERAGE_EXCLUDE_PADDING;
+      *mode = HL_POOLING_AVERAGE_INCLUDE_PADDING;
    }
  } else {
    return false;

--- a/paddle/gserver/layers/DetectionOutputLayer.cpp
+++ b/paddle/gserver/layers/DetectionOutputLayer.cpp
@@ -143,7 +143,7 @@ void DetectionOutputLayer::forward(PassType passType) {
    resetOutput(numKept, 7);
  } else {
    MatrixPtr outV = getOutputValue();
-    outV = NULL;
+    if (outV) outV->resize(0, 0);
    return;
  }
  MatrixPtr outV = getOutputValue();

--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -294,12 +294,9 @@ void MKLDNNConvLayer::resetOutValue(
    std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& out) {
  out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc());
-  // change original output value from cpu matrix to mkldnn matrix
-  output_.value = std::dynamic_pointer_cast<Matrix>(out);
  // create reorder if output value has cpu device and pd do not match
  cpuOutVal_ = nullptr;
-  cpuOutVal_ = nullptr;
+  cvtOutVal_ = nullptr;
  if (!outputIsOnlyMKLDNN()) {
    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
    memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};

--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -172,12 +172,10 @@ void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
 void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
  out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
-  // change original output value to mkldnn output value
-  output_.value = std::dynamic_pointer_cast<Matrix>(out);
  if (!outputIsOnlyMKLDNN()) {
    // fc cpu output value do not need create convert
    // just share point
-    getOutput(CPU_DEVICE).value->setData(output_.value->getData());
+    getOutput(CPU_DEVICE).value->setData(out->getData());
  }
 }

--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -119,6 +119,10 @@ public:
        inputElemenCnt_ = elemenCnt;
        reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
        resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
+        if (outVal_) {
+          // change original output value to mkldnn output value
+          output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
+        }
        convertWeightsFromPaddle();
        needResetBwd_ = true;
      }

--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "MKLDNNPoolLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/utils/Logging.h"
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+namespace paddle {
+REGISTER_LAYER(mkldnn_pool, MKLDNNPoolLayer);
+/**
+ * Read the pooling geometry (channels, input/output/filter sizes, padding
+ * and stride) from the pool config of the single input and select the
+ * MKLDNN pooling algorithm.
+ * @return false if MKLDNNLayer::init fails, true otherwise
+ */
+bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  /* the size of inputs for pool-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+  const PoolConfig& conf = config_.inputs(0).pool_conf();
+  ic_ = conf.channels();
+  ih_ = conf.img_size_y();
+  iw_ = conf.img_size();
+  // pooling keeps the channel count
+  oc_ = ic_;
+  oh_ = conf.output_y();
+  ow_ = conf.output_x();
+  fh_ = conf.size_y();
+  fw_ = conf.size_x();
+  ph_ = conf.padding_y();
+  pw_ = conf.padding();
+  sh_ = conf.stride_y();
+  sw_ = conf.stride();
+  const std::string& type = conf.pool_type();
+  if (type == "max-projection") {
+    poolAlgo_ = algorithm::pooling_max;
+  } else if (type == "avg-projection") {
+    // paddle only use exclude_padding
+    poolAlgo_ = algorithm::pooling_avg_exclude_padding;
+  } else {
+    // report the offending type so that a bad config is easy to locate
+    LOG(FATAL) << "Unknown pooling type: " << type;
+  }
+  return true;
+}
+/**
+ * Refresh batch size and input height/width, verify that the channel count
+ * is unchanged, then recompute the output height/width and resize the
+ * output accordingly.
+ */
+void MKLDNNPoolLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+  // neither ic nor oc may change: the channel count implied by the input
+  // element count has to match ic
+  const auto channels = inputElemenCnt_ / bs / ih / iw;
+  CHECK_EQ(channels, (size_t)ic) << "Input channel can not be changed";
+  // output sizes; paddle pooling uses caffeMode == false
+  const bool caffeMode = false;
+  oh = outputSize(ih, fh_, ph_, sh_, caffeMode);
+  ow = outputSize(iw, fw_, pw_, sw_, caffeMode);
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+  printSizeInfo();
+}
+/**
+ * Rebuild the forward pass: buffers first, then the primitive descriptor
+ * (stored in fwdPD_), then the pipeline. wgt and bias are not used by this
+ * function.
+ */
+void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
+                               MKLDNNMatrixPtr& in,
+                               MKLDNNMatrixPtr& wgt,
+                               MKLDNNMatrixPtr& bias,
+                               MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(in, out);
+  resetFwdPD(fwdPD_, in, out);
+  resetFwdPipeline(pipeline, fwdPD_, in, out);
+  printValueFormatFlow();
+}
+/**
+ * Rebuild the backward pass: gradient buffers, then a local backward
+ * primitive descriptor, then the pipeline. wgt and bias are not used by
+ * this function.
+ */
+void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
+                               MKLDNNMatrixPtr& in,
+                               MKLDNNMatrixPtr& wgt,
+                               MKLDNNMatrixPtr& bias,
+                               MKLDNNMatrixPtr& out) {
+  resetBwdBuffers(in, out);
+  // the backward descriptor is only needed while building the pipeline
+  std::shared_ptr<pool_bwd::primitive_desc> bwdPD;
+  resetBwdPD(bwdPD, in, out);
+  resetBwdPipeline(pipeline, bwdPD, in, out);
+  printGradFormatFlow();
+}
+/**
+ * Point inVal_ at the data of the CPU-device value of input 0.
+ */
+void MKLDNNPoolLayer::updateInputData() {
+  const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
+  inVal_->setData(cpuIn->getData());
+}
+/**
+ * Reset the forward buffers. The input goes first: resetOutValue() checks
+ * inVal_ and reads its format.
+ */
+void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                      MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+  resetOutValue(out);
+}
+/**
+ * Set @p in to the MKLDNN view of input 0: wrap the CPU value as nchw when
+ * the input is not MKLDNN-only, otherwise use the input value directly,
+ * which then has to be an MKLDNNMatrix.
+ */
+void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {
+  if (!inputIsOnlyMKLDNN()) {
+    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
+    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
+    in = MKLDNNMatrix::create(
+        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
+    return;
+  }
+  in = std::dynamic_pointer_cast<MKLDNNMatrix>(getInputValue(0));
+  CHECK(in) << "Input should be MKLDNNMatrix";
+}
+/**
+ * Create the MKLDNN output value in the same format as inVal_. When the
+ * output also has a CPU device, either create a reorder into the nchw CPU
+ * output or, if the primitive descs already match, let the CPU output share
+ * the MKLDNN output data.
+ */
+void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
+  CHECK(inVal_) << "Should reset input value first";
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  out = MKLDNNMatrix::create(
+      output_.value, outDims, inVal_->getFormat(), engine_);
+  // create reorder if output value has cpu device and pd do not match
+  cpuOutVal_ = nullptr;
+  cvtOutVal_ = nullptr;
+  if (!outputIsOnlyMKLDNN()) {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
+    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
+    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
+      CHECK(cvtOutVal_) << "should not be empty";
+    } else {
+      // CPU output share the same data of MKLDNN output
+      cpuOut->setData(out->getData());
+      cpuOutVal_ = out;
+    }
+  }
+}
+/**
+ * Build the forward pooling primitive descriptor from the memory descs of
+ * @p in and @p out and the configured kernel, stride and padding, and
+ * prepare the workspace memory when max pooling runs outside PASS_TEST.
+ */
+void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
+                                 MKLDNNMatrixPtr in,
+                                 MKLDNNMatrixPtr out) {
+  memory::dims kernels = memory::dims{fh_, fw_};
+  memory::dims strides = memory::dims{sh_, sw_};
+  memory::dims padL = memory::dims{ph_, pw_};
+  memory::dims padR = getPaddingR();
+  padding_kind padKind = padding_kind::zero;
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  auto fwdDesc = pool_fwd::desc(pk,
+                                poolAlgo_,
+                                in->getMemoryDesc(),
+                                out->getMemoryDesc(),
+                                strides,
+                                kernels,
+                                padL,
+                                padR,
+                                padKind);
+  pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_));
+  // prepare workspace if necessary
+  workspace_ =
+      (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max)
+          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
+          : nullptr;
+}
+/**
+ * Rebuild the forward pipeline: the pooling primitive (with the workspace
+ * when one exists), followed by the output reorder when one was created.
+ */
+void MKLDNNPoolLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<pool_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+  if (workspace_) {
+    fwd_ = std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_));
+  } else {
+    fwd_ = std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
+  }
+  pipeline.push_back(*fwd_);
+  // convert to the CPU output only when a reorder is required
+  if (cvtOutVal_) {
+    pipeline.push_back(*cvtOutVal_);
+  }
+}
+/**
+ * Reset the backward buffers: the output gradient first, then the input
+ * gradient (which may be left null, see resetInGrad()).
+ */
+void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                      MKLDNNMatrixPtr& out) {
+  resetOutGrad(out);
+  resetInGrad(in);
+}
+/**
+ * Create the MKLDNN output gradient with the primitive desc of outVal_.
+ * When the output also has a CPU device, either create a reorder from the
+ * nchw CPU gradient or, if the primitive descs already match, share the
+ * data of the CPU gradient.
+ */
+void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
+  CHECK(outVal_) << "Should have output value";
+  out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
+  // create reorder if output grad has cpu device and pd do not match
+  cpuOutGrad_ = nullptr;
+  cvtOutGrad_ = nullptr;
+  if (!outputIsOnlyMKLDNN()) {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    cpuOutGrad_ = MKLDNNMatrix::create(
+        cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
+    if (cpuOutGrad_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
+      CHECK(cvtOutGrad_) << "should not be empty";
+    } else {
+      // share the same data of CPU output
+      output_.grad->setData(cpuOut->getData());
+      out = cpuOutGrad_;
+    }
+  }
+}
+/**
+ * Set @p in to an MKLDNN matrix over the gradient of input layer 0, using
+ * the primitive desc of inVal_. @p in stays null when that layer has no
+ * gradient.
+ */
+void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
+  in = nullptr;
+  const MatrixPtr& prevGrad = inputLayers_[0]->getOutput().grad;
+  if (prevGrad != nullptr) {
+    CHECK(inVal_);
+    in = MKLDNNMatrix::create(prevGrad, inVal_->getPrimitiveDesc());
+  }
+}
+/**
+ * Build the backward pooling primitive descriptor from the memory descs of
+ * the input and output gradients, hinted by the saved forward descriptor.
+ * NOTE(review): resetInGrad() leaves `in` null when the input layer has no
+ * gradient, in which case CHECK(in) aborts here - confirm this is intended.
+ */
+void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
+                                 MKLDNNMatrixPtr& in,
+                                 MKLDNNMatrixPtr& out) {
+  // both gradients are required to describe the backward primitive
+  CHECK(in);
+  CHECK(out);
+  const memory::dims kernels{fh_, fw_};
+  const memory::dims strides{sh_, sw_};
+  const memory::dims padL{ph_, pw_};
+  const memory::dims padR = getPaddingR();
+  auto bwdDesc = pool_bwd::desc(poolAlgo_,
+                                in->getMemoryDesc(),
+                                out->getMemoryDesc(),
+                                strides,
+                                kernels,
+                                padL,
+                                padR,
+                                padding_kind::zero);
+  pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+}
+/**
+ * Rebuild the backward pipeline: the output-gradient reorder when one was
+ * created, followed by the pooling backward primitive (with the workspace
+ * when one exists).
+ */
+void MKLDNNPoolLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<pool_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+  // convert the CPU gradient first when a reorder is required
+  if (cvtOutGrad_) {
+    pipeline.push_back(*cvtOutGrad_);
+  }
+  if (workspace_) {
+    bwdData_ =
+        std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *workspace_, *in));
+  } else {
+    bwdData_ = std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *in));
+  }
+  pipeline.push_back(*bwdData_);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+namespace paddle {
+typedef mkldnn::pooling_forward pool_fwd;
+typedef mkldnn::pooling_backward pool_bwd;
+/**
+ * @brief A subclass of MKLDNNLayer pool layer.
+ *
+ * The config file api is mkldnn_pool
+ */
+class MKLDNNPoolLayer : public MKLDNNLayer {
+protected:
+  // padding height and width
+  int ph_, pw_;
+  // stride height and width
+  int sh_, sw_;
+  // filter(kernel) height and width
+  int fh_, fw_;
+  // pooling_avg or pooling_max
+  mkldnn::algorithm poolAlgo_;
+  // MKLDNNMatrixPtr which should be created from CPU Device
+  MKLDNNMatrixPtr cpuOutVal_;
+  MKLDNNMatrixPtr cpuOutGrad_;
+  // convert handle between CPU device and MKLDNN device
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+  // save forward primitive_desc, which can be used backward
+  std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
+  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_pooling_forward.cpp, pool need workspace for backward
+  std::shared_ptr<mkldnn::memory> workspace_;
+public:
+  explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+  ~MKLDNNPoolLayer() {}
+  // read the pooling geometry and algorithm from the layer config
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  // recompute input/output sizes and resize the output
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+  // rebuild forward buffers, primitive descriptor and pipeline
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+  // rebuild backward buffers, primitive descriptor and pipeline
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+  void updateInputData() override;
+  // log filter, padding and stride sizes after the base-class sizes
+  void printSizeInfo() override {
+    MKLDNNLayer::printSizeInfo();
+    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
+                       << ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", sw: " << sw_;
+  }
+protected:
+  /**
+   * Forward functions: reset buffers(input, output),
+   *                    reset primitive descriptor,
+   *                    reset pipeline.
+   */
+  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetInValue(MKLDNNMatrixPtr& in);
+  void resetOutValue(MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<pool_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+  /**
+   * Backward functions: reset buffers(input, output),
+   *                     reset primitive descriptor,
+   *                     reset pipeline.
+   */
+  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetOutGrad(MKLDNNMatrixPtr& out);
+  void resetInGrad(MKLDNNMatrixPtr& in);
+  void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<pool_bwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+  /**
+   * get padding_r according to
+   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+   * test_pooling_forward.cpp
+   *
+   * Starts from {ph_, pw_} and grows the bottom/right padding one step at a
+   * time until the output size computed from it reaches oh_/ow_. The loops
+   * run until the condition holds rather than a fixed two rounds, because
+   * closing the gap can take more than two steps when the stride is 4 or
+   * larger. Whenever two steps are enough the result is unchanged.
+   */
+  mkldnn::memory::dims getPaddingR() const {
+    mkldnn::memory::dims padR = {ph_, pw_};
+    // each step raises the quotient by at most one, so the loop stops
+    // exactly when the computed output height equals oh_
+    while ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) {
+      ++padR[0];
+    }
+    // same for the width
+    while ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) {
+      ++padR[1];
+    }
+    return padR;
+  }
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/SequenceSliceLayer.cpp
+++ b/paddle/gserver/layers/SequenceSliceLayer.cpp
@@ -73,9 +73,10 @@ void SequenceSliceLayer::checkInputs() {
  CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer "
                           << "must be a sequence.";
  const MatrixPtr indices1 = getInputValue(1);
-  CHECK_EQ(static_cast<size_t>(indices1->getHeight()),
+  CHECK_EQ(
-           inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
+      indices1->getHeight(),
-                                : inputSeq.getNumSequences())
+      static_cast<size_t>(inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
+                                               : inputSeq.getNumSequences()))
      << "Height of the second input should be equal to number of sequence "
      << "in the first input.";
  if (inputLayers_.size() == 3) {
@@ -151,7 +152,7 @@ void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
        if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k);
        int seqLen = endPos - begPos + 1;
-        CHECK_GT(seqLen, 0U);
+        CHECK_GT(seqLen, 0);
        for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
        hasSubseq
            ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)

--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -64,15 +64,17 @@ void MKLDNNTester::reset(const TestConfig& dnn,
        configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i]));
  }
  refLayer_ = testLayers_[REF];
-  dnnLayer_ = std::dynamic_pointer_cast<MKLDNNLayer>(testLayers_[DNN]);
+  dnnLayer_ = testLayers_[DNN];
-  CHECK(dnnLayer_);
-  // for comparison with Paddle reference results,
-  // need manually add cpu device output for test
-  dnnLayer_->addOutputArgument(CPU_DEVICE);
  EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size());
  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
  setInputImgSize();
+  // for comparison with Paddle reference results,
+  // need manually add cpu device output for test
+  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
+  if (dnnLayer) {
+    dnnLayer->addOutputArgument(CPU_DEVICE);
+  }
 }
 void MKLDNNTester::setInputImgSize() {
@@ -122,7 +124,7 @@ void MKLDNNTester::randomTopDiffs() {
 void MKLDNNTester::checkForward() {
  VLOG(MKLDNN_ALL) << "Check Forward";
  printTopDatas();
-  double delta = compareMatrix(dnnLayer_->getOutput(-1).value,
+  double delta = compareMatrix(dnnLayer_->getOutput(CPU_DEVICE).value,
                               refLayer_->getOutputValue());
  EXPECT_LE(fabs(delta), eps_);
 }
@@ -155,7 +157,10 @@ void MKLDNNTester::checkBackwardWgts() {
  vector<VectorPtr> dnnWgts;  // used to temply save mkldnn weights
  saveWgt(parameters_[DNN], dnnWgts);
-  dnnLayer_->convertWeightsToPaddle();
+  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
+  if (dnnLayer) {
+    dnnLayer->convertWeightsToPaddle();
+  }
  for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
    const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
    const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
@@ -322,6 +327,10 @@ void MKLDNNTester::runOnce() {
  // and clearTopDatas(REF) should be coverd by ref layers
  clearBotDiffs(REF);
  clearWgtDiffs(REF);
+  // it is necessary to clear bottom diffs when only activation is dnn type
+  if (configs_[DNN].layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) {
+    clearBotDiffs(DNN);
+  }
 }
 void MKLDNNTester::run(const TestConfig& dnn,
@@ -333,8 +342,19 @@ void MKLDNNTester::run(const TestConfig& dnn,
                       float epsilon,
                       bool log,
                       int level) {
-  VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: " << dnn.layerConfig.type()
+  CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
-                     << " vs " << ref.layerConfig.type();
+        dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
+      << "should be MKLDNN layer or MKLDNN activation";
+  if (dnn.layerConfig.type() == ref.layerConfig.type()) {
+    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
+                       << dnn.layerConfig.active_type() << " vs "
+                       << ref.layerConfig.active_type();
+  } else {
+    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
+                       << dnn.layerConfig.type() << " vs "
+                       << ref.layerConfig.type();
+  }
  ih_ = inputImgH;
  iw_ = inputImgW;
  iter_ = iter;

--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -41,8 +41,7 @@ protected:
  vector<LayerMap> layerMaps_;
  vector<vector<ParameterPtr>> parameters_;
  vector<LayerPtr> testLayers_;
-  LayerPtr refLayer_;
+  LayerPtr refLayer_, dnnLayer_;
-  MKLDNNLayerPtr dnnLayer_;
  /// run some iterations, all the result should pass
  size_t iter_;

--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <vector>
 #include "MKLDNNTester.h"
 #include "ModelConfig.pb.h"
+#include "paddle/gserver/activations/MKLDNNActivation.h"
 #include "paddle/math/MathUtils.h"
 using namespace paddle;  // NOLINT
@@ -141,6 +142,110 @@ TEST(MKLDNNLayer, ConvLayer) {
  testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1});
 }
+// Geometry of one pooling test case. The field order matches the positional
+// brace-initializers passed to testPoolLayer().
+struct testPoolDesc {
+  int bs, ch;  // input channel and output channel are the same
+  int ih, iw;  // input image height / width
+  int oh, ow;  // expected output height / width (verified in testPoolLayer)
+  int fh, fw;  // pooling window height / width
+  int ph, pw;  // padding along height / width
+  int sh, sw;  // stride along height / width
+};
+// Compares an "mkldnn_pool" layer against the reference "pool" layer for the
+// geometry in `pm`, for both max and average pooling and for batch sizes
+// pm.bs and 1.
+void testPoolLayer(const testPoolDesc& pm) {
+  const std::string dnnType = "mkldnn_pool";
+  const std::string refType = "pool";
+  const int outputLen = pm.ch * pm.oh * pm.ow;
+  const int inputLen = pm.ch * pm.ih * pm.iw;
+  TestConfig cfg;
+  cfg.layerConfig.set_type(dnnType);
+  cfg.layerConfig.set_size(outputLen);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", /* size of input layer= */ size_t(inputLen), 0});
+  // Describe the pooling geometry on the layer's single input.
+  PoolConfig* pool = cfg.layerConfig.add_inputs()->mutable_pool_conf();
+  pool->set_channels(pm.ch);
+  pool->set_img_size(pm.iw);
+  pool->set_img_size_y(pm.ih);
+  pool->set_output_x(pm.ow);
+  pool->set_output_y(pm.oh);
+  pool->set_size_x(pm.fw);
+  pool->set_size_y(pm.fh);
+  pool->set_padding(pm.pw);
+  pool->set_padding_y(pm.ph);
+  pool->set_stride(pm.sw);
+  pool->set_stride_y(pm.sh);
+  // The output size given in the descriptor has to agree with outputSize().
+  int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false);
+  int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false);
+  CHECK_EQ(ow, pm.ow) << "output size check failed";
+  CHECK_EQ(oh, pm.oh) << "output size check failed";
+  MKLDNNTester tester;
+  const char* poolTypes[] = {"max-projection", "avg-projection"};
+  const int batchSizes[] = {pm.bs, 1};
+  for (const char* poolType : poolTypes) {
+    pool->set_pool_type(poolType);
+    // The reference config is a copy of the DNN config with only the layer
+    // type swapped.
+    TestConfig ref = cfg;
+    ref.layerConfig.set_type(refType);
+    for (int bs : batchSizes) {
+      tester.run(cfg, ref, bs, pm.ih, pm.iw);
+    }
+  }
+}
+// Pooling cases. The oh/ow entries must equal what outputSize() computes for
+// the given input, window, padding and stride; testPoolLayer() CHECKs this.
+TEST(MKLDNNLayer, PoolLayer) {
+  /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw*/
+  testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2});
+  testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2});
+  testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2});
+  testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2});
+  testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2});
+  testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1});
+  testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1});
+  testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2});
+}
+// Input description of one activation test case. The layer size is
+// ch * ih * iw; bs, ih and iw are also forwarded to MKLDNNTester::run().
+struct testActDesc {
+  int bs, ch;
+  int ih, iw;
+};
+// Fills `cfg` with a bias-free "addto" layer that has a single data input
+// named "layer_0"; input and output both hold ch * ih * iw elements.
+static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
+  const int layerSize = pm.ch * pm.ih * pm.iw;
+  cfg.layerConfig.set_type("addto");
+  cfg.layerConfig.set_size(layerSize);
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", /* size of input layer= */ size_t(layerSize), 0});
+  cfg.biasSize = 0;
+}
+// Compares an "addto" layer using the MKLDNN activation `type` against the
+// same layer using the reference activation, whose name is `type` with its
+// first 7 characters (the "mkldnn_" prefix) removed. Runs with batch sizes
+// pm.bs and 1.
+void testActivation(const std::string& type, const testActDesc& pm) {
+  // Strip the prefix on a local copy so the caller's string is not modified.
+  std::string refType = type;
+  refType.erase(0, 7);
+  const std::string compareTypes[] = {type, refType};
+  TestConfig cfg;
+  getAddtoConfig(cfg, pm);
+  // Both configs share the same layer; only the activation type differs.
+  TestConfig ref = cfg;
+  cfg.layerConfig.set_active_type(compareTypes[0]);
+  ref.layerConfig.set_active_type(compareTypes[1]);
+  MKLDNNTester tester;
+  for (auto bs : {pm.bs, 1}) {
+    tester.run(cfg, ref, bs, pm.ih, pm.iw);
+  }
+}
+// Runs testActivation() for every registered MKLDNN activation type except
+// the ones listed in `skipped`.
+TEST(MKLDNNActivation, Activations) {
+  // TODO(TJ): mkldnn_softmax not implemented, paddle do not have elu activation
+  std::set<string> skipped{"mkldnn_softmax", "mkldnn_elu"};
+  auto types = MKLDNNActivation::getAllRegisteredTypes();
+  // Iterate by value so every call receives its own string object.
+  for (auto type : types) {
+    if (skipped.count(type) == 0) {
+      testActivation(type, {16, 64, 32, 32});
+    }
+  }
+}
 // TODO(TJ): add branch test
 int main(int argc, char** argv) {

--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cmath>
 #include "BaseMatrix.h"
 #include "MathFunctions.h"
+#include "NEONFunctions.h"
 #include "SIMDFunctions.h"
 #include "hl_matrix_apply.cuh"
 #include "hl_matrix_base.cuh"
@@ -666,6 +667,13 @@ void BaseMatrixT<T>::relu(BaseMatrixT& b) {
  applyBinary(binary::Relu<T>(), b);
 }
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+// Explicit float specialization used when NEON is available: hands the raw
+// buffers of this matrix (input) and b (output) to neon::relu.
+// NOTE(review): height_ * width_ is passed as one flat length, so this
+// presumably assumes both matrices are stored without row padding - confirm.
+template <>
+void BaseMatrixT<float>::relu(BaseMatrixT& b) {
+  neon::relu(data_, b.data_, height_ * width_);
+}
+#endif
 DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
 template <class T>
 void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {

--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -26,7 +26,7 @@ limitations under the License. */
 #include <mkl_lapacke.h>
 #endif
-#ifdef PADDLE_USE_ATLAS
+#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB)
 extern "C" {
 #include <cblas.h>
 #include <clapack.h>

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
--- a/paddle/math/NEONFunctions.cpp
+++ b/paddle/math/NEONFunctions.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include "NEONFunctions.h"
+#include <arm_neon.h>
+namespace paddle {
+namespace neon {
+// b[i] = a[i] > 0.0f ? a[i] : 0.0f
+void relu(const float* a, float* b, int len) {
+  // Vector part: 16 floats per iteration, handled as four 4-lane registers.
+  const int kBlock = 16;
+  const int numBlocks = len / kBlock;
+  const int tail = len % kBlock;
+  const float32x4_t zeros = vdupq_n_f32(0.f);
+  for (int blk = 0; blk < numBlocks; ++blk) {
+    // All four loads are issued before any store.
+    const float32x4_t in0 = vld1q_f32(a);
+    const float32x4_t in1 = vld1q_f32(a + 4);
+    const float32x4_t in2 = vld1q_f32(a + 8);
+    const float32x4_t in3 = vld1q_f32(a + 12);
+    const float32x4_t out0 = vmaxq_f32(in0, zeros);
+    const float32x4_t out1 = vmaxq_f32(in1, zeros);
+    const float32x4_t out2 = vmaxq_f32(in2, zeros);
+    const float32x4_t out3 = vmaxq_f32(in3, zeros);
+    vst1q_f32(b, out0);
+    vst1q_f32(b + 4, out1);
+    vst1q_f32(b + 8, out2);
+    vst1q_f32(b + 12, out3);
+    a += kBlock;
+    b += kBlock;
+  }
+  // Scalar part: the remaining len % 16 elements; a and b already point just
+  // past the vectorized range.
+  for (int i = 0; i < tail; ++i) {
+    b[i] = a[i] > 0.0f ? a[i] : 0.0f;
+  }
+}
+}  // namespace neon
+}  // namespace paddle
+#endif
--- a/paddle/math/NEONFunctions.h
+++ b/paddle/math/NEONFunctions.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+namespace paddle {
+namespace neon {
+// Element-wise ReLU over `len` floats: reads from `a`, writes to `b`.
+// The declaration is unconditional; the definition in NEONFunctions.cpp is
+// compiled only when __ARM_NEON__ or __ARM_NEON is defined.
+void relu(const float* a, float* b, int len);
+}  // namespace neon
+}  // namespace paddle
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -825,9 +825,8 @@ void testMaxPoolFwdBwd(int numSamples,
                       int strideW,
                       int padH,
                       int padW) {
-  int outH = 0, outW = 0;
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-  outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
  int inWidth = imgSizeH * imgSizeW * channels;
  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
@@ -927,9 +926,8 @@ void testAvgPoolFwdBwd(int numSamples,
                       int strideW,
                       int padH,
                       int padW) {
-  int outH = 0, outW = 0;
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-  outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
  int inWidth = imgSizeH * imgSizeW * channels;
  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);

--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -12,26 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <thrust/execution_policy.h>
+#include <thrust/reduce.h>
 #include "paddle/operators/accuracy_op.h"
+#include "paddle/platform/cuda_helper.h"
 namespace paddle {
 namespace operators {
+using platform::PADDLE_CUDA_NUM_THREADS;
-__global__ void AccuracySingleKernel(const int N, const int D, const int top_k,
+template <int BlockSize>
-                                     const int* Xdata, const int* labelData,
+__global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
-                                     float* accuracy) {
+                                   const int* labeldata, float* accuracy) {
-  int correct = 0;
+  int count = 0;
-  for (int row = 0; row < N; row++) {
+  __shared__ int total[BlockSize];
-    const int label = labelData[row];
-    for (int col = 0; col < D; col++) {
+  // support only 1 block
-      const int pred = Xdata[row * D + col];
+  for (int i = threadIdx.x; i < (N); i += BlockSize) {
-      if (pred == label) {
+    for (int j = 0; j < D; ++j) {
-        ++correct;
+      if (Xdata[i * D + j] == labeldata[i]) {
+        ++count;
        break;
      }
    }
  }
-  *accuracy = static_cast<float>(correct) / static_cast<float>(N);
+  total[threadIdx.x] = count;
+  __syncthreads();
+  // reduce the count with init value 0, and output accuracy.
+  int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
+  if (threadIdx.x == 0) {
+    *accuracy = static_cast<float>(result) / static_cast<float>(N);
+  }
 }
 template <typename T>
@@ -57,8 +69,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
      return;
    }
-    AccuracySingleKernel<<<1, 1>>>(num_samples, infer_width, 1, inference_data,
+    AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS>>>(
-                                   label_data, accuracy_data);
+        num_samples, infer_width, inference_data, label_data, accuracy_data);
  }
 };

--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/cross_entropy_op.h"
+namespace paddle {
+namespace operators {
+using framework::LoDTensor;
+// Forward cross-entropy operator. InferShape validates the inputs X and Label
+// and the soft_label attribute, then resizes the output Y to [X.dims()[0], 1].
+class CrossEntropyOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
+                            "Input(Label) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) must not be null.");
+    auto x = ctx.Input<Tensor>("X");
+    auto label = ctx.Input<Tensor>("Label");
+    // Both inputs must be rank-2 tensors.
+    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
+    PADDLE_ENFORCE_EQ(label->dims().size(), 2,
+                      "Input(Label)'s rank must be 2.");
+    // TODO(xinghai-sun): remove this check after switching to bool
+    PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
+                   ctx.Attr<int>("soft_label") == 1);
+    PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
+                      "The 1st dimension of Input(X) and Input(Label) must "
+                      "be equal.");
+    // Soft labels need one value per class (same width as X); hard labels
+    // need a single column holding the class index.
+    if (ctx.Attr<int>("soft_label") == 1) {
+      PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
+                        "If Attr(soft_label) == 1, The 2nd dimension of "
+                        "Input(X) and Input(Label) must be equal.");
+    } else {
+      PADDLE_ENFORCE_EQ(label->dims()[1], 1,
+                        "If Attr(soft_label) == 0, The 2nd dimension of "
+                        "Input(Label) must be 1.");
+    }
+    // One loss value per row of X.
+    ctx.Output<LoDTensor>("Y")->Resize({x->dims()[0], 1});
+  }
+};
+// Gradient operator of cross_entropy. InferShape validates X, Label and
+// Y@GRAD and the soft_label attribute, then resizes X@GRAD to the shape of X.
+class CrossEntropyGradientOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
+                            "Input(Label) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")),
+                            "Input(Y@GRAD) must not be null.");
+    auto x = ctx.Input<Tensor>("X");
+    auto label = ctx.Input<Tensor>("Label");
+    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    // All three inputs must be rank-2 tensors.
+    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
+    PADDLE_ENFORCE_EQ(dy->dims().size(), 2, "Input(Y@Grad)'s rank must be 2.");
+    PADDLE_ENFORCE_EQ(label->dims().size(), 2,
+                      "Input(Label)'s rank must be 2.");
+    // TODO(xinghai-sun): remove this check after switching to bool
+    PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
+                   ctx.Attr<int>("soft_label") == 1);
+    // X, Label and Y@GRAD must agree on the first dimension, and Y@GRAD must
+    // have exactly one column.
+    PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
+                      "The 1st dimension of Input(X) and Input(Label) must "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x->dims()[0], dy->dims()[0],
+                      "The 1st dimension of Input(X) and Input(Y@Grad) must "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(dy->dims()[1], 1,
+                      "The 2nd dimension of Input(Y@Grad) must be 1.");
+    // Soft labels need one value per class (same width as X); hard labels
+    // need a single column holding the class index.
+    if (ctx.Attr<int>("soft_label") == 1) {
+      PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
+                        "If Attr(soft_label) == 1, The 2nd dimension of "
+                        "Input(X) and Input(Label) must be equal.");
+    } else {
+      PADDLE_ENFORCE_EQ(label->dims()[1], 1,
+                        "If Attr(soft_label) == 0, The 2nd dimension of "
+                        "Input(Label) must be 1.");
+    }
+    // The gradient w.r.t. X has the same shape as X.
+    auto dx = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    dx->Resize(x->dims());
+  }
+};
+class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CrossEntropyOpMaker(framework::OpProto *proto,
+                      framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of CrossEntropyOp");
+    AddInput("Label", "The second input of CrossEntropyOp");
+    AddOutput("Y", "The output of CrossEntropyOp");
+    AddAttr<int>("soft_label", "Is soft label. Default zero.").SetDefault(0);
+    AddComment(R"DOC(
+CrossEntropy Operator.
+It supports both standard cross-entropy and soft-label cross-entropy loss
+computation.
+1) One-hot cross-entropy:
+    soft_label = 0, Label[i, 0] indicates the class index for sample i:
+                Y[i] = -log(X[i, Label[i]])
+2) Soft-label cross-entropy:
+    soft_label = 1, Label[i, j] indicates the soft label of class j
+    for sample i:
+                Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
+   Please make sure that in this case the summuation of each row of Label
+   equals one.
+3) One-hot cross-entropy with vecterized Input(Label):
+     As a special case of 2), when each row of Input(Label) has only one
+     non-zero element (equals 1), soft-label cross-entropy degenerates to a
+     one-hot cross-entropy with one-hot label representation.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
+            cross_entropy_grad, ops::CrossEntropyGradientOp);
+REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
+                       ops::CrossEntropyGradientOpKernel<float>);
--- a/paddle/operators/onehot_cross_entropy_op.cu
+++ b/paddle/operators/onehot_cross_entropy_op.cu
@@ -13,27 +13,13 @@
   limitations under the License. */
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/cross_entropy_op.h"
 #include "paddle/platform/assert.h"
+#include "paddle/platform/hostdevice.h"
 namespace paddle {
 namespace operators {
-using Tensor = framework::Tensor;
-template <typename T>
-__host__ __device__ T clipping_log(const T x) {
-  PADDLE_ASSERT(std::is_floating_point<T>::value);
-  const T kApproInf = 1e20;
-  T v = log(x);
-  if (v == INFINITY) {
-    return kApproInf;
-  }
-  if (v == -INFINITY) {
-    return -kApproInf;
-  }
-  return v;
-}
 template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
                                   const int N, const int D) {
@@ -42,7 +28,20 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += blockDim.x * gridDim.x) {
    PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
-    Y[i] = -clipping_log(X[i * D + label[i]]);
+    Y[i] = -tolerable_value(log(X[i * D + label[i]]));
+  }
+}
+template <typename T>
+__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
+                                       const int N, const int D) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    T sum = static_cast<T>(0);
+    for (int j = 0; j < D; j++) {
+      sum += label[i * D + j] * tolerable_value(log(X[i * D + j]));
+    }
+    Y[i] = -sum;
  }
 }
@@ -69,57 +68,84 @@ __global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
 }
 template <typename T>
-class OnehotCrossEntropyOpCUDAKernel : public framework::OpKernel {
+__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
+                                               const T* label, const int N,
+                                               const int D) {
+  // TODO(qingqing): optimize for this kernel
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    for (int j = 0; j < D; ++j) {
+      int idx = i * D + j;
+      dX[idx] = -label[idx] * dY[i] / X[idx];
+    }
+  }
+}
+template <typename T>
+class CrossEntropyOpCUDAKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use GPUPlace.");
-    auto X = ctx.Input<Tensor>("X");
+    auto x = ctx.Input<Tensor>("X");
-    const T* Xdata = X->data<T>();
+    auto y = ctx.Output<Tensor>("Y");
-    const int* label_data = ctx.Input<Tensor>("label")->data<int>();
+    auto label = ctx.Input<Tensor>("Label");
-    auto Y = ctx.Output<Tensor>("Y");
-    Y->mutable_data<T>(ctx.GetPlace());
-    T* Ydata = Y->data<T>();
-    int N = X->dims()[0];
+    auto* x_data = x->data<T>();
-    int D = X->dims()[1];
+    y->mutable_data<T>(ctx.GetPlace());
+    auto* y_data = y->data<T>();
+    int n = x->dims()[0];
+    int d = x->dims()[1];
    int block = 512;
-    int grid = (N + block - 1) / block;
+    int grid = (n + block - 1) / block;
    // TODO(qingqing) launch kernel on specified stream
    // base on ExecutionContext.
-    CrossEntropyKernel<T><<<grid, block>>>(Ydata, Xdata, label_data, N, D);
+    if (ctx.Attr<int>("soft_label") == 1) {
+      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
+      SoftCrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n,
+                                                 d);
+    } else {
+      auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
+      CrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n, d);
+    }
  }
 };
 template <typename T>
-class OnehotCrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
+class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use GPUPlace.");
-    auto X = ctx.Input<Tensor>("X");
+    auto x = ctx.Input<Tensor>("X");
-    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto label = ctx.Input<Tensor>("label");
+    auto label = ctx.Input<Tensor>("Label");
-    auto* dXdata = dX->template mutable_data<T>(ctx.GetPlace());
+    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    auto* dYdata = dY->template data<T>();
+    auto* dy_data = dy->data<T>();
-    auto* Xdata = X->template data<T>();
+    auto* x_data = x->data<T>();
-    auto* label_data = label->data<int>();
-    int N = X->dims()[0];
+    int n = x->dims()[0];
-    int D = X->dims()[1];
+    int d = x->dims()[1];
    int block = 512;
-    int grid = (N * D + block - 1) / block;
+    int grid = (n * d + block - 1) / block;
-    zero<T><<<grid, block>>>(dXdata, N * D);
+    zero<T><<<grid, block>>>(dx_data, n * d);
+    grid = (n + block - 1) / block;
-    grid = (N + block - 1) / block;
    // TODO(qingqing): launch kernel on specified stream
    // base on ExecutionContext.
-    CrossEntropyGradientKernel<T><<<grid, block>>>(dXdata, dYdata, Xdata,
+    if (ctx.Attr<int>("soft_label") == 1) {
-                                                   label_data, N, D);
+      auto* label_data = label->data<T>();
+      SoftCrossEntropyGradientKernel<T><<<grid, block>>>(
+          dx_data, dy_data, x_data, label_data, n, d);
+    } else {
+      auto* label_data = label->data<int>();
+      CrossEntropyGradientKernel<T><<<grid, block>>>(dx_data, dy_data, x_data,
+                                                     label_data, n, d);
+    }
  }
 };
@@ -127,7 +153,6 @@ class OnehotCrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
+REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>);
-                       ops::OnehotCrossEntropyOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(cross_entropy_grad,
-REGISTER_OP_GPU_KERNEL(onehot_cross_entropy_grad,
+                       ops::CrossEntropyGradientOpCUDAKernel<float>);
-                       ops::OnehotCrossEntropyGradientOpCUDAKernel<float>);
--- a/paddle/operators/onehot_cross_entropy_op.h
+++ b/paddle/operators/onehot_cross_entropy_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/hostdevice.h"
 namespace paddle {
 namespace operators {
@@ -21,75 +22,93 @@ namespace operators {
 using Tensor = framework::Tensor;
 template <typename T>
-inline T tolerable_value(const T x) {
+HOSTDEVICE T tolerable_value(const T x) {
-  static_assert(std::is_floating_point<T>::value,
+  PADDLE_ASSERT(std::is_floating_point<T>::value);
-                "tolerable_value works only on float, "
-                "double and double double.");
  const T kApproInf = 1e20;
  if (x == INFINITY) {
    return kApproInf;
  }
  if (x == -INFINITY) {
    return -kApproInf;
  }
  return x;
 }
 template <typename T>
-class OnehotCrossEntropyOpKernel : public framework::OpKernel {
+class CrossEntropyOpKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");
-    auto X = ctx.Input<Tensor>("X");
+    auto x = ctx.Input<Tensor>("X");
-    const T* Xdata = X->data<T>();
+    auto y = ctx.Output<Tensor>("Y");
-    const int* label_data = ctx.Input<Tensor>("label")->data<int>();
-    auto Y = ctx.Output<Tensor>("Y");
+    auto* x_data = x->data<T>();
+    y->mutable_data<T>(ctx.GetPlace());
-    Y->mutable_data<T>(ctx.GetPlace());
+    auto* y_data = y->data<T>();
-    T* Ydata = Y->data<T>();
+    int batch_size = x->dims()[0];
+    int class_num = x->dims()[1];
-    int batch_size = X->dims()[0];
-    int class_num = X->dims()[1];
+    if (ctx.Attr<int>("soft_label") == 1) {
+      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
-    for (int i = 0; i < batch_size; ++i) {
+      int index = 0;
-      int index = i * class_num + label_data[i];
+      for (int i = 0; i < batch_size; ++i) {
-      Ydata[i] = -tolerable_value(std::log(Xdata[index]));
+        T sum = static_cast<T>(0);
+        for (int j = 0; j < class_num; ++j) {
+          sum += label_data[index] * tolerable_value(std::log(x_data[index]));
+          y_data[i] = -sum;
+          index++;
+        }
+      }
+    } else {
+      auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
+      for (int i = 0; i < batch_size; ++i) {
+        int index = i * class_num + label_data[i];
+        y_data[i] = -tolerable_value(std::log(x_data[index]));
+      }
    }
  }
 };
 template <typename T>
-class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel {
+class CrossEntropyGradientOpKernel : public framework::OpKernel {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");
-    auto X = ctx.Input<Tensor>("X");
+    auto x = ctx.Input<Tensor>("X");
-    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto label = ctx.Input<Tensor>("label");
+    auto label = ctx.Input<Tensor>("Label");
-    auto* dXdata = dX->template mutable_data<T>(ctx.GetPlace());
+    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    auto* dYdata = dY->template data<T>();
+    auto* dy_data = dy->data<T>();
-    auto* Xdata = X->template data<T>();
+    auto* x_data = x->data<T>();
-    auto* label_data = label->data<int>();
-    const int batch_size = X->dims()[0];
+    int batch_size = x->dims()[0];
-    const int class_num = X->dims()[1];
+    int class_num = x->dims()[1];
    // TODO(qingqing): make zero setting an common function.
-    memset(dXdata, 0, sizeof(T) * batch_size * class_num);
+    if (ctx.Attr<int>("soft_label") == 1) {
-    for (int i = 0; i < batch_size; ++i) {
+      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
-      int index = i * class_num + label_data[i];
+      int index = 0;
-      dXdata[index] = -tolerable_value(dYdata[i] / Xdata[index]);
+      for (int i = 0; i < batch_size; ++i) {
+        for (int j = 0; j < class_num; ++j) {
+          dx_data[index] = -label_data[index] * dy_data[i] / x_data[index];
+          index++;
+        }
+      }
+    } else {
+      auto* label_data = label->data<int>();
+      memset(dx_data, 0, sizeof(T) * batch_size * class_num);
+      for (int i = 0; i < batch_size; ++i) {
+        PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
+        int index = i * class_num + label_data[i];
+        dx_data[index] = -dy_data[i] / x_data[index];
+      }
    }
  }
 };

--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/dropout_op.h"
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+using framework::LoDTensor;
+// Forward dropout operator. InferShape validates the attributes and gives
+// Out the shape of X; Mask is resized as well, but only when
+// is_training == 1.
+class DropoutOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    // dropout_prob must lie in [0, 1].
+    // NOTE(review): the attribute is read as float here, while
+    // DropoutOpMaker registers it with its AttrType template parameter -
+    // confirm the op is only instantiated with AttrType = float.
+    PADDLE_ENFORCE_GE(ctx.Attr<float>("dropout_prob"), 0);
+    PADDLE_ENFORCE_LE(ctx.Attr<float>("dropout_prob"), 1);
+    // TODO(xinghai-sun): remove this check after switching to bool
+    PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
+                   ctx.Attr<int>("is_training") == 1);
+    auto dims = ctx.Input<Tensor>("X")->dims();
+    ctx.Output<LoDTensor>("Out")->Resize(dims);
+    // Mask is only shaped in training mode.
+    if (ctx.Attr<int>("is_training") == 1) {
+      ctx.Output<LoDTensor>("Mask")->Resize(dims);
+    }
+  }
+};
+// Declares the proto of the dropout operator: its attributes, its single
+// input, its outputs and the user-facing description.
+// AttrType is the C++ type used for the "dropout_prob" attribute.
+template <typename AttrType>
+class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  DropoutOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // Drop probability; defaults to 0.5.
+    AddAttr<AttrType>("dropout_prob", "Probability of setting units to zero.")
+        .SetDefault(.5f);
+    // TODO(xinghai-sun): use bool for is_training after bool is supported.
+    // Until then is_training is an int flag (default 1).
+    AddAttr<int>("is_training", "Whether in training phase.").SetDefault(1);
+    // Seed for the mask generation (default 0).
+    AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
+    AddInput("X", "The input of dropout op.");
+    AddOutput("Out", "The output of dropout op.");
+    // Mask is flagged as an intermediate output.
+    AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();
+    AddComment(R"DOC(
+Dropout Operator.
+"Dropout" refers to randomly dropping out units in a nerual network. It is a
+regularization technique for reducing overfitting by preventing neuron
+co-adaption during training. The dropout operator randomly set (according to
+the given dropout probability) the outputs of some units to zero, while others
+being set to their inputs.
+)DOC");
+  }
+};
+// Gradient operator of dropout. InferShape verifies that the op runs in
+// training mode, that X, Mask and Out@GRAD are present and share the same
+// dims, and then sizes X@GRAD like X.
+// AttrType is the C++ type of the "dropout_prob" attribute.
+template <typename AttrType>
+class DropoutOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    // The backward pass is only defined for training.
+    PADDLE_ENFORCE_EQ(ctx.Attr<int>("is_training"), 1,
+                      "GradOp is only callable when is_training is true");
+    // All three inputs must exist before their dims are read below.
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Mask"), "Mask must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) must not be null.");
+    // dropout_prob must lie in [0, 1].
+    PADDLE_ENFORCE_GE(ctx.Attr<AttrType>("dropout_prob"), 0);
+    PADDLE_ENFORCE_LE(ctx.Attr<AttrType>("dropout_prob"), 1);
+    // TODO(xinghai-sun): remove this check after switching to bool
+    PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
+                   ctx.Attr<int>("is_training") == 1);
+    // Gather every shape first, then compare them against X.
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
+    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
+    auto mask_dims = ctx.Input<Tensor>("Mask")->dims();
+    PADDLE_ENFORCE_EQ(x_dims, out_dims,
+                      "Dimensions of Input(X) and Out@Grad must be the same.");
+    PADDLE_ENFORCE_EQ(x_dims, mask_dims,
+                      "Dimensions of Input(X) and Mask must be the same.");
+    ctx.Output<LoDTensor>(framework::GradVarName("X"))->Resize(x_dims);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the "dropout" operator together with "dropout_grad", both using
+// float for the dropout_prob attribute.
+REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
+            ops::DropoutOpGrad<float>);
+// CPU kernels for the forward and backward ops, instantiated for float data.
+REGISTER_OP_CPU_KERNEL(
+    dropout, ops::CPUDropoutKernel<paddle::platform::CPUPlace, float, float>);
+REGISTER_OP_CPU_KERNEL(
+    dropout_grad, ops::DropoutGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#define EIGEN_USE_GPU
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include "paddle/operators/dropout_op.h"
+namespace paddle {
+namespace operators {
+// Functor for thrust::transform producing the dropout mask entry of element
+// n: 0 when the sample drawn for that element is below dropout_prob, 1
+// otherwise.
+template <typename T, typename AttrType>
+struct MaskGenerator {
+  AttrType dropout_prob;
+  int seed;
+  __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed)
+      : dropout_prob(dropout_prob), seed(seed) {}
+  __host__ __device__ T operator()(const unsigned int n) const {
+    // Each call builds its own engine from the shared seed and skips n draws,
+    // so the sample used depends only on (seed, n).
+    thrust::minstd_rand rng;
+    rng.seed(seed);
+    rng.discard(n);
+    thrust::uniform_real_distribution<AttrType> dist(0, 1);
+    const bool dropped = dist(rng) < dropout_prob;
+    return dropped ? static_cast<T>(0) : static_cast<T>(1);
+  }
+};
+// GPU forward kernel of the dropout operator.
+//
+// It seems that Eigen::Tensor::setRandom on GPU will SEGFAULT, so the mask is
+// generated with thrust::random through MaskGenerator instead.
+//
+// Training (is_training == 1): fills Mask with 0/1 values produced by
+// MaskGenerator from the "seed" attribute and computes Out = X * Mask.
+// Inference: no mask is produced and Out = X * (1 - dropout_prob). Units are
+// kept with probability (1 - dropout_prob) and the training output is not
+// rescaled, so this matches the expected value of the training output.
+template <typename Place, typename T, typename AttrType>
+class GPUDropoutKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Output<Tensor>("Out");
+    // Allocate the output before wrapping it in an Eigen view.
+    y->mutable_data<T>(context.GetPlace());
+    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
+    auto X = EigenMatrix<T>::Reshape(*x, 1);
+    auto Y = EigenMatrix<T>::Reshape(*y, 1);
+    auto place = context.GetEigenDevice<Place>();
+    if (context.Attr<int>("is_training") == 1) {
+      auto* mask = context.Output<Tensor>("Mask");
+      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
+      int size = framework::product(mask->dims());
+      int seed = context.Attr<int>("seed");
+      // One mask value per element index in [0, size).
+      thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+      thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                        thrust::device_ptr<T>(mask_data),
+                        MaskGenerator<T, AttrType>(dropout_prob, seed));
+      auto M = EigenMatrix<T>::Reshape(*mask, 1);
+      Y.device(place) = X * M;
+    } else {
+      // Scale by the keep probability, not by the drop probability.
+      Y.device(place) = X * (static_cast<AttrType>(1) - dropout_prob);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// GPU kernels for the dropout forward and backward ops, instantiated for
+// float data (and float dropout_prob for the forward kernel).
+REGISTER_OP_GPU_KERNEL(
+    dropout, ops::GPUDropoutKernel<paddle::platform::GPUPlace, float, float>);
+REGISTER_OP_GPU_KERNEL(
+    dropout_grad, ops::DropoutGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <random>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+// CPU forward kernel of the dropout operator.
+//
+// Training (is_training == 1): draws one uniform sample per element from a
+// std::minstd_rand engine seeded with the "seed" attribute. Elements whose
+// sample is below dropout_prob get Mask = 0 and Out = 0; all others get
+// Mask = 1 and Out = X.
+// Inference: no mask is produced and Out = X * (1 - dropout_prob). Units are
+// kept with probability (1 - dropout_prob) and the training output is not
+// rescaled, so this matches the expected value of the training output.
+template <typename Place, typename T, typename AttrType>
+class CPUDropoutKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Output<Tensor>("Out");
+    const auto* x_data = x->data<T>();
+    auto* y_data = y->mutable_data<T>(context.GetPlace());
+    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
+    if (context.Attr<int>("is_training") == 1) {
+      auto* mask = context.Output<Tensor>("Mask");
+      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
+      int seed = context.Attr<int>("seed");
+      std::minstd_rand engine;
+      engine.seed(seed);
+      std::uniform_real_distribution<AttrType> dist(0, 1);
+      size_t size = framework::product(mask->dims());
+      for (size_t i = 0; i < size; ++i) {
+        // Drop the unit when its sample falls below dropout_prob.
+        if (dist(engine) < dropout_prob) {
+          mask_data[i] = 0;
+          y_data[i] = 0;
+        } else {
+          mask_data[i] = 1;
+          y_data[i] = x_data[i];
+        }
+      }
+    } else {
+      auto X = EigenMatrix<T>::Reshape(*x, 1);
+      auto Y = EigenMatrix<T>::Reshape(*y, 1);
+      auto place = context.GetEigenDevice<Place>();
+      // Scale by the keep probability, not by the drop probability.
+      Y.device(place) = X * (static_cast<AttrType>(1) - dropout_prob);
+    }
+  }
+};
+// Backward kernel of the dropout operator, shared by the CPU and GPU
+// registrations: X@GRAD = Out@GRAD * Mask, element-wise.
+template <typename Place, typename T>
+class DropoutGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE_EQ(context.Attr<int>("is_training"), 1,
+                      "GradOp is only callable when is_training is true");
+    // Inputs: the mask saved by the forward pass and the upstream gradient.
+    auto* mask = context.Input<Tensor>("Mask");
+    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    // Output: allocate it before taking an Eigen view of its memory.
+    auto* in_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    in_grad->mutable_data<T>(context.GetPlace());
+    auto mask_mat = EigenMatrix<T>::Reshape(*mask, 1);
+    auto dy_mat = EigenMatrix<T>::Reshape(*out_grad, 1);
+    auto dx_mat = EigenMatrix<T>::Reshape(*in_grad, 1);
+    auto device = context.GetEigenDevice<Place>();
+    dx_mat.device(device) = dy_mat * mask_mat;
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/net_op.h"
+namespace paddle {
+namespace operators {
+// Fully connected operator implemented as a NetOp: the constructor assembles
+// a chain of existing operators instead of providing its own kernel.
+// The chain is:
+//   mul (one per X[i], W[i] pair) -> sum (only if more than one pair)
+//   -> rowwise_add (only if B is set) -> activation.
+class FCOp : public NetOp {
+ public:
+  FCOp(const std::string &type, const framework::VariableNameMap &inputs,
+       const framework::VariableNameMap &outputs,
+       const framework::AttributeMap &attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    // X, W, MulOut and Out are mandatory.
+    PADDLE_ENFORCE(!Inputs("X").empty(),
+                   "Inputs(X) of FCOp should not be null.");
+    PADDLE_ENFORCE(!Inputs("W").empty(),
+                   "Inputs(W) of FCOp should not be null.");
+    PADDLE_ENFORCE(!Outputs("MulOut").empty(),
+                   "Outputs(MulOut) of FCOp should not be null.");
+    PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
+                      "Output(Out) of FCOp should not be null.");
+    auto x = Inputs("X");
+    auto w = Inputs("W");
+    auto mul_out = Outputs("MulOut");
+    // X, W and MulOut are parallel lists: one weight and one intermediate
+    // product per input.
+    PADDLE_ENFORCE_EQ(
+        x.size(), w.size(),
+        "The size of inputs X(%d) should be the same as that of weights W(%d).",
+        x.size(), w.size());
+    PADDLE_ENFORCE_EQ(mul_out.size(), x.size(),
+                      "The size of intermediate mul_out(%d) should be the same "
+                      "as that of inputs X(%d).",
+                      mul_out.size(), x.size());
+    size_t n = x.size();
+    PADDLE_ENFORCE_GE(n, static_cast<size_t>(1),
+                      "The size of inputs X(%d) should be no less than 1.", n);
+    auto x_num_col_dims = Attr<std::vector<int>>("xNumColDims");
+    // Set all values or set no values (use the default value)
+    if (!x_num_col_dims.empty()) {
+      PADDLE_ENFORCE_EQ(x_num_col_dims.size(), n,
+                        "The size of attribute xNumColDims(%d) should be the "
+                        "same as that of inputs X(%d).",
+                        x_num_col_dims.size(), n);
+    } else {
+      // Attribute left empty: use 1 for every input.
+      x_num_col_dims.resize(n);
+      for (size_t i = 0; i < n; i++) {
+        x_num_col_dims[i] = 1;
+      }
+    }
+    // mul_out[i] = X[i] * W[i]
+    for (size_t i = 0; i < n; i++) {
+      framework::AttributeMap mul_attr;
+      mul_attr["x_num_col_dims"] = static_cast<int>(x_num_col_dims[i]);
+      mul_attr["y_num_col_dims"] = static_cast<int>(1);
+      AppendOp(
+          framework::OpRegistry::CreateOp("mul", {{"X", {x[i]}}, {"Y", {w[i]}}},
+                                          {{"Out", {mul_out[i]}}}, mul_attr));
+    }
+    // sum_out = X[0] * W[0] + ... + X[n-1] * W[n-1]
+    // With a single input no sum op is needed and mul_out[0] is used directly.
+    auto sum_out = mul_out[0];
+    if (n > 1) {
+      PADDLE_ENFORCE_NE(Output("SumOut"), framework::kEmptyVarName,
+                        "Output(SumOut) of FCOp should not be null when the "
+                        "size of Inputs(X) > 1.");
+      sum_out = Output("SumOut");
+      AppendOp(framework::OpRegistry::CreateOp("sum", {{"X", {mul_out}}},
+                                               {{"Out", {sum_out}}}, {}));
+    } else {
+      // SumOut is not produced in this case, so a supplied name is replaced
+      // by the empty variable name.
+      if (Output("SumOut") != framework::kEmptyVarName) {
+        this->Rename(Output("SumOut"), framework::kEmptyVarName);
+      }
+    }
+    // add_out = sum_out + b
+    // The bias is optional; without it add_out stays equal to sum_out.
+    auto b = Input("B");
+    auto add_out = sum_out;
+    if (b != framework::kEmptyVarName) {
+      PADDLE_ENFORCE_NE(
+          Output("AddOut"), framework::kEmptyVarName,
+          "Output(AddOut) of FCOp should not be null when Input(B) is set.");
+      add_out = Output("AddOut");
+      AppendOp(framework::OpRegistry::CreateOp(
+          "rowwise_add", {{"X", {sum_out}}, {"b", {Input("B")}}},
+          {{"Out", {add_out}}}, {}));
+    } else {
+      // AddOut is not produced in this case, so a supplied name is replaced
+      // by the empty variable name.
+      if (Output("AddOut") != framework::kEmptyVarName) {
+        this->Rename(Output("AddOut"), framework::kEmptyVarName);
+      }
+    }
+    // Out = activation(add_out); the op type is taken from the "activation"
+    // attribute and is expected to write to an output named "Y".
+    auto activation = Attr<std::string>("activation");
+    AppendOp(framework::OpRegistry::CreateOp(activation, {{"X", {add_out}}},
+                                             {{"Y", {Output("Out")}}}, {}));
+    // NOTE(review): presumably finalizes the net without recomputing its
+    // inputs/outputs -- confirm against NetOp::CompleteAddOp.
+    CompleteAddOp(false);
+  }
+};
+// Declares the proto of the FC operator: duplicable inputs X and W, the
+// bias input B, the final output Out, the intermediate outputs MulOut,
+// SumOut and AddOut, and the attributes "activation" and "xNumColDims".
+class FCOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FCOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(A vector of Tensors) each input Tensor can be of arbitrary "
+             "dimension, and will be reshaped to a 2-D matrix of size "
+             "(minibatch, number_of_input_features) according to attribute "
+             "xNumColDims.")
+        .AsDuplicable();
+    AddInput("W",
+             "(A vector of Tensors) the weights of FC operator, a "
+             "vector of 2-D matrix of size "
+             "(number_of_input_features, number_of_neurons).")
+        .AsDuplicable();
+    AddInput("B",
+             "(Tensor) the bias of FC operator, a 1-D vector of size "
+             "number_of_neurons.");
+    AddOutput("Out",
+              "(Tensor) the activated output matrix of FC operator, a 2-D "
+              "matrix of size (minibatch, number_of_neurons).");
+    // One intermediate product per (X_i, W_i) pair.
+    AddOutput("MulOut",
+              "(A vector of Tensors) the intermediate outputs of FC operator, "
+              "each Tensor saving the product of X_i * W_i.")
+        .AsIntermediate()
+        .AsDuplicable();
+    AddOutput(
+        "SumOut",
+        "(Tensor) the intermediate output of FC operator, "
+        "saving the sum of the products of X and W, that is sum{X_i * W_i}.")
+        .AsIntermediate();
+    AddOutput("AddOut",
+              "(Tensor) the non-actived output of FC operator, "
+              "saving sum{X_i * W_i} + B.")
+        .AsIntermediate();
+    // The activation is restricted to the enumerated operator names.
+    AddAttr<std::string>(
+        "activation",
+        "(string, default identity) the activation type of FC operator.")
+        .SetDefault("identity")
+        .InEnum({"identity", "sigmoid", "softmax"});
+    // The description follows its own index formulas: the leading
+    // xNumColDims_i dimensions form the rows, the trailing ones the columns.
+    AddAttr<std::vector<int>>(
+        "xNumColDims",
+        "(std::vector<int>) The inputs Tensors of FC operator can be of "
+        "more than 2 dimensions. In that case, each input Tensor `X_i` will be "
+        "reshaped to a 2-D matrix. The matrix's first dimension "
+        "(the length of column) will be the product of `X_i`'s first "
+        "`xNumColDims_i` dimensions, that is "
+        "`X_i.dims[0] x ... x X_i.dims[xNumColDims_i - 1]`. "
+        "The matrix's second dimension (the length of row) will be the product "
+        "of `X_i`'s last `rank - xNumColDims_i` dimensions, that is "
+        "`X_i.dims[xNumColDims_i] x ... x X_i.dims[rank - 1]`)")
+        .SetDefault(std::vector<int>{});
+    AddComment(R"DOC(
+Fully Connected Operator, known as Fully Connected Layer or Inner Product Layer
+in Convolutional Neural Networks. Neurons in a fully connected layer have
+full connections to all activations in the previous layer.
+It computes an inner product of a set of
+learned weights with a matrix multiplication followed by a bias offset
+(optionally).
+Equation:
+  Out = Act(sum_n{X_i * W_i} + B)
+where X_i is Tensor that will be reshaped to a 2-D matrix of size (M x K),
+usually M is the minibatch size and K is the number of input features.
+W_i is a 2-D matrix of size (K x N), where N means the number of neurons
+in the fully connected layer. B is a 1-D vector of size N.
+Thus, the output Out is a 2-D matrix of size (M x N).
+Activation type can be set to `identity` (default), `sigmoid` or `softmax`.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register "fc" without a gradient operator.
+REGISTER_OP_WITHOUT_GRADIENT(fc, ops::FCOp, ops::FCOpMaker);
--- a/paddle/operators/identity_op.cc
+++ b/paddle/operators/identity_op.cc
@@ -27,7 +27,7 @@ class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
                  framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of identity operator.");
-    AddOutput("Out", "The output tensor of identity operator.");
+    AddOutput("Y", "The output tensor of identity operator.");
    AddComment(R"DOC(
 The identity operator is an alias of the scale operator
 with the attribute scale fixed to 1.0.
@@ -44,12 +44,13 @@ class IdentityOp : public NetOp {
      : NetOp(type, inputs, outputs, attrs) {
    PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
                      "Input(X) of IdentityOp should not be null.");
-    PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
+    PADDLE_ENFORCE_NE(Output("Y"), framework::kEmptyVarName,
-                      "Output(Out) of IdentityOp should not be null.");
+                      "Output(Y) of IdentityOp should not be null.");
    AppendOp(framework::OpRegistry::CreateOp(
-        "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
+        "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Y")}}},
        {{"scale", static_cast<AttrType>(1)}}));
+    CompleteAddOp(false);
  }
 };

--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -19,12 +19,13 @@ namespace operators {
 namespace math {
 template <>
-void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
+void gemm<platform::CPUPlace, float>(const platform::DeviceContext& context,
+                                     const CBLAS_TRANSPOSE transA,
                                     const CBLAS_TRANSPOSE transB, const int M,
                                     const int N, const int K,
                                     const float alpha, const float* A,
-                                     const float* B, const float beta, float* C,
+                                     const float* B, const float beta,
-                                     platform::DeviceContext* context) {
+                                     float* C) {
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
@@ -33,13 +34,13 @@ void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
 }
 template <>
-void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
+void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
+                                      const CBLAS_TRANSPOSE transA,
                                      const CBLAS_TRANSPOSE transB, const int M,
                                      const int N, const int K,
                                      const double alpha, const double* A,
                                      const double* B, const double beta,
-                                      double* C,
+                                      double* C) {
-                                      platform::DeviceContext* context) {
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
@@ -48,13 +49,10 @@ void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
 }
 template <>
-void matmul<platform::CPUPlace, float>(const framework::Tensor& matrix_a,
+void matmul<platform::CPUPlace, float>(
-                                       bool trans_a,
+    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
-                                       const framework::Tensor& matrix_b,
+    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
-                                       bool trans_b, float alpha,
+    framework::Tensor* matrix_out, float beta) {
-                                       framework::Tensor* matrix_out,
-                                       float beta,
-                                       platform::DeviceContext* context) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -74,18 +72,15 @@ void matmul<platform::CPUPlace, float>(const framework::Tensor& matrix_a,
  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
  gemm<platform::CPUPlace, float>(
-      transA, transB, M, N, K, alpha, matrix_a.data<float>(),
+      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
-      matrix_b.data<float>(), beta, matrix_out->data<float>(), context);
+      matrix_b.data<float>(), beta, matrix_out->data<float>());
 }
 template <>
-void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
+void matmul<platform::CPUPlace, double>(
-                                        bool trans_a,
+    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
-                                        const framework::Tensor& matrix_b,
+    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
-                                        bool trans_b, double alpha,
+    framework::Tensor* matrix_out, double beta) {
-                                        framework::Tensor* matrix_out,
-                                        double beta,
-                                        platform::DeviceContext* context) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -105,8 +100,8 @@ void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
  gemm<platform::CPUPlace, double>(
-      transA, transB, M, N, K, alpha, matrix_a.data<double>(),
+      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-      matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
+      matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 }  // namespace math

--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -19,12 +19,13 @@ namespace operators {
 namespace math {
 template <>
-void gemm<platform::GPUPlace, float>(const CBLAS_TRANSPOSE transA,
+void gemm<platform::GPUPlace, float>(const platform::DeviceContext& context,
+                                     const CBLAS_TRANSPOSE transA,
                                     const CBLAS_TRANSPOSE transB, const int M,
                                     const int N, const int K,
                                     const float alpha, const float* A,
-                                     const float* B, const float beta, float* C,
+                                     const float* B, const float beta,
-                                     platform::DeviceContext* context) {
+                                     float* C) {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
@@ -35,18 +36,19 @@ void gemm<platform::GPUPlace, float>(const CBLAS_TRANSPOSE transA,
      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  PADDLE_ENFORCE(platform::dynload::cublasSgemm(
-      reinterpret_cast<platform::CUDADeviceContext*>(context)->cublas_handle(),
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
 }
 template <>
-void gemm<platform::GPUPlace, double>(const CBLAS_TRANSPOSE transA,
+void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
+                                      const CBLAS_TRANSPOSE transA,
                                      const CBLAS_TRANSPOSE transB, const int M,
                                      const int N, const int K,
                                      const double alpha, const double* A,
                                      const double* B, const double beta,
-                                      double* C,
+                                      double* C) {
-                                      platform::DeviceContext* context) {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
@@ -56,18 +58,16 @@ void gemm<platform::GPUPlace, double>(const CBLAS_TRANSPOSE transA,
  cublasOperation_t cuTransB =
      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  PADDLE_ENFORCE(platform::dynload::cublasDgemm(
-      reinterpret_cast<platform::CUDADeviceContext*>(context)->cublas_handle(),
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
 }
 template <>
-void matmul<platform::GPUPlace, float>(const framework::Tensor& matrix_a,
+void matmul<platform::GPUPlace, float>(
-                                       bool trans_a,
+    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
-                                       const framework::Tensor& matrix_b,
+    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
-                                       bool trans_b, float alpha,
+    framework::Tensor* matrix_out, float beta) {
-                                       framework::Tensor* matrix_out,
-                                       float beta,
-                                       platform::DeviceContext* context) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -87,18 +87,15 @@ void matmul<platform::GPUPlace, float>(const framework::Tensor& matrix_a,
  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
  gemm<platform::GPUPlace, float>(
-      transA, transB, M, N, K, alpha, matrix_a.data<float>(),
+      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
-      matrix_b.data<float>(), beta, matrix_out->data<float>(), context);
+      matrix_b.data<float>(), beta, matrix_out->data<float>());
 }
 template <>
-void matmul<platform::GPUPlace, double>(const framework::Tensor& matrix_a,
+void matmul<platform::GPUPlace, double>(
-                                        bool trans_a,
+    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
-                                        const framework::Tensor& matrix_b,
+    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
-                                        bool trans_b, double alpha,
+    framework::Tensor* matrix_out, double beta) {
-                                        framework::Tensor* matrix_out,
-                                        double beta,
-                                        platform::DeviceContext* context) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -118,8 +115,8 @@ void matmul<platform::GPUPlace, double>(const framework::Tensor& matrix_a,
  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
  gemm<platform::GPUPlace, double>(
-      transA, transB, M, N, K, alpha, matrix_a.data<double>(),
+      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-      matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
+      matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 }  // namespace math

--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -66,16 +66,16 @@ namespace math {
 // For more detailed info, please refer to
 // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
 template <typename Place, typename T>
-void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
+void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
-          const int M, const int N, const int K, const T alpha, const T* A,
+          const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-          const T* B, const T beta, T* C, platform::DeviceContext* context);
+          const T alpha, const T* A, const T* B, const T beta, T* C);
 // matrix multiply with continuous memory
 template <typename Place, typename T>
-void matmul(const framework::Tensor& matrix_a, bool trans_a,
+void matmul(const platform::DeviceContext& context,
+            const framework::Tensor& matrix_a, bool trans_a,
            const framework::Tensor& matrix_b, bool trans_b, T alpha,
-            framework::Tensor* matrix_out, T beta,
+            framework::Tensor* matrix_out, T beta);
-            platform::DeviceContext* context);
 }  // namespace math
 }  // namespace operators

--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -15,8 +15,7 @@ TEST(math_function, notrans_mul_trans) {
  memcpy(input1_ptr, arr, 6 * sizeof(float));
  auto* gpu_place = new paddle::platform::GPUPlace(0);
-  paddle::platform::DeviceContext* context =
+  paddle::platform::CUDADeviceContext context(*gpu_place);
-      new paddle::platform::CUDADeviceContext(*gpu_place);
  input1_gpu.CopyFrom<float>(input1, *gpu_place);
  input2_gpu.CopyFrom<float>(input1, *gpu_place);
@@ -24,7 +23,7 @@ TEST(math_function, notrans_mul_trans) {
  out_gpu.mutable_data<float>({2, 2}, *gpu_place);
  paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
-      input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0, context);
+      context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
  out.CopyFrom<float>(out_gpu, *cpu_place);
@@ -33,6 +32,7 @@ TEST(math_function, notrans_mul_trans) {
  EXPECT_EQ(out_ptr[1], 14);
  EXPECT_EQ(out_ptr[2], 14);
  EXPECT_EQ(out_ptr[3], 50);
+  delete gpu_place;
 }
 TEST(math_function, trans_mul_notrans) {
@@ -48,8 +48,7 @@ TEST(math_function, trans_mul_notrans) {
  memcpy(input1_ptr, arr, 6 * sizeof(float));
  auto* gpu_place = new paddle::platform::GPUPlace(0);
-  paddle::platform::DeviceContext* context =
+  paddle::platform::CUDADeviceContext context(*gpu_place);
-      new paddle::platform::CUDADeviceContext(*gpu_place);
  input1_gpu.CopyFrom<float>(input1, *gpu_place);
  input2_gpu.CopyFrom<float>(input1, *gpu_place);
@@ -57,7 +56,7 @@ TEST(math_function, trans_mul_notrans) {
  out_gpu.mutable_data<float>({3, 3}, *gpu_place);
  paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
-      input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0, context);
+      context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
  out.CopyFrom<float>(out_gpu, *cpu_place);
@@ -71,5 +70,6 @@ TEST(math_function, trans_mul_notrans) {
  EXPECT_EQ(out_ptr[6], 15);
  EXPECT_EQ(out_ptr[7], 22);
  EXPECT_EQ(out_ptr[8], 29);
+  delete gpu_place;
 }
 #endif
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -71,7 +71,7 @@ class MinusGradOp : public NetOp {
    // x_grad = out_grad
    AppendOp(framework::OpRegistry::CreateOp("identity", {{"X", {out_grad}}},
-                                             {{"Out", {x_grad}}}, {}));
+                                             {{"Y", {x_grad}}}, {}));
    framework::AttributeMap scale_attr;
    scale_attr["scale"] = static_cast<AttrType>(-1);

--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -46,10 +46,8 @@ class MulKernel : public framework::OpKernel {
            : *y;
    z->mutable_data<T>(context.GetPlace());
-    auto* device_context =
+    math::matmul<Place, T>(context.device_context(), x_matrix, false, y_matrix,
-        const_cast<platform::DeviceContext*>(context.device_context_);
+                           false, 1, z, 0);
-    math::matmul<Place, T>(x_matrix, false, y_matrix, false, 1, z, 0,
-                           device_context);
  }
 };
@@ -71,16 +69,14 @@ class MulGradKernel : public framework::OpKernel {
    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    auto* device_context =
-        const_cast<platform::DeviceContext*>(ctx.device_context_);
    if (dx) {
      dx->mutable_data<T>(ctx.GetPlace());
      Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
                                                     *dx, x_num_col_dims)
                                               : *dx;
      // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
-      math::matmul<Place, T>(*dout, false, y_matrix, true, 1, &dx_matrix, 0,
+      math::matmul<Place, T>(ctx.device_context(), *dout, false, y_matrix, true,
-                             device_context);
+                             1, &dx_matrix, 0);
    }
    if (dy) {
      dy->mutable_data<T>(ctx.GetPlace());
@@ -88,8 +84,8 @@ class MulGradKernel : public framework::OpKernel {
                                                     *dy, y_num_col_dims)
                                               : *dy;
      // dy = x' * dout. dy K x N, dout : M x N, x : M x K
-      math::matmul<Place, T>(x_matrix, true, *dout, false, 1, &dy_matrix, 0,
+      math::matmul<Place, T>(ctx.device_context(), x_matrix, true, *dout, false,
-                             device_context);
+                             1, &dy_matrix, 0);
    }
  }
 };

--- a/paddle/operators/onehot_cross_entropy_op.cc
+++ b/paddle/operators/onehot_cross_entropy_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/operators/onehot_cross_entropy_op.h"
-namespace paddle {
-namespace operators {
-class OnehotCrossEntropyOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
- protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.InputVar("X"),
-        "Input(X) of OnehotCrossEntropyOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.InputVar("label"),
-        "Input(label) of OnehotCrossEntropyOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("Y"),
-        "Output(Y) of OnehotCrossEntropyOp should not be null.");
-    auto *X = ctx.Input<Tensor>("X");
-    auto *label = ctx.Input<Tensor>("label");
-    PADDLE_ENFORCE_EQ(X->dims().size(), 2, "X's dimension must be 2.");
-    PADDLE_ENFORCE_EQ(label->dims().size(), 1, "label's dimension must be 1.");
-    PADDLE_ENFORCE_EQ(X->dims()[0], label->dims()[0]);
-    ctx.Output<framework::LoDTensor>("Y")->Resize({X->dims()[0], 1});
-  }
-};
-class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
- protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto dX = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto X = ctx.Input<Tensor>("X");
-    dX->Resize(X->dims());
-  }
-};
-class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  OnehotCrossEntropyOpMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of OnehotCrossEntropyOp");
-    AddInput("label", "The second input of OnehotCrossEntropyOp");
-    AddOutput("Y", "The output of OnehotCrossEntropyOp");
-    AddComment(R"DOC(
-OnehotCrossEntropy Operator.
-                Y[i] = -log(X[i][j])
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp,
-            ops::OnehotCrossEntropyOpMaker, onehot_cross_entropy_grad,
-            ops::OnehotCrossEntropyGradientOp);
-REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
-                       ops::OnehotCrossEntropyOpKernel<float>);
-REGISTER_OP_CPU_KERNEL(onehot_cross_entropy_grad,
-                       ops::OnehotCrossEntropyGradientOpKernel<float>);
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/prelu_op.h"
+#include "paddle/operators/net_op.h"
+namespace paddle {
+namespace operators {
+// Forward PRelu operator. InferShape validates the bound variables and gives
+// Out the same dimensions as X.
+class PReluOp : public framework::OperatorWithKernel {
+ public:
+  PReluOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+ protected:
+  // Checks, in order: X is bound, Alpha is bound, Alpha holds exactly one
+  // element, Out is bound. Then resizes Out to the shape of X.
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Alpha"),
+                            "Input(Alpha) should not be null");
+    // Alpha is a single shared slope, so it must be a one-element tensor.
+    PADDLE_ENFORCE(ctx.Input<framework::Tensor>("Alpha")->numel() == 1,
+                   "Size of weight Alpha must be one.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
+                            "Output(Out) should not be null");
+    const auto *x_tensor = ctx.Input<framework::Tensor>("X");
+    ctx.Output<framework::LoDTensor>("Out")->Resize(x_tensor->dims());
+  }
+};
+// Declares the PRelu operator's interface: inputs X and Alpha, output Out,
+// plus the user-facing description of the activation.
+class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of prelu operator.");
+    AddInput("Alpha", "The alpha weight of PRelu operator.");
+    AddOutput("Out", "The output tensor of PRelu operator.");
+    // The text below is emitted as operator documentation at runtime.
+    AddComment(R"DOC(PRelu operator
+The equation is:
+  f(x) = alpha * x , for x < 0
+  f(x) = x         , for x >= 0
+)DOC");
+  }
+};
+// The operator to calculate gradients of a prelu operator.
+// InferShape gives X@GRAD the shape of X and Alpha@GRAD the shape of Alpha.
+class PReluGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
+    // Alpha is dereferenced below, so it needs the same guard as X.
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Alpha"),
+                            "Input(Alpha) should not be null");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
+                            "Input(Out@GRAD) should not be null");
+    auto *dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto *x = ctx.Input<framework::Tensor>("X");
+    auto *dalpha =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("Alpha"));
+    auto *alpha = ctx.Input<framework::Tensor>("Alpha");
+    // A gradient output may be unbound; only resize the ones that exist
+    // instead of dereferencing a null pointer.
+    if (dx) {
+      dx->Resize(x->dims());
+    }
+    if (dalpha) {
+      dalpha->Resize(alpha->dims());
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the forward op (with its maker) together with its gradient op.
+REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad,
+            ops::PReluGradOp);
+// CPU kernels are registered for float only.
+REGISTER_OP_CPU_KERNEL(prelu,
+                       ops::PReluKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(prelu_grad,
+                       ops::PReluGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/prelu_op.cu
+++ b/paddle/operators/prelu_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/prelu_op.h"
+// GPU counterparts of the prelu kernels, instantiated for float on GPUPlace.
+REGISTER_OP_GPU_KERNEL(
+    prelu, paddle::operators::PReluKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    prelu_grad,
+    paddle::operators::PReluGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/prelu_op.h
+++ b/paddle/operators/prelu_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using platform::Transform;
+// Element-wise PRelu: values greater than zero pass through unchanged, every
+// other value is multiplied by the scalar that `alpha` points to.
+template <typename T>
+class PReluFunctor {
+ public:
+  explicit PReluFunctor(const T* alpha) : alpha_(alpha) {}
+  HOSTDEVICE T operator()(const T& x) const {
+    return x > 0 ? x : x * (*alpha_);
+  }
+ private:
+  // Not owned; read on every call that takes the non-positive branch.
+  const T* alpha_;
+};
+// Forward PRelu kernel: applies PReluFunctor to every element of X and writes
+// the result to Out through platform::Transform on the kernel's device
+// context.
+template <typename Place, typename T>
+class PReluKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* alpha = context.Input<Tensor>("Alpha");
+    auto* out = context.Output<Tensor>("Out");
+    const T* x_ptr = x->data<T>();
+    T* o_ptr = out->mutable_data<T>(context.GetPlace());
+    auto* alpha_ptr = alpha->data<T>();
+    // Keep the element count in the type numel() returns rather than
+    // narrowing it to int before it is used as a pointer offset.
+    const auto numel = x->numel();
+    Transform<Place> trans;
+    trans(context.device_context(), x_ptr, x_ptr + numel, o_ptr,
+          PReluFunctor<T>(alpha_ptr));
+  }
+};
+// Element-wise PRelu gradient: the sign of the first argument selects the
+// branch. When it is greater than zero the incoming gradient `dout` passes
+// through; otherwise `dout` is scaled by the scalar that `alpha` points to.
+template <typename T>
+class PReluGradFunctor {
+ public:
+  explicit PReluGradFunctor(const T* alpha) : alpha_(alpha) {}
+  HOSTDEVICE T operator()(const T& out, const T& dout) const {
+    return out > 0 ? dout : dout * (*alpha_);
+  }
+ private:
+  // Not owned; read on every call that takes the non-positive branch.
+  const T* alpha_;
+};
+// Backward PRelu kernel: X@GRAD is Out@GRAD where X > 0 and Out@GRAD * alpha
+// elsewhere. Alpha@GRAD is not written by this kernel (see the TODO).
+template <typename Place, typename T>
+class PReluGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
+    // Branch on the forward input X rather than on Out: with a negative
+    // alpha, Out is positive for negative X, so the sign of Out would pick
+    // the wrong slope. X is already required by PReluGradOp::InferShape.
+    auto* x = context.Input<Tensor>("X");
+    auto* alpha = context.Input<Tensor>("Alpha");
+    auto* alpha_ptr = alpha->data<T>();
+    T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
+    const T* dout_ptr = dout->data<T>();
+    const T* x_ptr = x->data<T>();
+    // Keep the element count in the type numel() returns rather than
+    // narrowing it to int before it is used as a pointer offset.
+    const auto numel = dx->numel();
+    Transform<Place> trans;
+    trans(context.device_context(), x_ptr, x_ptr + numel, dout_ptr, dx_ptr,
+          PReluGradFunctor<T>(alpha_ptr));
+    // TODO (Zhuoyuan): add dalpha upgrade when GPU kernels ready
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/split_op.h"
+#include "paddle/operators/net_op.h"
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+// Split operator. InferShape sizes every tensor bound to Out: each output
+// keeps the dimensions of X except along `axis`, where it gets either an
+// equal share (attribute `num`) or the matching entry of `sections`.
+class SplitOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    // X is dereferenced below, so reject an unbound input up front.
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
+                            "Input(X) of SplitOp should not be null.");
+    auto *in = ctx.Input<framework::Tensor>("X");
+    auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
+    size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
+    size_t num = static_cast<size_t>(ctx.Attr<int>("num"));
+    std::vector<int> sections = ctx.Attr<std::vector<int>>("sections");
+    const size_t n = outs.size();
+    if (num > 0) {
+      // Equal split: the axis length must be divisible by `num`.
+      int64_t in_axis_dim = in->dims()[axis];
+      PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
+                        "tensor split does not result"
+                        " in an equal division");
+      size_t out_axis_dim = in_axis_dim / num;
+      for (size_t i = 0; i < n; ++i) {
+        auto dim = in->dims();
+        dim[axis] = out_axis_dim;
+        outs[i]->Resize(dim);
+      }
+    } else if (sections.size() > 0) {
+      // Explicit split: one section length per output.
+      PADDLE_ENFORCE_EQ(sections.size(), n,
+                        "tensor split sections size"
+                        " should be equal to output size.");
+      for (size_t i = 0; i < n; ++i) {
+        auto dim = in->dims();
+        dim[axis] = sections[i];
+        outs[i]->Resize(dim);
+      }
+    } else {
+      // Neither attribute was provided; there is no way to size the outputs.
+      PADDLE_THROW("split operator should specify num or sections.");
+    }
+  }
+};
+// Declares the split operator's interface: input X, duplicable output Out,
+// and the attributes `sections`, `num` and `axis` that control the split.
+class SplitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "the input tensor of split operator.");
+    AddOutput("Out", "the output tensors of split operator.").AsDuplicable();
+    AddComment(R"DOC(
+      Split the input tensor into multiple sub-tensors.
+      Example:
+        Input = [[1,2],
+                 [3,4],
+                 [5,6]]
+        sections = [2,1]
+        axis = 0
+        Output[0] = [[1,2],
+                     [3,4]]
+        Output[1] = [[5,6]]
+    )DOC");
+    // Adjacent string literals are concatenated verbatim, so each fragment
+    // that continues a sentence must carry its own separating space.
+    AddAttr<std::vector<int>>("sections",
+                              "the length of each "
+                              "output along the specified axis.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<int>("num",
+                 "number of the sub-tensors, it must evenly divide "
+                 "Input.dims()[axis]")
+        .SetDefault(0);
+    AddAttr<int>("axis", "The axis along which the input will be split.")
+        .SetDefault(0);
+  }
+};
+// Gradient of split, expressed as a one-op network: a single `concat` that
+// takes all Out@GRAD variables as X and produces X@GRAD as Out, created with
+// the attributes passed to this op.
+class SplitOpGrad : public NetOp {
+ public:
+  SplitOpGrad(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    const auto out_grad_names = Inputs(framework::GradVarName("Out"));
+    const auto x_grad_name = Output(framework::GradVarName("X"));
+    AppendOp(framework::OpRegistry::CreateOp(
+        "concat", {{"X", out_grad_names}}, {{"Out", {x_grad_name}}}, attrs));
+    CompleteAddOp(false);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// SplitOpGrad instantiates a "concat" op, so that operator is referenced here
+// (presumably to keep its registration linked in; confirm against the macro).
+USE_CPU_ONLY_OP(concat);
+// Register the forward op (with its maker) together with its gradient op.
+REGISTER_OP(split, ops::SplitOp, ops::SplitOpMaker, split_grad,
+            ops::SplitOpGrad);
+// Only a CPU kernel for float is registered.
+REGISTER_OP_CPU_KERNEL(split,
+                       ops::SplitKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/split_op.h
+++ b/paddle/operators/split_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <vector>
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+// Split kernel: copies consecutive slices of X along `axis` into each tensor
+// bound to Out with memcpy. The slice length of every output is read from
+// that output's own dimension along `axis`.
+template <typename Place, typename T>
+class SplitKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
+    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    const size_t out_count = outs.size();
+    size_t input_axis_dim = in->dims()[axis];
+    // `before` is the product of the dimensions preceding `axis`, `after` the
+    // product of those following it; the axis dimension itself is skipped.
+    size_t before = 1;
+    size_t after = 1;
+    for (int64_t d = 0; d < in->dims().size(); ++d) {
+      if (d < axis) {
+        before *= in->dims()[d];
+      } else if (d > axis) {
+        after *= in->dims()[d];
+      }
+    }
+    // Element offset, within one input row, where the current output starts.
+    size_t input_offset = 0;
+    for (size_t k = 0; k < out_count; k++) {
+      auto& out = outs[k];
+      size_t axis_dim = out->dims()[axis];
+      const size_t out_row_elems = axis_dim * after;
+      const size_t in_row_elems = input_axis_dim * after;
+      const size_t row_bytes = out_row_elems * sizeof(T);
+      for (size_t row = 0; row < before; row++) {
+        // NOTE(review): the destination is always requested from
+        // platform::CPUPlace(), independent of the Place template argument,
+        // and both data accessors are loop-invariant; they are left inside
+        // the loop here to keep the call sequence unchanged.
+        T* dest =
+            out->mutable_data<T>(platform::CPUPlace()) + out_row_elems * row;
+        const T* src = in->data<T>() + input_offset + in_row_elems * row;
+        memcpy(dest, src, row_bytes);
+      }
+      input_offset += out_row_elems;
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -24,4 +24,4 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
-nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place)
+nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
--- a/paddle/platform/cuda_helper.h
+++ b/paddle/platform/cuda_helper.h
@@ -24,6 +24,11 @@ namespace platform {
 #define USE_CUDA_ATOMIC(op, T) \
  CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
+// Default thread count per block (i.e. the block size).
+// TODO(typhoonzero): need to benchmark against setting this value
+//                    to 1024.
+constexpr int PADDLE_CUDA_NUM_THREADS = 512;
 // For atomicAdd.
 USE_CUDA_ATOMIC(Add, float);

--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -101,19 +101,17 @@ CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
  eigen_stream_.reset(new EigenCudaStreamDevice());
  eigen_stream_->Reinitialize(&stream_, place);
  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+  PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
+  PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
+  PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+  PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
 }
 CUDADeviceContext::~CUDADeviceContext() {
  SetDeviceId(place_.device);
  Wait();
-  if (cublas_handle_) {
+  PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
-    PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
+  PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
-  }
-  if (cudnn_handle_) {
-    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
-  }
  eigen_stream_.reset();
  eigen_device_.reset();
  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
@@ -129,25 +127,13 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
  return eigen_device_.get();
 }
-cublasHandle_t CUDADeviceContext::cublas_handle() {
+cublasHandle_t CUDADeviceContext::cublas_handle() const {
-  if (!cublas_handle_) {
-    SetDeviceId(place_.device);
-    PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
-    PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
-  }
  return cublas_handle_;
 }
-cudnnHandle_t CUDADeviceContext::cudnn_handle() {
+cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
-  if (!cudnn_handle_) {
-    SetDeviceId(place_.device);
-    PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
-    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
-  }
-  return cudnn_handle_;
-}
-cudaStream_t CUDADeviceContext::stream() { return stream_; }
+cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 #endif  // PADDLE_ONLY_CPU

--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -67,16 +67,14 @@ class CUDADeviceContext : public DeviceContext {
  /*! \brief  Return eigen device in the device context. */
  Eigen::GpuDevice* eigen_device() const;
-  // clang-format off
  /*! \brief  Return cublas handle in the device context. */
-  cublasHandle_t    cublas_handle();
+  cublasHandle_t cublas_handle() const;
  /*! \brief  Return cudnn  handle in the device context. */
-  cudnnHandle_t     cudnn_handle();
+  cudnnHandle_t cudnn_handle() const;
  /*! \brief  Return cuda stream in the device context. */
-  cudaStream_t      stream();
+  cudaStream_t stream() const;
-  // clang-format on
 private:
  GPUPlace place_;
@@ -84,11 +82,9 @@ class CUDADeviceContext : public DeviceContext {
  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
-  // clang-format off
+  cudaStream_t stream_;
-  cudaStream_t       stream_{nullptr};
+  cudnnHandle_t cudnn_handle_;
-  cudnnHandle_t      cudnn_handle_{nullptr};
+  cublasHandle_t cublas_handle_;
-  cublasHandle_t     cublas_handle_{nullptr};
-  // clang-format on
 };
 #endif

--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
@@ -14,6 +14,7 @@
 #pragma once
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/hostdevice.h"
 #include "paddle/platform/place.h"
@@ -21,46 +22,78 @@
 #include <algorithm>
 #include <type_traits>
 #ifdef __NVCC__
+#include <thrust/execution_policy.h>
 #include <thrust/transform.h>
 #include "paddle/platform/details/device_ptr_cast.h"
 #endif
 namespace paddle {
 namespace platform {
 // Transform on host or device. It provides the same API in std library.
-template <typename Place, typename InputIter, typename OutputIter,
+template <typename Place>
-          typename UnaryOperation>
+struct Transform {
-void Transform(Place place, InputIter first, InputIter last, OutputIter result,
+  template <typename InputIter, typename OutputIter, typename UnaryOperation>
-               UnaryOperation op) {
+  void operator()(const DeviceContext& context, InputIter first, InputIter last,
-  if (is_cpu_place(place)) {
+                  OutputIter result, UnaryOperation op);
+  template <typename InputIter1, typename InputIter2, typename OutputIter,
+            typename BinaryOperation>
+  void operator()(const DeviceContext& context, InputIter1 first1,
+                  InputIter1 last1, InputIter2 first2, OutputIter result,
+                  BinaryOperation op);
+};
+template <>
+struct Transform<platform::CPUPlace> {
+  template <typename InputIter, typename OutputIter, typename UnaryOperation>
+  void operator()(const DeviceContext& context, InputIter first, InputIter last,
+                  OutputIter result, UnaryOperation op) {
+    auto place = context.GetPlace();
+    PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place.");
    std::transform(first, last, result, op);
-  } else {
-#ifdef __NVCC__
-    using namespace details;
-    thrust::transform(DevPtrCast(first), DevPtrCast(last), DevPtrCast(result),
-                      op);
-#else
-    PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
-#endif
  }
-}
-template <typename Place, typename InputIter1, typename InputIter2,
+  template <typename InputIter1, typename InputIter2, typename OutputIter,
-          typename OutputIter, typename BinaryOperation>
+            typename BinaryOperation>
-void Transform(Place place, InputIter1 first1, InputIter1 last1,
+  void operator()(const DeviceContext& context, InputIter1 first1,
-               InputIter2 first2, OutputIter result, BinaryOperation op) {
+                  InputIter1 last1, InputIter2 first2, OutputIter result,
-  if (is_cpu_place(place)) {
+                  BinaryOperation op) {
+    auto place = context.GetPlace();
+    PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place.");
    std::transform(first1, last1, first2, result, op);
-  } else {
+  }
+};
 #ifdef __NVCC__
-    using namespace details;
+template <>
-    thrust::transform(DevPtrCast(first1), DevPtrCast(last1), DevPtrCast(first2),
+struct Transform<platform::GPUPlace> {
-                      DevPtrCast(result), op);
+  template <typename InputIter, typename OutputIter, typename UnaryOperation>
-#else
+  void operator()(const DeviceContext& context, InputIter first, InputIter last,
-    PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
+                  OutputIter result, UnaryOperation op) {
-#endif
+    auto place = context.GetPlace();
+    PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
+    auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
+    thrust::transform(thrust::cuda::par.on(ctx.stream()),
+                      details::DevPtrCast(first), details::DevPtrCast(last),
+                      details::DevPtrCast(result), op);
+  }
+  template <typename InputIter1, typename InputIter2, typename OutputIter,
+            typename BinaryOperation>
+  void operator()(const DeviceContext& context, InputIter1 first1,
+                  InputIter1 last1, InputIter2 first2, OutputIter result,
+                  BinaryOperation op) {
+    auto place = context.GetPlace();
+    PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
+    auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
+    thrust::transform(thrust::cuda::par.on(ctx.stream()),
+                      details::DevPtrCast(first1), details::DevPtrCast(last1),
+                      details::DevPtrCast(first2), details::DevPtrCast(result),
+                      op);
  }
 };
+#endif
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -45,14 +45,18 @@ add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies})
 set(PSERVER_MAIN_SOURCES
    ParameterServer2Main.cpp)
-add_executable(paddle_pserver_main
-    ${PSERVER_MAIN_SOURCES})
-link_paddle_exe(paddle_pserver_main)
 if(WITH_TESTING)
  add_subdirectory(test)
 endif()
-install(TARGETS paddle_pserver_main
-    RUNTIME DESTINATION opt/paddle/bin
+if(NOT WITH_C_API)
-    PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
+  add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES})
-        GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
+  link_paddle_exe(paddle_pserver_main)
-set_target_properties(paddle_pserver_main PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+  install(TARGETS paddle_pserver_main
+          RUNTIME DESTINATION opt/paddle/bin
+          PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
+          GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
+  set_target_properties(paddle_pserver_main PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+endif()
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
--- a/paddle/scripts/travis/build_ios.sh
+++ b/paddle/scripts/travis/build_ios.sh
--- a/paddle/scripts/travis/check_style.sh
+++ b/paddle/scripts/travis/check_style.sh
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
--- a/paddle/utils/Excepts.h
+++ b/paddle/utils/Excepts.h
--- a/paddle/utils/arch/linux/Locks.cpp
+++ b/paddle/utils/arch/linux/Locks.cpp
--- a/paddle/utils/arch/osx/Excepts.cpp
+++ b/paddle/utils/arch/osx/Excepts.cpp
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
--- a/python/paddle/v2/framework/tests/test_accuracy_op.py
+++ b/python/paddle/v2/framework/tests/test_accuracy_op.py
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_dropout_op.py
+++ b/python/paddle/v2/framework/tests/test_dropout_op.py
--- a/python/paddle/v2/framework/tests/test_fc_op.py
+++ b/python/paddle/v2/framework/tests/test_fc_op.py
--- a/python/paddle/v2/framework/tests/test_identity_op.py
+++ b/python/paddle/v2/framework/tests/test_identity_op.py
--- a/python/paddle/v2/framework/tests/test_mnist.py
+++ b/python/paddle/v2/framework/tests/test_mnist.py
--- a/python/paddle/v2/framework/tests/test_onehot_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_onehot_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_prelu_op.py
+++ b/python/paddle/v2/framework/tests/test_prelu_op.py
--- a/python/paddle/v2/framework/tests/test_split_op.py
+++ b/python/paddle/v2/framework/tests/test_split_op.py