diff --git a/.travis.yml b/.travis.yml index e217c8f5a740ef5ab7315656ed7839ffa219c805..d0e2696f100e55f320e410afd6a3038db647f76f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -36,10 +36,6 @@ before_install: # protobuf version. - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker - - curl https://glide.sh/get | bash - - eval "$(GIMME_GO_VERSION=1.8.3 gimme)" - - go get -u github.com/alecthomas/gometalinter - - gometalinter --install - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: diff --git a/CMakeLists.txt b/CMakeLists.txt index 5739c2a26039426ab544f762e9401445f01e7de7..4b564b48265897d8b412603baf181030e2b00f82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ if(NOT CMAKE_CROSSCOMPILING) endif(NOT CMAKE_CROSSCOMPILING) find_package(Git REQUIRED) find_package(Threads REQUIRED) -if(NOT ANDROID) +if(NOT ANDROID AND NOT IOS) find_package(Boost QUIET) endif() @@ -64,27 +64,29 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() -if(ANDROID) - if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") - message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") - elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") - # TODO: support glog for Android api 16 ~ 19 in the future - message(WARNING "Using the unofficial git repository instead") +if(ANDROID OR IOS) + if(ANDROID) + if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") + message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") + elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") + # TODO: support glog for Android api 16 ~ 19 in the future + message(WARNING "Using the unofficial git repository instead") + endif() endif() set(WITH_GPU OFF CACHE STRING - "Disable GPU when cross-compiling for Android" FORCE) + "Disable GPU when cross-compiling for Android and iOS" FORCE) set(WITH_AVX OFF CACHE STRING - "Disable AVX when cross-compiling for Android" FORCE) + "Disable AVX when cross-compiling for Android and iOS" FORCE) set(WITH_PYTHON OFF CACHE STRING - "Disable PYTHON when cross-compiling for Android" FORCE) + "Disable PYTHON when cross-compiling for Android and iOS" FORCE) set(WITH_RDMA OFF CACHE STRING - "Disable RDMA when cross-compiling for Android" FORCE) + "Disable RDMA when cross-compiling for Android and iOS" FORCE) set(WITH_MKLDNN OFF CACHE STRING - "Disable MKLDNN when cross-compiling for Android" FORCE) + "Disable MKLDNN when cross-compiling for Android and iOS" FORCE) set(WITH_MKLML OFF CACHE STRING - "Disable MKLML package when cross-compiling for Android" FORCE) -endif(ANDROID) + "Disable MKLML package when cross-compiling for Android and iOS" FORCE) +endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 854066fd1d205c337fbdbe08997d88251095c799..8fdc382f0c1c453a01dba884a3dad216e1c3092c 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -171,3 +171,10 @@ if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) add_definitions(-DPADDLE_USE_REFERENCE_CBLAS) message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") endif() + +if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND) + set(CBLAS_FOUND ON) + set(CBLAS_PROVIDER vecLib) + set(CBLAS_INC_DIR ${VECLIB_INC_DIR}) + add_definitions(-DPADDLE_USE_VECLIB) +endif() diff --git
a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake new file mode 100644 index 0000000000000000000000000000000000000000..0b38943952f7fb9052368fe95eb31dd7592d8a47 --- /dev/null +++ b/cmake/cross_compiling/ios.cmake @@ -0,0 +1,350 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This is a toolchain file for cross-compiling for iOS, and the +# configuration largely refers to public toolchain file: +# https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake +# and +# https://github.com/cristeab/ios-cmake +# +# Supports options: +# IOS_PLATFORM = OS (default) or SIMULATOR +# This decides if SDKs will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders +# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch. +# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch. +# IOS_ARCH +# The architectures to support, such as "arm64" or "armv7;arm64" +# IOS_DEPLOYMENT_TARGET +# The minimum iOS deployment version, such as "7.0" +# IOS_ENABLE_BITCODE = ON (default) or OFF +# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON +# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder +# By default this location is automatically chosen based on the IOS_PLATFORM value above. +# If set manually, it will override the default location and force the use of a particular Developer Platform +# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder +# By default this location is automatically chosen based on the IOS_DEVELOPER_ROOT value. +# In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path. +# If set manually, this will force the use of a specific SDK version + +# Macros: +# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE) +# A convenience macro for setting Xcode-specific properties on targets +# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1") +# find_host_package (PROGRAM ARGS) +# A macro used to find executable programs on the host system, not within the iOS environment. +# Thanks to the android-cmake project for providing the command + +if(NOT IOS) + return() +endif() + +set(CMAKE_SYSTEM_NAME Darwin) + +# Get the Xcode version being used.
+execute_process(COMMAND xcodebuild -version + OUTPUT_VARIABLE XCODE_VERSION + RESULT_VARIABLE XCODE_VERSION_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +if(NOT ${XCODE_VERSION_RESULT}) + string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") + string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") + message(STATUS "Building with Xcode version: ${XCODE_VERSION}") +else() + message(FATAL_ERROR "Cannot execute xcodebuild, please check whether Xcode is installed.") +endif() + +# Required as of cmake 2.8.10 +set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE) + +# Setup iOS platform unless specified manually with IOS_PLATFORM +if(NOT DEFINED IOS_PLATFORM) + set(IOS_PLATFORM "OS") +endif() +set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") + +# Set the architecture for iOS +if(NOT DEFINED IOS_ARCH) + if(IOS_PLATFORM STREQUAL "OS") + # FIXME(liuyiqun): support "armv7;armv7s;arm64" in the future + set(IOS_ARCH "arm64") + elseif(IOS_PLATFORM STREQUAL "SIMULATOR") + set(IOS_ARCH "i386;x86_64") + elseif(IOS_PLATFORM STREQUAL "WATCHOS") + set(IOS_ARCH armv7k) + endif() +endif() +set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS") + +# Specify minimum iOS deployment version +if(NOT DEFINED IOS_DEPLOYMENT_TARGET) + set(IOS_DEPLOYMENT_TARGET "7.0") +endif() +set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version") + +# Whether to enable bitcode +if(NOT DEFINED IOS_ENABLE_BITCODE) + set(IOS_ENABLE_BITCODE ON) +endif() +set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode") + +if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS) + set(IOS_USE_VECLIB_FOR_BLAS OFF) +endif() +set(IOS_USE_VECLIB_FOR_BLAS ${IOS_USE_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib") + +# Check the platform selection and setup for developer root +if(${IOS_PLATFORM} STREQUAL "OS") + set(IOS_PLATFORM_LOCATION "iPhoneOS.platform") + set(XCODE_IOS_PLATFORM iphoneos) + + # This causes the installers to properly locate the output libraries + set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos") +elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR") + set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform") + set(XCODE_IOS_PLATFORM iphonesimulator) + + # This causes the installers to properly locate the output libraries + set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator") +elseif(${IOS_PLATFORM} STREQUAL "WATCHOS") + set(IOS_PLATFORM_LOCATION "WatchOS.platform") + set(XCODE_IOS_PLATFORM watchos) + + # This causes the installers to properly locate the output libraries + set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos") +else(${IOS_PLATFORM} STREQUAL "OS") + message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected.
Please set to\n" + "\t OS, SIMULATOR, or WATCHOS.") +endif() + +# Check iOS developer toolchain +if(NOT DEFINED IOS_DEVELOPER_ROOT) + # Setup iOS developer location + execute_process(COMMAND xcode-select -print-path + OUTPUT_VARIABLE XCODE_DEVELOPER_DIR + RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + # Xcode 4.3 changed the installation location, choose the most recent one available + if(${XCODE_VERSION} VERSION_LESS "4.3.0") + set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer") + else() + set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer") + endif() +endif() +if(EXISTS ${IOS_DEVELOPER_ROOT}) + set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform") +else() + message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.") +endif() + +# Check iOS SDK +if(NOT DEFINED IOS_SDK_ROOT) + # Find and use the most recent iOS sdk + file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*") + if(IOS_SDK_LISTS) + list(SORT IOS_SDK_LISTS) + list(REVERSE IOS_SDK_LISTS) + list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT) + else(IOS_SDK_LISTS) + message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}." + " Please manually set IOS_SDK_ROOT or install the iOS SDK.") + endif(IOS_SDK_LISTS) +endif() +if(EXISTS ${IOS_SDK_ROOT}) + set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK") + message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}") +else() + message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.") +endif() + +# Set the sysroot default to the most recent SDK +set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support") + +# Get version of iOS SDK +execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion + OUTPUT_VARIABLE IOS_SDK_VERSION + RESULT_VARIABLE IOS_SDK_VERSION_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +if(${IOS_SDK_VERSION_RESULT}) + string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}") +endif() +if(NOT IOS_SDK_VERSION) + message(WARNING "Cannot get SDK's version.") + set(IOS_SDK_VERSION 1) +endif() +set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION}) + +# Find the C & C++ compilers for the specified SDK. 
+if(NOT CMAKE_C_COMPILER) + # Default to use clang + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang + OUTPUT_VARIABLE IOS_C_COMPILER + RESULT_VARIABLE IOS_C_COMPILER_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(${IOS_C_COMPILER_RESULT}) + get_filename_component(IOS_C_COMPILER clang PROGRAM) + endif() +else(NOT CMAKE_C_COMPILER) + # User can set it in cmake command + get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) +endif(NOT CMAKE_C_COMPILER) +if(NOT EXISTS ${IOS_C_COMPILER}) + message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}") +endif() + +if(NOT CMAKE_CXX_COMPILER) + # Default to use clang++ + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ + OUTPUT_VARIABLE IOS_CXX_COMPILER + RESULT_VARIABLE IOS_CXX_COMPILER_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(${IOS_CXX_COMPILER_RESULT}) + get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM) + endif() +else(NOT CMAKE_CXX_COMPILER) + # User can set it in cmake command + get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) +endif(NOT CMAKE_CXX_COMPILER) +if(NOT EXISTS ${IOS_CXX_COMPILER}) + message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}") +endif() + +set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE) +set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) + +set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") +set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") +set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") +set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") + +# Set iOS specific C/C++ flags +if(IOS_PLATFORM STREQUAL "OS") + if(XCODE_VERSION VERSION_LESS "7.0") + set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}") + else() + # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM. 
+ set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") + endif() +else() + set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") +endif() + +if(IOS_ENABLE_BITCODE) + set(XCODE_IOS_BITCODE_FLAGS "${IOS_COMPILER_FLAGS} -fembed-bitcode") +else() + set(XCODE_IOS_BITCODE_FLAGS "") +endif() + +set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}") + +# Hidden visibility is required for cxx on iOS +set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") +set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") + +set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first") + +if(IOS_USE_VECLIB_FOR_BLAS) + # Find vecLib for iOS + set(VECLIB_SEARCH_DIRS + ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks + ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks + ) + find_path(VECLIB_INC_DIR vecLib.h PATHS ${VECLIB_SEARCH_DIRS}/vecLib.framework/Headers) + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR) + + if(VECLIB_FOUND) + if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*") + set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework vecLib") + message(STATUS "Found standalone vecLib.framework") + else() + set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework Accelerate") + message(STATUS "Found vecLib as part of Accelerate.framework") + endif() + + endif() +endif() + +set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}") +set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}") + +set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) +if(NOT IOS_ENABLE_BITCODE) + set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names") + set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names") +else() + set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib") + set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle") +endif() +set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") +set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") +set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a") + +# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree +# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache +# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun) +# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex +if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) + find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) +endif() + +# Set the find root to the iOS developer roots and to user defined paths +set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} + CACHE STRING "iOS find search path root") + +# default to searching for frameworks first +set(CMAKE_FIND_FRAMEWORK FIRST) + +# set up the default search directories for frameworks +set(CMAKE_SYSTEM_FRAMEWORK_PATH + ${IOS_SDK_ROOT}/System/Library/Frameworks + ${IOS_SDK_ROOT}/System/Library/PrivateFrameworks + ${IOS_SDK_ROOT}/Developer/Library/Frameworks + ) + +# only search the iOS sdks, not the remainder of the host filesystem +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +
+message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', " + "building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'") +message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}") +message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") + +# Used in ExternalProject command +string(REPLACE ";" "\\$" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}") +set(EXTERNAL_OPTIONAL_ARGS + -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT} + -DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES}) + +# This little macro lets you set any XCode specific property +macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE) + set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE}) +endmacro(set_xcode_property) + +# This macro lets you find executable programs on the host system +macro(find_host_package) + set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) + set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) + set(IOS FALSE) + + find_package(${ARGN}) + + set(IOS TRUE) + set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) + set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +endmacro(find_host_package) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 01a2f4d5fa357ca882162247cc52299a3d1d3030..957f8271e4841836956b0c3f2cf3d8c88a31192a 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -39,13 +39,14 @@ ExternalProject_Add( PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DBUILD_TESTING=OFF - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=Release + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=Release diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index b450a3016667dcb4ab229fe7ec8aaae8609d8171..b3fef738ccc0b5886bb0a32501bb7b7adade0ff1 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -34,16 +34,17 @@ ExternalProject_Add( PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} - CMAKE_ARGS -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DWITH_GFLAGS=ON - CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags - CMAKE_ARGS -DBUILD_TESTING=OFF - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DWITH_GFLAGS=ON + -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags + 
-DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=Release + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index e3970073a1a0b946fa1db6642799719d7a9fcf4f..6a2a79b7631b32e8a099797de509af64533bbb95 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -48,15 +48,16 @@ IF(WITH_TESTING) PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DBUILD_GMOCK=ON - CMAKE_ARGS -Dgtest_disable_pthreads=ON - CMAKE_ARGS -Dgtest_force_shared_crt=ON - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_GMOCK=ON + -Dgtest_disable_pthreads=ON + -Dgtest_force_shared_crt=ON + -DCMAKE_BUILD_TYPE=Release + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=Release diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 4fc8d43fc10891603b79c01a1c769cae21c52655..143b57a954e4e6b2bf273535ebdf0fa8e3dab768 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -29,30 +29,41 @@ IF(NOT ${CBLAS_FOUND}) "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." 
FORCE) - IF(APPLE) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") - SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) - ELSE() - SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs) - ENDIF() + SET(OPENBLAS_CC "${CMAKE_C_COMPILER}") IF(CMAKE_CROSSCOMPILING) + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) + GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY) + SET(CROSS_SUFFIX ${CROSS_SUFFIX}/) IF(ANDROID) # arm_soft_fp_abi branch of OpenBLAS to support softfp # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - SET(TARGET "ARMV7") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") - SET(TARGET "ARMV8") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) + ENDIF() + ELSEIF(IOS) + # FIXME(liuyiqun): support multiple architectures + SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") + SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") + IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7") + SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) + ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) ENDIF() - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0) ELSEIF(RPI) # use hardfp SET(OPENBLAS_COMMIT "v0.2.20") - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0) ENDIF() ELSE() + IF(APPLE) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") + ENDIF() SET(OPENBLAS_COMMIT "v0.2.20") SET(OPTIONAL_ARGS "") IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") @@ -60,6 +71,8 @@ IF(NOT ${CBLAS_FOUND}) ENDIF() ENDIF() + SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) + ExternalProject_Add( extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index a887be2e2ae5e21562fc15c775bb24cc1553480e..7cf7ba85cca4c248dcc74e078124c0b3815ee380 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -173,7 +173,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" "-Dprotobuf_WITH_ZLIB=ON" - "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}") + "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" + ${EXTERNAL_OPTIONAL_ARGS}) SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") ENDIF() diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 490c87d67ed79a238dd506127cd4d9855fab6626..46c68cce324f565ec9985ef1a280d6d933f88f1f 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -12,16 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +IF(NOT WITH_PYTHON) + return() +ENDIF() + INCLUDE(python_module) FIND_PACKAGE(PythonInterp 2.7) -IF(WITH_PYTHON) - FIND_PACKAGE(PythonLibs 2.7) - # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. 
- ADD_LIBRARY(python SHARED IMPORTED GLOBAL) - SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) -ENDIF(WITH_PYTHON) +FIND_PACKAGE(PythonLibs 2.7) +# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. +ADD_LIBRARY(python SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) SET(py_env "") IF(PYTHONINTERP_FOUND) @@ -36,9 +37,5 @@ IF(PYTHONINTERP_FOUND) ENDIF() ENDIF(PYTHONINTERP_FOUND) -IF(WITH_PYTHON) - INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) - INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) -ELSE() - SET(PYTHON_LIBRARIES "") -ENDIF() +INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) +INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake index 744c766ee7b067058b2cb4aa7f7b761cbb9778d4..ce088ae7eaa3355f2f9761e8c421da0d7ef89fa7 100644 --- a/cmake/external/swig.cmake +++ b/cmake/external/swig.cmake @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +IF(NOT WITH_SWIG_PY) + return() +ENDIF() + FIND_PACKAGE(SWIG) IF(NOT SWIG_FOUND) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 2d7daed9bcd5b8d854ffae6dc1ea191d154c16fe..bb258c7b5581fc22b44f4fe15c119f8081f4767e 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -16,25 +16,14 @@ INCLUDE(ExternalProject) SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) -SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) -INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) - -SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE) - -IF(WIN32) - SET(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/lib/warpctc.dll" CACHE FILEPATH "Warp-ctc Library" FORCE) -ELSE(WIN32) - IF(APPLE) - SET(_warpctc_SHARED_SUFFIX dylib) - ELSE(APPLE) - SET(_warpctc_SHARED_SUFFIX so) - ENDIF(APPLE) - - SET(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/lib/libwarpctc.${_warpctc_SHARED_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) -ENDIF(WIN32) +SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" + CACHE PATH "Warp-ctc Directory" FORCE) +# Used in unit test test_WarpCTCLayer +SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" + CACHE PATH "Warp-ctc Library Directory" FORCE) +SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) SET(USE_OMP OFF) @@ -49,22 +38,26 @@ ExternalProject_Add( PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} - CMAKE_ARGS -DWITH_GPU=${WITH_GPU} - CMAKE_ARGS -DWITH_OMP=${USE_OMP} - CMAKE_ARGS -DWITH_TORCH=OFF - CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - CMAKE_ARGS -DBUILD_SHARED=ON - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + 
-DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=Release + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) +MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") +INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) + ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) ADD_DEPENDENCIES(warpctc extern_warpctc) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 5aecab90ca3cecdfdba0eac178a6ba07dfcb8745..c496a52b780364f3014f8fa3dfbc944a7aa7430e 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -34,15 +34,16 @@ ExternalProject_Add( GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} - CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DCMAKE_MACOSX_RPATH=ON - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_MACOSX_RPATH=ON + -DCMAKE_BUILD_TYPE=Release + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=Release diff --git a/cmake/flags.cmake b/cmake/flags.cmake index ff246b2eb4ed97dd14d45763569b661cefd203c8..4593ae6180b6d7deb61d897eb634b17ac0bb1683 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -128,8 +128,10 @@ set(GPU_COMMON_FLAGS ) if (APPLE) - # On Mac OS X build fat binaries with x86_64 architectures by default. - set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) + if(NOT CMAKE_CROSSCOMPILING) + # On Mac OS X build fat binaries with x86_64 architectures by default. + set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) + endif() else() set(GPU_COMMON_FLAGS -Wall diff --git a/cmake/system.cmake b/cmake/system.cmake index adf5e2c539740076ad1808353522c7467d765e64..396bd1a0797edea0522bb1f02349373563b7726a 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -24,11 +24,10 @@ IF(WIN32) SET(HOST_SYSTEM "win32") ELSE(WIN32) IF(APPLE) - EXEC_PROGRAM (sw_vers ARGS -productVersion OUTPUT_VARIABLE MACOSX_VERSION) - STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}") - SET(MACOS_VERSION ${VERSION}) SET(HOST_SYSTEM "macosx") - IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET}) + EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION) + STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}") + IF(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET}) # Set cache variable - end user may change this during ccmake or cmake-gui configure. SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. 
Set to empty string for default value.") @@ -49,6 +48,8 @@ ELSE(WIN32) ELSEIF(LINUX_ISSUE MATCHES "Fedora") SET(HOST_SYSTEM "fedora") ENDIF() + + STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}") ENDIF(EXISTS "/etc/issue") IF(EXISTS "/etc/redhat-release") @@ -70,7 +71,7 @@ CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES) MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) -MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}") +MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") # configuration for cross-compiling @@ -82,6 +83,9 @@ IF(DEFINED CMAKE_SYSTEM_NAME) ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi") SET(RPI TRUE) INCLUDE(cross_compiling/raspberry_pi) + ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") + SET(IOS TRUE) + INCLUDE(cross_compiling/ios) ENDIF() ENDIF() diff --git a/cmake/util.cmake b/cmake/util.cmake index 0da4969d310368ab27b0ed65237813c07d6e59f0..e814cad36f2a8ce95a2dc9fabc35cb39506d4cd7 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -25,7 +25,9 @@ function(target_circle_link_libraries TARGET_NAME) endif() endforeach() if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") - list(APPEND LIBS "-undefined dynamic_lookup") + if(IOS AND NOT IOS_ENABLE_BITCODE) + list(APPEND LIBS "-undefined dynamic_lookup") + endif() endif() list(REVERSE libsInArgn) target_link_libraries(${TARGET_NAME} diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index 138efb566e43fa71952f057829c2afbca96cadc9..00192aa69bd487787a8743d5589a365eacbd4ff3 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -321,3 +321,55 @@ pip uninstall py_paddle paddle 然后安装paddle的python环境, 在build目录下执行 pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl + +16. PaddlePaddle存储的参数格式是什么,如何和明文进行相互转化 +--------------------------------------------------------- + +PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数两部分组成。头信息中,1~4字节表示PaddlePaddle版本信息,请直接填充0;5~8字节表示每个参数占用的字节数,当保存的网络参数为float类型时为4,double类型时为8;9~16字节表示保存的参数总个数。 + +将PaddlePaddle保存的模型参数还原回明文时,可以使用相应数据类型的 :code:`numpy.array` 加载具体网络参数,此时可以跳过PaddlePaddle模型参数文件的头信息。若在PaddlePaddle编译时,未指定按照double精度编译,默认情况下按照float精度计算,保存的参数也是float类型。这时在使用 :code:`numpy.array` 时,一般设置 :code:`dtype=float32` 。示例如下: + +.. code-block:: python + + def read_parameter(fname, width): + s = open(fname).read() + # skip header + vec = np.fromstring(s[16:], dtype=np.float32) + # width is the size of the corresponding layer + np.savetxt(fname + ".csv", vec.reshape(width, -1), + fmt="%.6f", delimiter=",") + + +将明文参数转化为PaddlePaddle可加载的模型参数时,首先构造头信息,再写入网络参数。下面的代码将随机生成的矩阵转化为可以被PaddlePaddle加载的模型参数。 + +.. code-block:: python + + def gen_rand_param(param_file, width, height, need_trans): + np.random.seed() + header = struct.pack("iil", 0, 4, height * width) + param = np.float32(np.random.rand(height, width)) + with open(param_file, "w") as fparam: + fparam.write(header + param.tostring()) + +17. 如何加载预训练参数 +------------------------------ + +* 对加载预训练参数的层,设置其参数属性 :code:`is_static=True`,使该层的参数在训练过程中保持不变。以embedding层为例,代码如下: + +.. code-block:: python + + emb_para = paddle.attr.Param(name='emb', is_static=True) + paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para) + + +* 从模型文件将预训练参数载入 :code:`numpy.array`,在创建parameters后,使用 :code:`parameters.set()` 加载预训练参数。PaddlePaddle保存的模型参数文件前16字节为头信息,用户将参数载入 :code:`numpy.array` 时须从第17字节开始。以embedding层为例,代码如下: + +.. 
code-block:: python + + def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. + return np.fromfile(f, dtype=np.float32).reshape(h, w) + + parameters = paddle.parameters.create(my_cost) + parameters.set('emb', load_parameter(emb_param_file, 30000, 256)) diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index ec866b2907d4623e8a94a249bc9af624071ade97..b435de80a224571d16efdee168541aa301c3f73a 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -19,7 +19,7 @@ if(Boost_FOUND) endif() if(WITH_C_API) - add_subdirectory(capi) + add_subdirectory(capi) endif() if(WITH_SWIG_PY) diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt index 3af111eb5738c3f2f399ff4e5c06c8d2ecd8973e..dd9e4f1cbd636e29a6934d1119fc93ebc9d0ecee 100644 --- a/paddle/capi/CMakeLists.txt +++ b/paddle/capi/CMakeLists.txt @@ -28,42 +28,38 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER} add_dependencies(paddle_capi paddle_proto) - # combine all paddle static libraries together, into libpaddle_capi_whole.a # user should use PaddleCAPI as -lpaddle_capi_whole -set(capi_whole_library libpaddle_capi_whole.a) -add_custom_target(paddle_capi_whole ALL - COMMAND mkdir -p o_files/capi && cd o_files/capi/ && ar -x $ - COMMAND mkdir -p o_files/utils && cd o_files/utils/ && ar -x $ - COMMAND mkdir -p o_files/parameter && cd o_files/parameter/ && ar -x $ - COMMAND mkdir -p o_files/math && cd o_files/math/ && ar -x $ - COMMAND mkdir -p o_files/cuda && cd o_files/cuda/ && ar -x $ - COMMAND mkdir -p o_files/function && cd o_files/function/ && ar -x $ - COMMAND mkdir -p o_files/gserver && cd o_files/gserver/ && ar -x $ - COMMAND mkdir -p o_files/proto && cd o_files/proto/ && ar -x $ - COMMAND mkdir -p o_files/network && cd o_files/network/ && ar -x $ - COMMAND mkdir -p o_files/pserver && cd o_files/pserver/ && ar -x $ - COMMAND ar crs ${capi_whole_library} `find ./o_files -name '*.o'` - COMMAND rm -rf o_files - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS paddle_capi paddle_utils paddle_parameter paddle_math - paddle_cuda paddle_function paddle_gserver - paddle_proto paddle_pserver paddle_network - ) -set_target_properties(paddle_capi_whole - PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library}) +set(PADDLE_CAPI_INFER_LIBS + paddle_utils + paddle_parameter + paddle_math + paddle_cuda + paddle_function + paddle_gserver + paddle_proto + paddle_pserver + paddle_network) + +cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS}) -set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map") -# TODO: merge mkl into paddle_capi_shared -add_library(paddle_capi_shared SHARED ${CAPI_SOURCES}) -set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") -target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) -link_paddle_exe(paddle_capi_shared) +# No shared library for iOS +if(NOT IOS) + set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map") + # TODO: merge mkl into paddle_capi_shared + add_library(paddle_capi_shared SHARED ${CAPI_SOURCES}) + set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) + link_paddle_exe(paddle_capi_shared) +endif() # install library & headers. 
install(FILES ${CAPI_HEADERS} DESTINATION include/paddle) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle) if(ANDROID) + install(TARGETS paddle_capi_whole paddle_capi_shared + ARCHIVE DESTINATION lib/${ANDROID_ABI} + LIBRARY DESTINATION lib/${ANDROID_ABI}) execute_process( COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1 OUTPUT_VARIABLE GIT_COMMITS_LIST @@ -72,9 +68,6 @@ if(ANDROID) if(${GIT_COMMITS_LIST_RESULT}) set(GIT_COMMITS_LIST "No commits.") endif() - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} - DESTINATION lib/${ANDROID_ABI}) - install(TARGETS paddle_capi_shared DESTINATION lib/${ANDROID_ABI}) install(CODE "FILE(WRITE ${CMAKE_INSTALL_PREFIX}/lib/${ANDROID_ABI}/BUILD.txt \"Compiler:\n\" \"\\t${CMAKE_C_COMPILER}\\n\" @@ -88,8 +81,11 @@ if(ANDROID) )" ) else(ANDROID) - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib) - install(TARGETS paddle_capi_shared DESTINATION lib) + install(TARGETS paddle_capi_whole + ARCHIVE DESTINATION lib) + if(NOT IOS) + install(TARGETS paddle_capi_shared DESTINATION lib) + endif() endif(ANDROID) # this variable used for unittest diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp index 18126152ea0b4ebfe4ec5c8084479787814ed173..38aa6670612b0771cdd8f1805a6d1bd9f281bdc1 100644 --- a/paddle/function/neon/NeonDepthwiseConv.cpp +++ b/paddle/function/neon/NeonDepthwiseConv.cpp @@ -52,7 +52,7 @@ public: int outputHeight = output[2]; int outputWidth = output[3]; int filterMultiplier = outputChannels / groups_; - CHECK_EQ(inputChannels, groups_); + CHECK_EQ(static_cast(inputChannels), groups_); // only support strideH() == strideW() and filterHeight == filterWidth. CHECK_EQ(strideH(), strideW()); diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index 78e958e06fac84fa956abc9faea60157bf6132eb..8b7b2e9b65898950e036ebc023cd28990cef303f 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ b/paddle/gserver/activations/ActivationFunction.cpp @@ -22,9 +22,12 @@ limitations under the License. */ #include #include "paddle/parameter/Argument.h" #include "paddle/utils/ClassRegistrar.h" - #include "paddle/utils/Logging.h" +#ifdef PADDLE_USE_MKLDNN +#include "MKLDNNActivation.h" +#endif + namespace paddle { static ClassRegistrar gActivationRegistrar; @@ -456,6 +459,12 @@ Error __must_check backward(Argument& act) { END_DEFINE_ACTIVATION(log) ActivationFunction* ActivationFunction::create(const std::string& type) { +#ifdef PADDLE_USE_MKLDNN + if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) { + return MKLDNNActivation::create(type); + } +#endif + return gActivationRegistrar.createByType(type); } diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ac50937ef3e28c1ac5aae651f9cf266ad07abcc4 --- /dev/null +++ b/paddle/gserver/activations/MKLDNNActivation.cpp @@ -0,0 +1,87 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNActivation.h" +#include "mkldnn.hpp" +#include "paddle/utils/ClassRegistrar.h" + +namespace paddle { + +static ClassRegistrar gMKLDNNActivationRegistrar; +/** + * @def MKLDNN_ACTIVATION_CLASS_NAME + * @note MKLDNN_ACTIVATION_CLASS_NAME(relu) relu_; + * means mkldnn_reluActivation relu_; + */ +#define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation + +/** + * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION + */ +#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(ACT_TYPE, ALPHA, BWD_ALPHA) \ + class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) \ + : public MKLDNNEltwiseActivation { \ + private: \ + static const std::string name; \ + static const float alpha; \ + static const float bwdAlpha; \ + \ + public: \ + const std::string& getName() const { return name; } \ + float getAlpha() const { return alpha; } \ + float getBwdAlpha() const { return bwdAlpha; } \ + }; \ + const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name = \ + "mkldnn_" #ACT_TYPE; \ + const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA; \ + const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA; \ + static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] { \ + gMKLDNNActivationRegistrar \ + .registerClass( \ + "mkldnn_" #ACT_TYPE); \ + }); + +/** + * @brief MKLDNN Relu Activation. + * Actually mkldnn_relu is Leaky Relu. + * f(x) = x (x >= 0) + * f(x) = negative_slope * x (x < 0) + * @note the negative_slope should be -0.f in forward + */ +DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, -0.f, 0.f) + +/** + * @brief MKLDNN Tanh Activation. + */ +DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, 0.f, 0.f) + +/** + * @brief MKLDNN ELU(Exponential Linear Unit) Activation. + * f(x) = x (x >= 0) + * f(x) = negative_slope * (exp(x) - 1) (x < 0) + */ +DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, 0.f, 0.f) + +ActivationFunction* MKLDNNActivation::create(const std::string& type) { + return gMKLDNNActivationRegistrar.createByType(type); +} + +std::vector MKLDNNActivation::getAllRegisteredTypes() { + std::vector types; + gMKLDNNActivationRegistrar.forEachType( + [&](const std::string& type) { types.push_back(type); }); + return types; +} + +} // namespace paddle diff --git a/paddle/gserver/activations/MKLDNNActivation.h b/paddle/gserver/activations/MKLDNNActivation.h new file mode 100644 index 0000000000000000000000000000000000000000..bda9bbebe5600dbe26d11ff32058f7b2647b763e --- /dev/null +++ b/paddle/gserver/activations/MKLDNNActivation.h @@ -0,0 +1,182 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "ActivationFunction.h" +#include "mkldnn.hpp" +#include "paddle/gserver/layers/MKLDNNBase.h" +#include "paddle/math/MKLDNNMatrix.h" +#include "paddle/parameter/Argument.h" + +namespace paddle { + +/** + * @brief Base class of MKLDNN Activation. + * Common activation function are provieded, + * including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax + */ +class MKLDNNActivation : public ActivationFunction { +protected: + // input value element count + size_t cnt_; + // should not merge the resetBwd into resetFwd, + // because the grad data would be changing before backward. + bool needResetBwd_; + // mkldnn matrix, primitive, stream and pipeline + MKLDNNMatrixPtr val_; + MKLDNNMatrixPtr grad_; + std::shared_ptr stream_; + std::shared_ptr fwd_; + std::shared_ptr bwd_; + std::vector pipelineFwd_; + std::vector pipelineBwd_; + +public: + MKLDNNActivation() : cnt_(0), needResetBwd_(true) {} + ~MKLDNNActivation() {} + static ActivationFunction* create(const std::string& type); + static std::vector getAllRegisteredTypes(); + virtual const std::string& getName() const = 0; + virtual Error __must_check forward(Argument& act) = 0; + virtual Error __must_check backward(Argument& act) = 0; +}; + +/** + * @brief Base class of MKLDNN Eltwise Activation, + * includes mkldnn_relu, mkldnn_elu and mkldnn_tanh. + */ +class MKLDNNEltwiseActivation : public MKLDNNActivation { + typedef mkldnn::eltwise_forward eltwise_fwd; + typedef mkldnn::eltwise_backward eltwise_bwd; + +protected: + // save the forward primitive desc, which can be used backward + std::shared_ptr fwdPD_; + // eltwise_bwd need src input value + MKLDNNMatrixPtr inVal_; + // use for copy data + std::shared_ptr copyInVal_; + +public: + MKLDNNEltwiseActivation() {} + + ~MKLDNNEltwiseActivation() {} + + virtual const std::string& getName() const = 0; + + // in common, the alpha of forward and backward should be equal. + // but for relu, to avoid negative value, they should be opposite + virtual float getAlpha() const = 0; + virtual float getBwdAlpha() const = 0; + virtual float getBeta() const { return 0.f; } + virtual mkldnn::algorithm getAlgo(const std::string& type) const { + if (type == "mkldnn_relu") { + return mkldnn::algorithm::eltwise_relu; + } else if (type == "mkldnn_tanh") { + return mkldnn::algorithm::eltwise_tanh; + } else if (type == "mkldnn_elu") { + return mkldnn::algorithm::eltwise_elu; + } else { + LOG(FATAL) << "Unkown eltwise activation type: " << type; + } + return (mkldnn::algorithm)0; + } + + /** + * reshape and reset the forward primitives + */ + void resetFwd(Argument& act) { + if (cnt_ == act.value->getElementCnt()) { + return; + } + cnt_ = act.value->getElementCnt(); + stream_.reset(new MKLDNNStream()); + auto eng = CPUEngine::Instance().getEngine(); + + // get algo setting + mkldnn::algorithm algo = getAlgo(this->getName()); + // note: alpha represents the NegativeSlope when used in relu. + float alpha = getAlpha(); + float beta = getBeta(); + + /// forward + pipelineFwd_.clear(); + val_ = std::dynamic_pointer_cast(act.value); + if (val_ == nullptr) { + int bs = act.getBatchSize(); + int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1; + int iw = act.getFrameWidth() > 0 ? 
act.getFrameWidth() : 1; + int ic = cnt_ / bs / ih / iw; + CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw); + val_ = MKLDNNMatrix::create( + act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, eng); + CHECK(val_); + } + auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training, + algo, + val_->getMemoryDesc(), + alpha, + beta); + fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, eng)); + // use inplace for forward but save input value before submit + inVal_ = val_; + if (act.grad) { + // only copy when need do backward + inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc()); + copyInVal_ = std::make_shared(*val_, *inVal_); + CHECK(copyInVal_) << "should not be emptry"; + pipelineFwd_.push_back(*copyInVal_); + } + fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_)); + pipelineFwd_.push_back(*fwd_); + needResetBwd_ = true; + } + + /** + * reset the backward primitives, can not merge into resetFwd as the grad data + * would be changing before backward. + */ + void resetBwd(Argument& act) { + if (!needResetBwd_) { + return; + } + needResetBwd_ = false; + mkldnn::algorithm algo = getAlgo(this->getName()); + float alpha = getBwdAlpha(); + float beta = getBeta(); + grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc()); + auto eng = CPUEngine::Instance().getEngine(); + auto bwdDesc = eltwise_bwd::desc( + algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta); + auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_); + CHECK(inVal_); + bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_)); + pipelineBwd_.clear(); + pipelineBwd_.push_back(*bwd_); + } + + Error __must_check forward(Argument& act) { + resetFwd(act); + stream_->submit(pipelineFwd_); + return Error(); + } + + Error __must_check backward(Argument& act) { + resetBwd(act); + stream_->submit(pipelineBwd_); + return Error(); + } +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index 2bc20eee6c452d0943dbf43b17ebe77976c97489..e95f42c863b3733ca66055e1b3939e734cae8ad1 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -14,26 +14,12 @@ limitations under the License. */ #include "paddle/utils/Util.h" +#include "CostLayer.h" +#include "ValidationLayer.h" #include "paddle/math/SparseMatrix.h" #include "paddle/utils/Error.h" #include "paddle/utils/Logging.h" -#include "AddtoLayer.h" -#include "CRFLayer.h" -#include "CosSimLayer.h" -#include "CostLayer.h" -#include "DataLayer.h" -#include "ExpandConvLayer.h" -#include "FullyConnectedLayer.h" -#include "HierarchicalSigmoidLayer.h" -#include "MaxLayer.h" -#include "MixedLayer.h" -#include "NormLayer.h" -#include "PoolLayer.h" -#include "TensorLayer.h" -#include "TransLayer.h" -#include "ValidationLayer.h" - DEFINE_bool(log_error_clipping, false, "enable log error clipping or not"); namespace paddle { @@ -109,6 +95,10 @@ ClassRegistrar Layer::registrar_; LayerPtr Layer::create(const LayerConfig& config) { std::string type = config.type(); + // NOTE: As following types have illegal character '-', + // they can not use REGISTER_LAYER to registrar. + // Besides, to fit with old training models, + // they can not use '_' instead. 
if (type == "multi-class-cross-entropy") return LayerPtr(new MultiClassCrossEntropy(config)); else if (type == "rank-cost") @@ -117,8 +107,6 @@ LayerPtr Layer::create(const LayerConfig& config) { return LayerPtr(new AucValidation(config)); else if (type == "pnpair-validation") return LayerPtr(new PnpairValidation(config)); - // NOTE: stop adding "if" statements here. - // Instead, use REGISTER_LAYER to add more layer types return LayerPtr(registrar_.createByType(config.type(), config)); } diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp index 9088744beebd25ac105737fe3b012de143c66a7c..2647cb600653b4f43322016afb231a55f4db5642 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp @@ -294,12 +294,9 @@ void MKLDNNConvLayer::resetOutValue( std::shared_ptr& pd, MKLDNNMatrixPtr& out) { out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc()); - // change original output value from cpu matrix to mkldnn matrix - output_.value = std::dynamic_pointer_cast(out); - // create reorder if output value has cpu device and pd do not match cpuOutVal_ = nullptr; - cpuOutVal_ = nullptr; + cvtOutVal_ = nullptr; if (!outputIsOnlyMKLDNN()) { const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value; memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index f60e221a6ec2ff513789a24e9f59bb25aef437b5..66b358bcea53f61ddcc15323704fa9f154fb2a73 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -172,12 +172,10 @@ void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt, void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) { out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_); - // change original output value to mkldnn output value - output_.value = std::dynamic_pointer_cast(out); if (!outputIsOnlyMKLDNN()) { // fc cpu output value do not need create convert // just share point - getOutput(CPU_DEVICE).value->setData(output_.value->getData()); + getOutput(CPU_DEVICE).value->setData(out->getData()); } } diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 169679c8297542cac4a43f5a8e1af311ad9282df..c4e4a6874e6fdb491c344c70dfea422dc0924cd9 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -119,6 +119,10 @@ public: inputElemenCnt_ = elemenCnt; reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_); resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_); + if (outVal_) { + // change original output value to mkldnn output value + output_.value = std::dynamic_pointer_cast(outVal_); + } convertWeightsFromPaddle(); needResetBwd_ = true; } diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp index 48b2f5a4cb37f6a9c4b1fdc6178c914b46c76e63..b62dfb7c54258a593aa50d5b30096423f375c69d 100644 --- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp +++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp @@ -134,7 +134,6 @@ void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) { memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; out = MKLDNNMatrix::create( output_.value, outDims, inVal_->getFormat(), engine_); - output_.value = std::dynamic_pointer_cast(out); // create reorder if output value has cpu device and pd do not match cpuOutVal_ = nullptr; diff --git a/paddle/gserver/layers/SequenceSliceLayer.cpp 
b/paddle/gserver/layers/SequenceSliceLayer.cpp index d3a83fad276a384ab3fddd5349912c56be6f3cc0..ce68ca449429711eeee692be750a4a2f1dac61a6 100644 --- a/paddle/gserver/layers/SequenceSliceLayer.cpp +++ b/paddle/gserver/layers/SequenceSliceLayer.cpp @@ -73,9 +73,10 @@ void SequenceSliceLayer::checkInputs() { CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer " << "must be a sequence."; const MatrixPtr indices1 = getInputValue(1); - CHECK_EQ(static_cast(indices1->getHeight()), - inputSeq.hasSubseq() ? inputSeq.getNumSubSequences() - : inputSeq.getNumSequences()) + CHECK_EQ( + indices1->getHeight(), + static_cast(inputSeq.hasSubseq() ? inputSeq.getNumSubSequences() + : inputSeq.getNumSequences())) << "Height of the second input should be equal to number of sequence " << "in the first input."; if (inputLayers_.size() == 3) { @@ -151,7 +152,7 @@ void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts, if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k); int seqLen = endPos - begPos + 1; - CHECK_GT(seqLen, 0U); + CHECK_GT(seqLen, 0); for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m); hasSubseq ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen) diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp index 2f48e5b2d3ffc9337ed1314f6db6549e56263fdd..f59618be9d09d146be52fb51cae84f4d24c15ef1 100644 --- a/paddle/gserver/tests/MKLDNNTester.cpp +++ b/paddle/gserver/tests/MKLDNNTester.cpp @@ -64,15 +64,17 @@ void MKLDNNTester::reset(const TestConfig& dnn, configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i])); } refLayer_ = testLayers_[REF]; - dnnLayer_ = std::dynamic_pointer_cast(testLayers_[DNN]); - CHECK(dnnLayer_); - // for comparison with Paddle reference results, - // need manually add cpu device output for test - dnnLayer_->addOutputArgument(CPU_DEVICE); + dnnLayer_ = testLayers_[DNN]; EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size()); EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size()); - setInputImgSize(); + + // for comparison with Paddle reference results, + // need manually add cpu device output for test + MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(dnnLayer_); + if (dnnLayer) { + dnnLayer->addOutputArgument(CPU_DEVICE); + } } void MKLDNNTester::setInputImgSize() { @@ -122,7 +124,7 @@ void MKLDNNTester::randomTopDiffs() { void MKLDNNTester::checkForward() { VLOG(MKLDNN_ALL) << "Check Forward"; printTopDatas(); - double delta = compareMatrix(dnnLayer_->getOutput(-1).value, + double delta = compareMatrix(dnnLayer_->getOutput(CPU_DEVICE).value, refLayer_->getOutputValue()); EXPECT_LE(fabs(delta), eps_); } @@ -155,7 +157,10 @@ void MKLDNNTester::checkBackwardWgts() { vector dnnWgts; // used to temply save mkldnn weights saveWgt(parameters_[DNN], dnnWgts); - dnnLayer_->convertWeightsToPaddle(); + MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(dnnLayer_); + if (dnnLayer) { + dnnLayer->convertWeightsToPaddle(); + } for (size_t i = 0; i < parameters_[DNN].size(); ++i) { const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE); const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE); @@ -322,6 +327,10 @@ void MKLDNNTester::runOnce() { // and clearTopDatas(REF) should be coverd by ref layers clearBotDiffs(REF); clearWgtDiffs(REF); + // it is necessary to clear bottom diffs when only activation is dnn type + if (configs_[DNN].layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) { + clearBotDiffs(DNN); + } } void 
MKLDNNTester::run(const TestConfig& dnn, @@ -333,8 +342,19 @@ void MKLDNNTester::run(const TestConfig& dnn, float epsilon, bool log, int level) { - VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: " << dnn.layerConfig.type() - << " vs " << ref.layerConfig.type(); + CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 || + dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) + << "should be MKLDNN layer or MKLDNN activation"; + if (dnn.layerConfig.type() == ref.layerConfig.type()) { + VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: " + << dnn.layerConfig.active_type() << " vs " + << ref.layerConfig.active_type(); + } else { + VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: " + << dnn.layerConfig.type() << " vs " + << ref.layerConfig.type(); + } + ih_ = inputImgH; iw_ = inputImgW; iter_ = iter; diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h index 5ac885638cde7693a0c847733e7a6149c1b7e6c2..171d176ee757f1164c38d86273bdf9e5aefeda06 100644 --- a/paddle/gserver/tests/MKLDNNTester.h +++ b/paddle/gserver/tests/MKLDNNTester.h @@ -41,8 +41,7 @@ protected: vector layerMaps_; vector> parameters_; vector testLayers_; - LayerPtr refLayer_; - MKLDNNLayerPtr dnnLayer_; + LayerPtr refLayer_, dnnLayer_; /// run some iterations, all the result should pass size_t iter_; diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp index b593f65fe49ef2271ad7cd0f609c9b828be03037..406181370faf90d29167b62173ce4c8af44d243e 100644 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ b/paddle/gserver/tests/test_MKLDNN.cpp @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "MKLDNNTester.h" #include "ModelConfig.pb.h" +#include "paddle/gserver/activations/MKLDNNActivation.h" #include "paddle/math/MathUtils.h" using namespace paddle; // NOLINT @@ -162,7 +163,6 @@ void testPoolLayer(const testPoolDesc& pm) { 0}); LayerInputConfig* input = cfg.layerConfig.add_inputs(); PoolConfig* pool = input->mutable_pool_conf(); - // pool->set_pool_type(poolType); pool->set_channels(pm.ch); pool->set_img_size(pm.iw); pool->set_img_size_y(pm.ih); @@ -191,7 +191,7 @@ void testPoolLayer(const testPoolDesc& pm) { } } -TEST(MkldnnLayer, PoolLayer) { +TEST(MKLDNNLayer, PoolLayer) { /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw*/ testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2}); testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2}); @@ -203,6 +203,49 @@ TEST(MkldnnLayer, PoolLayer) { testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2}); } +struct testActDesc { + int bs, ch; + int ih, iw; +}; + +static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) { + cfg.biasSize = 0; + cfg.layerConfig.set_type("addto"); + cfg.layerConfig.set_size(pm.ch * pm.ih * pm.iw); + cfg.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + /* size of input layer= */ size_t(pm.ch * pm.ih * pm.iw), + 0}); + cfg.layerConfig.add_inputs(); +} + +void testActivation(std::string& type, const testActDesc& pm) { + const std::string compareTypes[] = {type, type.erase(0, 7)}; + TestConfig cfg; + getAddtoConfig(cfg, pm); + + TestConfig ref = cfg; + cfg.layerConfig.set_active_type(compareTypes[0]); + ref.layerConfig.set_active_type(compareTypes[1]); + MKLDNNTester tester; + for (auto bs : {pm.bs, 1}) { + tester.run(cfg, ref, bs, pm.ih, pm.iw); + } +} + +TEST(MKLDNNActivation, Activations) { + auto types = MKLDNNActivation::getAllRegisteredTypes(); + // TODO(TJ): mkldnn_softmax not implemented, paddle do not have elu activation + std::set excluded{"mkldnn_softmax", 
"mkldnn_elu"}; + for (auto type : types) { + if (excluded.count(type)) { + continue; + } + testActivation(type, {16, 64, 32, 32}); + } +} + // TODO(TJ): add branch test int main(int argc, char** argv) { diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index e8ea6e37ac527a19c529d1731b94bed970211755..8193aa4adffc0409d8ea68417c68fa153a2942d8 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -26,7 +26,7 @@ limitations under the License. */ #include #endif -#ifdef PADDLE_USE_ATLAS +#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB) extern "C" { #include #include diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5de8f1489d31cd94dfdea09ce4ba3d27cff62f82 --- /dev/null +++ b/paddle/operators/cross_entropy_op.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/cross_entropy_op.h" + +namespace paddle { +namespace operators { + +using framework::LoDTensor; + +class CrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), + "Input(Label) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) must not be null."); + + auto x = ctx.Input("X"); + auto label = ctx.Input("Label"); + PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2."); + PADDLE_ENFORCE_EQ(label->dims().size(), 2, + "Input(Label)'s rank must be 2."); + // TODO(xinghai-sun): remove this check after swtiching to bool + PADDLE_ENFORCE(ctx.Attr("soft_label") == 0 || + ctx.Attr("soft_label") == 1); + PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0], + "The 1st dimension of Input(X) and Input(Label) must " + "be equal."); + if (ctx.Attr("soft_label") == 1) { + PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1], + "If Attr(soft_label) == 1, The 2nd dimension of " + "Input(X) and Input(Label) must be equal."); + } else { + PADDLE_ENFORCE_EQ(label->dims()[1], 1, + "If Attr(soft_label) == 0, The 2nd dimension of " + "Input(Label) must be 1."); + } + + ctx.Output("Y")->Resize({x->dims()[0], 1}); + ctx.ShareLoD("X", "Y"); + } +}; + +class CrossEntropyGradientOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"), + "Input(Label) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")), + "Input(Y@GRAD) must not be null."); + + auto x = ctx.Input("X"); + auto label = ctx.Input("Label"); + 
auto dy = ctx.Input(framework::GradVarName("Y")); + PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2."); + PADDLE_ENFORCE_EQ(dy->dims().size(), 2, "Input(Y@Grad)'s rank must be 2."); + PADDLE_ENFORCE_EQ(label->dims().size(), 2, + "Input(Label)'s rank must be 2."); + // TODO(xinghai-sun): remove this check after swtiching to bool + PADDLE_ENFORCE(ctx.Attr("soft_label") == 0 || + ctx.Attr("soft_label") == 1); + PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0], + "The 1st dimension of Input(X) and Input(Label) must " + "be equal."); + PADDLE_ENFORCE_EQ(x->dims()[0], dy->dims()[0], + "The 1st dimension of Input(X) and Input(Y@Grad) must " + "be equal."); + PADDLE_ENFORCE_EQ(dy->dims()[1], 1, + "The 2nd dimension of Input(Y@Grad) must be 1."); + if (ctx.Attr("soft_label") == 1) { + PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1], + "If Attr(soft_label) == 1, The 2nd dimension of " + "Input(X) and Input(Label) must be equal."); + } else { + PADDLE_ENFORCE_EQ(label->dims()[1], 1, + "If Attr(soft_label) == 0, The 2nd dimension of " + "Input(Label) must be 1."); + } + + auto dx = ctx.Output(framework::GradVarName("X")); + dx->Resize(x->dims()); + } +}; + +class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CrossEntropyOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The first input of CrossEntropyOp"); + AddInput("Label", "The second input of CrossEntropyOp"); + AddOutput("Y", "The output of CrossEntropyOp"); + AddAttr("soft_label", "Is soft label. Default zero.").SetDefault(0); + + AddComment(R"DOC( +CrossEntropy Operator. + +It supports both standard cross-entropy and soft-label cross-entropy loss +computation. +1) One-hot cross-entropy: + soft_label = 0, Label[i, 0] indicates the class index for sample i: + + Y[i] = -log(X[i, Label[i]]) + +2) Soft-label cross-entropy: + soft_label = 1, Label[i, j] indicates the soft label of class j + for sample i: + + Y[i] = \sum_j{-Label[i, j] * log(X[i, j])} + + Please make sure that in this case the summuation of each row of Label + equals one. + +3) One-hot cross-entropy with vecterized Input(Label): + As a special case of 2), when each row of Input(Label) has only one + non-zero element (equals 1), soft-label cross-entropy degenerates to a + one-hot cross-entropy with one-hot label representation. + +Both the input `X` and `Label` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input `X`. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker, + cross_entropy_grad, ops::CrossEntropyGradientOp); +REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel); +REGISTER_OP_CPU_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpKernel); diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ab6ad0e062269483948bf70e492c9431991221fb --- /dev/null +++ b/paddle/operators/cross_entropy_op.cu @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
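
For readers of the CrossEntropyOpMaker comment above, the following is a minimal standalone C++ sketch (illustrative values only, not part of the patch) of the two loss forms it documents: the one-hot case Y[i] = -log(X[i, Label[i]]) and the soft-label case Y[i] = -sum_j Label[i, j] * log(X[i, j]).

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int N = 2, D = 3;  // batch size and class number, i.e. Input(X) has shape [N, D]
  // Per-sample class probabilities; every entry is positive and each row sums to 1.
  std::vector<float> X = {0.2f, 0.7f, 0.1f,
                          0.5f, 0.3f, 0.2f};

  // 1) One-hot cross-entropy (soft_label == 0): Label is [N, 1] class indices.
  std::vector<int> hard_label = {1, 0};
  for (int i = 0; i < N; ++i) {
    float y = -std::log(X[i * D + hard_label[i]]);
    std::printf("one-hot Y[%d] = %f\n", i, y);  // -log(0.7) ~ 0.357, -log(0.5) ~ 0.693
  }

  // 2) Soft-label cross-entropy (soft_label == 1): Label is [N, D] and each row sums to 1.
  std::vector<float> soft_label = {0.0f, 0.8f, 0.2f,
                                   1.0f, 0.0f, 0.0f};
  for (int i = 0; i < N; ++i) {
    float y = 0.f;
    for (int j = 0; j < D; ++j) {
      y -= soft_label[i * D + j] * std::log(X[i * D + j]);
    }
    std::printf("soft    Y[%d] = %f\n", i, y);
  }
  return 0;
}
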
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/cross_entropy_op.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +template +__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, + const int N, const int D) { + // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file. + // CUDA_1D_KERNEL_LOOP(i, N) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + PADDLE_ASSERT(label[i] >= 0 && label[i] < D); + Y[i] = -tolerable_value(log(X[i * D + label[i]])); + } +} + +template +__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, + const int N, const int D) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + T sum = static_cast(0); + for (int j = 0; j < D; j++) { + sum += label[i * D + j] * tolerable_value(log(X[i * D + j])); + } + Y[i] = -sum; + } +} + +// TODO(qingqing): make zero setting an common function. +template +__global__ void zero(T* X, const int N) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + X[i] = 0.0; + } +} + +template +__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X, + const int* label, const int N, + const int D) { + // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file. + // CUDA_1D_KERNEL_LOOP(i, N) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + int idx = i * D + label[i]; + dX[idx] = -dY[i] / X[idx]; + } +} + +template +__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X, + const T* label, const int N, + const int D) { + // TOOD(qingqing): optimize for this kernel + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { + for (int j = 0; j < D; ++j) { + int idx = i * D + j; + dX[idx] = -label[idx] * dY[i] / X[idx]; + } + } +} + +template +class CrossEntropyOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use GPUPlace."); + + auto x = ctx.Input("X"); + auto y = ctx.Output("Y"); + auto label = ctx.Input("Label"); + + auto* x_data = x->data(); + y->mutable_data(ctx.GetPlace()); + auto* y_data = y->data(); + + int n = x->dims()[0]; + int d = x->dims()[1]; + int block = 512; + int grid = (n + block - 1) / block; + // TODO(qingqing) launch kernel on specified stream + // base on ExecutionContext. 
+ if (ctx.Attr("soft_label") == 1) { + auto* label_data = ctx.Input("Label")->data(); + SoftCrossEntropyKernel<<>>(y_data, x_data, label_data, n, + d); + } else { + auto* label_data = ctx.Input("Label")->data(); + CrossEntropyKernel<<>>(y_data, x_data, label_data, n, d); + } + } +}; + +template +class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use GPUPlace."); + + auto x = ctx.Input("X"); + auto dx = ctx.Output(framework::GradVarName("X")); + auto dy = ctx.Input(framework::GradVarName("Y")); + auto label = ctx.Input("Label"); + + auto* dx_data = dx->mutable_data(ctx.GetPlace()); + auto* dy_data = dy->data(); + auto* x_data = x->data(); + + int n = x->dims()[0]; + int d = x->dims()[1]; + int block = 512; + int grid = (n * d + block - 1) / block; + zero<<>>(dx_data, n * d); + grid = (n + block - 1) / block; + // TODO(qingqing): launch kernel on specified stream + // base on ExecutionContext. + if (ctx.Attr("soft_label") == 1) { + auto* label_data = label->data(); + SoftCrossEntropyGradientKernel<<>>( + dx_data, dy_data, x_data, label_data, n, d); + } else { + auto* label_data = label->data(); + CrossEntropyGradientKernel<<>>(dx_data, dy_data, x_data, + label_data, n, d); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel); +REGISTER_OP_GPU_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpCUDAKernel); diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1b4b23ac2029138afadef0168262203ac2e20430 --- /dev/null +++ b/paddle/operators/cross_entropy_op.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +HOSTDEVICE T tolerable_value(const T x) { + PADDLE_ASSERT(std::is_floating_point::value); + const T kApproInf = 1e20; + if (x == INFINITY) { + return kApproInf; + } + if (x == -INFINITY) { + return -kApproInf; + } + return x; +} + +template +class CrossEntropyOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto x = ctx.Input("X"); + auto y = ctx.Output("Y"); + + auto* x_data = x->data(); + y->mutable_data(ctx.GetPlace()); + auto* y_data = y->data(); + + int batch_size = x->dims()[0]; + int class_num = x->dims()[1]; + + if (ctx.Attr("soft_label") == 1) { + auto* label_data = ctx.Input("Label")->data(); + int index = 0; + for (int i = 0; i < batch_size; ++i) { + T sum = static_cast(0); + for (int j = 0; j < class_num; ++j) { + sum += label_data[index] * tolerable_value(std::log(x_data[index])); + y_data[i] = -sum; + index++; + } + } + } else { + auto* label_data = ctx.Input("Label")->data(); + for (int i = 0; i < batch_size; ++i) { + int index = i * class_num + label_data[i]; + y_data[i] = -tolerable_value(std::log(x_data[index])); + } + } + } +}; + +template +class CrossEntropyGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto x = ctx.Input("X"); + auto dx = ctx.Output(framework::GradVarName("X")); + auto dy = ctx.Input(framework::GradVarName("Y")); + auto label = ctx.Input("Label"); + + auto* dx_data = dx->mutable_data(ctx.GetPlace()); + auto* dy_data = dy->data(); + auto* x_data = x->data(); + + int batch_size = x->dims()[0]; + int class_num = x->dims()[1]; + + // TODO(qingqing): make zero setting an common function. + if (ctx.Attr("soft_label") == 1) { + auto* label_data = ctx.Input("Label")->data(); + int index = 0; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < class_num; ++j) { + dx_data[index] = -label_data[index] * dy_data[i] / x_data[index]; + index++; + } + } + } else { + auto* label_data = label->data(); + memset(dx_data, 0, sizeof(T) * batch_size * class_num); + for (int i = 0; i < batch_size; ++i) { + PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); + int index = i * class_num + label_data[i]; + dx_data[index] = -dy_data[i] / x_data[index]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b111b9fccb2310bd5fb92bda878a497c51f62ce0 --- /dev/null +++ b/paddle/operators/dropout_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
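
As a quick illustration of the CPU-side logic added in cross_entropy_op.h (a sketch with made-up numbers that mirrors, but does not reuse, the kernels above): tolerable_value only replaces the +/-INFINITY produced by log(0) with +/-1e20, and the hard-label backward pass writes -dY[i]/X[i, Label[i]] at the labelled column and zero everywhere else.

#include <cmath>
#include <cstdio>
#include <cstring>

// Same clipping rule as tolerable_value above: map +/-INF to +/-1e20, pass everything else through.
static float clip_inf(float x) {
  const float kApproInf = 1e20f;
  if (x == INFINITY) return kApproInf;
  if (x == -INFINITY) return -kApproInf;
  return x;
}

int main() {
  // log(0) would be -INF; the clipped loss stays finite.
  std::printf("clipped -log(0) = %g\n", -clip_inf(std::log(0.f)));  // 1e+20

  // Hard-label backward pass for one sample (N = 1, D = 3):
  // dX[0, j] = -dY[0] / X[0, j] if j == Label[0], otherwise 0.
  const int D = 3;
  float x[D] = {0.2f, 0.7f, 0.1f};
  int label = 1;
  float dy = 1.0f;             // upstream gradient of Y[0]
  float dx[D];
  std::memset(dx, 0, sizeof(dx));
  dx[label] = -dy / x[label];  // -1 / 0.7 ~ -1.428571
  for (int j = 0; j < D; ++j) std::printf("dX[0,%d] = %f\n", j, dx[j]);
  return 0;
}
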
+ See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/dropout_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using framework::LoDTensor; + +class DropoutOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); + PADDLE_ENFORCE_GE(ctx.Attr("dropout_prob"), 0); + PADDLE_ENFORCE_LE(ctx.Attr("dropout_prob"), 1); + // TODO(xinghai-sun): remove this check after swtiching to bool + PADDLE_ENFORCE(ctx.Attr("is_training") == 0 || + ctx.Attr("is_training") == 1); + + auto dims = ctx.Input("X")->dims(); + ctx.Output("Out")->Resize(dims); + if (ctx.Attr("is_training") == 1) { + ctx.Output("Mask")->Resize(dims); + } + } +}; + +template +class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { + public: + DropoutOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("dropout_prob", "Probability of setting units to zero.") + .SetDefault(.5f); + // TODO(xinghai-sun): use bool for is_training after bool is supported. + AddAttr("is_training", "Whether in training phase.").SetDefault(1); + AddAttr("seed", "Dropout random seed.").SetDefault(0); + AddInput("X", "The input of dropout op."); + AddOutput("Out", "The output of dropout op."); + AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate(); + + AddComment(R"DOC( +Dropout Operator. + +"Dropout" refers to randomly dropping out units in a nerual network. It is a +regularization technique for reducing overfitting by preventing neuron +co-adaption during training. The dropout operator randomly set (according to +the given dropout probability) the outputs of some units to zero, while others +being set to their inputs. 
+)DOC"); + } +}; + +template +class DropoutOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_EQ(ctx.Attr("is_training"), 1, + "GradOp is only callable when is_training is true"); + + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Mask"), "Mask must not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), + "Input(Out@GRAD) must not be null."); + + PADDLE_ENFORCE_GE(ctx.Attr("dropout_prob"), 0); + PADDLE_ENFORCE_LE(ctx.Attr("dropout_prob"), 1); + // TODO(xinghai-sun): remove this check after swtiching to bool + PADDLE_ENFORCE(ctx.Attr("is_training") == 0 || + ctx.Attr("is_training") == 1); + auto x_dims = ctx.Input("X")->dims(); + auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); + PADDLE_ENFORCE_EQ(x_dims, out_dims, + "Dimensions of Input(X) and Out@Grad must be the same."); + auto mask_dims = ctx.Input("Mask")->dims(); + PADDLE_ENFORCE_EQ(x_dims, mask_dims, + "Dimensions of Input(X) and Mask must be the same."); + + auto *x_grad = ctx.Output(framework::GradVarName("X")); + x_grad->Resize(x_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad, + ops::DropoutOpGrad); +REGISTER_OP_CPU_KERNEL( + dropout, ops::CPUDropoutKernel); +REGISTER_OP_CPU_KERNEL( + dropout_grad, ops::DropoutGradKernel); diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..186237fb238add37f32403309a0f7e8a9846d335 --- /dev/null +++ b/paddle/operators/dropout_op.cu @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include +#include +#include +#include +#include "paddle/operators/dropout_op.h" + +namespace paddle { +namespace operators { + +template +struct MaskGenerator { + AttrType dropout_prob; + int seed; + + __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed) + : dropout_prob(dropout_prob), seed(seed) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed); + thrust::uniform_real_distribution dist(0, 1); + rng.discard(n); + if (dist(rng) < dropout_prob) { + return static_cast(0); + } else { + return static_cast(1); + } + } +}; + +// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. +// Use std::random and thrust::random(thrust is a std library in CUDA) to +// implement uniform random. 
+template +class GPUDropoutKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Output("Out"); + y->mutable_data(context.GetPlace()); + AttrType dropout_prob = context.Attr("dropout_prob"); + + auto X = EigenMatrix::Reshape(*x, 1); + auto Y = EigenMatrix::Reshape(*y, 1); + + auto place = context.GetEigenDevice(); + if (context.Attr("is_training") == 1) { + auto* mask = context.Output("Mask"); + auto* mask_data = mask->mutable_data(context.GetPlace()); + int size = framework::product(mask->dims()); + int seed = context.Attr("seed"); + thrust::counting_iterator index_sequence_begin(0); + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(mask_data), + MaskGenerator(dropout_prob, seed)); + auto M = EigenMatrix::Reshape(*mask, 1); + Y.device(place) = X * M; + } else { + Y.device(place) = X * dropout_prob; + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL( + dropout, ops::GPUDropoutKernel); +REGISTER_OP_GPU_KERNEL( + dropout_grad, ops::DropoutGradKernel); diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h new file mode 100644 index 0000000000000000000000000000000000000000..82eafee0e0e7db7b4b4ae5405f37146d061aefd5 --- /dev/null +++ b/paddle/operators/dropout_op.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +template +class CPUDropoutKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Output("Out"); + const auto* x_data = x->data(); + auto* y_data = y->mutable_data(context.GetPlace()); + AttrType dropout_prob = context.Attr("dropout_prob"); + + if (context.Attr("is_training") == 1) { + auto* mask = context.Output("Mask"); + auto* mask_data = mask->mutable_data(context.GetPlace()); + int seed = context.Attr("seed"); + std::minstd_rand engine; + engine.seed(seed); + std::uniform_real_distribution dist(0, 1); + size_t size = framework::product(mask->dims()); + for (size_t i = 0; i < size; ++i) { + if (dist(engine) < dropout_prob) { + mask_data[i] = 0; + y_data[i] = 0; + } else { + mask_data[i] = 1; + y_data[i] = x_data[i]; + } + } + } else { + auto X = EigenMatrix::Reshape(*x, 1); + auto Y = EigenMatrix::Reshape(*y, 1); + auto place = context.GetEigenDevice(); + Y.device(place) = X * dropout_prob; + } + } +}; + +template +class DropoutGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ(context.Attr("is_training"), 1, + "GradOp is only callable when is_training is true"); + + auto* grad_x = context.Output(framework::GradVarName("X")); + auto* grad_y = context.Input(framework::GradVarName("Out")); + auto* mask = context.Input("Mask"); + grad_x->mutable_data(context.GetPlace()); + + auto M = EigenMatrix::Reshape(*mask, 1); + auto dX = EigenMatrix::Reshape(*grad_x, 1); + auto dY = EigenMatrix::Reshape(*grad_y, 1); + + auto place = context.GetEigenDevice(); + dX.device(place) = dY * M; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/onehot_cross_entropy_op.cc b/paddle/operators/onehot_cross_entropy_op.cc deleted file mode 100644 index 1d87032d275dbef8c9cf6569c897dbbfce46bd16..0000000000000000000000000000000000000000 --- a/paddle/operators/onehot_cross_entropy_op.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/operators/onehot_cross_entropy_op.h" - -namespace paddle { -namespace operators { - -class OnehotCrossEntropyOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL( - ctx.InputVar("X"), - "Input(X) of OnehotCrossEntropyOp should not be null."); - PADDLE_ENFORCE_NOT_NULL( - ctx.InputVar("label"), - "Input(label) of OnehotCrossEntropyOp should not be null."); - PADDLE_ENFORCE_NOT_NULL( - ctx.OutputVar("Y"), - "Output(Y) of OnehotCrossEntropyOp should not be null."); - - auto *X = ctx.Input("X"); - auto *label = ctx.Input("label"); - - PADDLE_ENFORCE_EQ(X->dims().size(), 2, "X's dimension must be 2."); - PADDLE_ENFORCE_EQ(label->dims().size(), 1, "label's dimension must be 1."); - PADDLE_ENFORCE_EQ(X->dims()[0], label->dims()[0]); - ctx.Output("Y")->Resize({X->dims()[0], 1}); - ctx.ShareLoD("X", "Y"); - } -}; - -class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(const framework::InferShapeContext &ctx) const override { - auto dX = ctx.Output(framework::GradVarName("X")); - auto X = ctx.Input("X"); - - dX->Resize(X->dims()); - } -}; - -class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { - public: - OnehotCrossEntropyOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The first input of OnehotCrossEntropyOp"); - AddInput("label", "The second input of OnehotCrossEntropyOp"); - AddOutput("Y", "The output of OnehotCrossEntropyOp"); - AddComment(R"DOC( -OnehotCrossEntropy Operator. - - Y[i] = -log(X[i][j]) - -Both the input `X` and `Label` can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD with input `X`. -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp, - ops::OnehotCrossEntropyOpMaker, onehot_cross_entropy_grad, - ops::OnehotCrossEntropyGradientOp); -REGISTER_OP_CPU_KERNEL(onehot_cross_entropy, - ops::OnehotCrossEntropyOpKernel); -REGISTER_OP_CPU_KERNEL(onehot_cross_entropy_grad, - ops::OnehotCrossEntropyGradientOpKernel); diff --git a/paddle/operators/onehot_cross_entropy_op.cu b/paddle/operators/onehot_cross_entropy_op.cu deleted file mode 100644 index d999bfce58c8a6db5c811aad677c07094b881841..0000000000000000000000000000000000000000 --- a/paddle/operators/onehot_cross_entropy_op.cu +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include "paddle/framework/op_registry.h" -#include "paddle/platform/assert.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__host__ __device__ T clipping_log(const T x) { - PADDLE_ASSERT(std::is_floating_point::value); - const T kApproInf = 1e20; - T v = log(x); - if (v == INFINITY) { - return kApproInf; - } - if (v == -INFINITY) { - return -kApproInf; - } - return v; -} - -template -__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label, - const int N, const int D) { - // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file. - // CUDA_1D_KERNEL_LOOP(i, N) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; - i += blockDim.x * gridDim.x) { - PADDLE_ASSERT(label[i] >= 0 && label[i] < D); - Y[i] = -clipping_log(X[i * D + label[i]]); - } -} - -// TODO(qingqing): make zero setting an common function. -template -__global__ void zero(T* X, const int N) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; - i += blockDim.x * gridDim.x) { - X[i] = 0.0; - } -} - -template -__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X, - const int* label, const int N, - const int D) { - // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file. - // CUDA_1D_KERNEL_LOOP(i, N) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; - i += blockDim.x * gridDim.x) { - int idx = i * D + label[i]; - dX[idx] = -dY[i] / X[idx]; - } -} - -template -class OnehotCrossEntropyOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); - - auto X = ctx.Input("X"); - const T* Xdata = X->data(); - const int* label_data = ctx.Input("label")->data(); - auto Y = ctx.Output("Y"); - Y->mutable_data(ctx.GetPlace()); - T* Ydata = Y->data(); - - int N = X->dims()[0]; - int D = X->dims()[1]; - int block = 512; - int grid = (N + block - 1) / block; - // TODO(qingqing) launch kernel on specified stream - // base on ExecutionContext. - CrossEntropyKernel<<>>(Ydata, Xdata, label_data, N, D); - } -}; - -template -class OnehotCrossEntropyGradientOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); - - auto X = ctx.Input("X"); - auto dX = ctx.Output(framework::GradVarName("X")); - auto dY = ctx.Input(framework::GradVarName("Y")); - auto label = ctx.Input("label"); - - auto* dXdata = dX->template mutable_data(ctx.GetPlace()); - auto* dYdata = dY->template data(); - auto* Xdata = X->template data(); - auto* label_data = label->data(); - - int N = X->dims()[0]; - int D = X->dims()[1]; - int block = 512; - int grid = (N * D + block - 1) / block; - zero<<>>(dXdata, N * D); - - grid = (N + block - 1) / block; - // TODO(qingqing): launch kernel on specified stream - // base on ExecutionContext. 
- CrossEntropyGradientKernel<<>>(dXdata, dYdata, Xdata, - label_data, N, D); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(onehot_cross_entropy, - ops::OnehotCrossEntropyOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(onehot_cross_entropy_grad, - ops::OnehotCrossEntropyGradientOpCUDAKernel); diff --git a/paddle/operators/onehot_cross_entropy_op.h b/paddle/operators/onehot_cross_entropy_op.h deleted file mode 100644 index eb4d1348de1d940e2648c83c8ba94b289f10c5b2..0000000000000000000000000000000000000000 --- a/paddle/operators/onehot_cross_entropy_op.h +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -inline T tolerable_value(const T x) { - static_assert(std::is_floating_point::value, - "tolerable_value works only on float, " - "double and double double."); - - const T kApproInf = 1e20; - - if (x == INFINITY) { - return kApproInf; - } - - if (x == -INFINITY) { - return -kApproInf; - } - - return x; -} - -template -class OnehotCrossEntropyOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); - - auto X = ctx.Input("X"); - const T* Xdata = X->data(); - const int* label_data = ctx.Input("label")->data(); - auto Y = ctx.Output("Y"); - - Y->mutable_data(ctx.GetPlace()); - - T* Ydata = Y->data(); - - int batch_size = X->dims()[0]; - int class_num = X->dims()[1]; - - for (int i = 0; i < batch_size; ++i) { - int index = i * class_num + label_data[i]; - Ydata[i] = -tolerable_value(std::log(Xdata[index])); - } - } -}; - -template -class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); - - auto X = ctx.Input("X"); - auto dX = ctx.Output(framework::GradVarName("X")); - auto dY = ctx.Input(framework::GradVarName("Y")); - auto label = ctx.Input("label"); - - auto* dXdata = dX->template mutable_data(ctx.GetPlace()); - auto* dYdata = dY->template data(); - auto* Xdata = X->template data(); - auto* label_data = label->data(); - - const int batch_size = X->dims()[0]; - const int class_num = X->dims()[1]; - - // TODO(qingqing): make zero setting an common function. 
- memset(dXdata, 0, sizeof(T) * batch_size * class_num); - for (int i = 0; i < batch_size; ++i) { - int index = i * class_num + label_data[i]; - dXdata[index] = -tolerable_value(dYdata[i] / Xdata[index]); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/prelu_op.h b/paddle/operators/prelu_op.h index ece2a836a65e6508580bc32b84f7833388ce55f3..3269116c112f115e1e8fbbee0dc3b81dbe736e69 100644 --- a/paddle/operators/prelu_op.h +++ b/paddle/operators/prelu_op.h @@ -54,8 +54,9 @@ class PReluKernel : public framework::OpKernel { int numel = x->numel(); - auto place = context.GetPlace(); - Transform(place, x_ptr, x_ptr + numel, o_ptr, PReluFunctor(alpha_ptr)); + Transform trans; + trans(context.device_context(), x_ptr, x_ptr + numel, o_ptr, + PReluFunctor(alpha_ptr)); } }; @@ -91,9 +92,9 @@ class PReluGradKernel : public framework::OpKernel { const T* out_ptr = out->data(); int numel = dx->numel(); - auto place = context.GetPlace(); - Transform(place, out_ptr, out_ptr + numel, dout_ptr, dx_ptr, - PReluGradFunctor(alpha_ptr)); + Transform trans; + trans(context.device_context(), out_ptr, out_ptr + numel, dout_ptr, dx_ptr, + PReluGradFunctor(alpha_ptr)); // TODO (Zhuoyuan): add dalpha upgrade when GPU kernels ready } diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h index 8eaab047fd4daa386f5ebdbb99a4caeed5fe2fbf..f196868c725cbb91b3df710260c5b60f14d53f37 100644 --- a/paddle/platform/transform.h +++ b/paddle/platform/transform.h @@ -29,45 +29,71 @@ namespace paddle { namespace platform { + // Transform on host or device. It provides the same API in std library. -template -void Transform(const DeviceContext& context, InputIter first, InputIter last, - OutputIter result, UnaryOperation op) { - auto place = context.GetPlace(); - if (is_cpu_place(place)) { +template +struct Transform { + template + void operator()(const DeviceContext& context, InputIter first, InputIter last, + OutputIter result, UnaryOperation op); + + template + void operator()(const DeviceContext& context, InputIter1 first1, + InputIter1 last1, InputIter2 first2, OutputIter result, + BinaryOperation op); +}; + +template <> +struct Transform { + template + void operator()(const DeviceContext& context, InputIter first, InputIter last, + OutputIter result, UnaryOperation op) { + auto place = context.GetPlace(); + PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place."); std::transform(first, last, result, op); - } else { -#ifdef __NVCC__ - auto& ctx = reinterpret_cast(context); - using namespace details; - thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first), - DevPtrCast(last), DevPtrCast(result), op); -#else - PADDLE_THROW("Do not invoke `Transform` in .cc file"); -#endif } -} -template -void Transform(const DeviceContext& context, InputIter1 first1, - InputIter1 last1, InputIter2 first2, OutputIter result, - BinaryOperation op) { - auto place = context.GetPlace(); - if (is_cpu_place(place)) { + template + void operator()(const DeviceContext& context, InputIter1 first1, + InputIter1 last1, InputIter2 first2, OutputIter result, + BinaryOperation op) { + auto place = context.GetPlace(); + PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place."); std::transform(first1, last1, first2, result, op); - } else { + } +}; + #ifdef __NVCC__ +template <> +struct Transform { + template + void operator()(const DeviceContext& context, InputIter first, InputIter last, + OutputIter result, UnaryOperation op) { + auto place = context.GetPlace(); + 
PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); auto& ctx = reinterpret_cast(context); - using namespace details; - thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first1), - DevPtrCast(last1), DevPtrCast(first2), DevPtrCast(result), + thrust::transform(thrust::cuda::par.on(ctx.stream()), + details::DevPtrCast(first), details::DevPtrCast(last), + details::DevPtrCast(result), op); + } + + template + void operator()(const DeviceContext& context, InputIter1 first1, + InputIter1 last1, InputIter2 first2, OutputIter result, + BinaryOperation op) { + auto place = context.GetPlace(); + PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); + auto& ctx = reinterpret_cast(context); + thrust::transform(thrust::cuda::par.on(ctx.stream()), + details::DevPtrCast(first1), details::DevPtrCast(last1), + details::DevPtrCast(first2), details::DevPtrCast(result), op); -#else - PADDLE_THROW("Do not invoke `Transform` in .cc file"); -#endif } }; +#endif } // namespace platform } // namespace paddle diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu index b8a6200bb03c9a40b67be8d113012856e2a407e9..c76cab80e4b0e8df98a7be15f86699cfb6f93af2 100644 --- a/paddle/platform/transform_test.cu +++ b/paddle/platform/transform_test.cu @@ -15,6 +15,7 @@ #include #include "paddle/memory/memcpy.h" #include "paddle/memory/memory.h" +#include "paddle/platform/hostdevice.h" #include "paddle/platform/transform.h" template @@ -38,7 +39,8 @@ TEST(Transform, CPUUnary) { using namespace paddle::platform; CPUDeviceContext ctx; float buf[4] = {0.1, 0.2, 0.3, 0.4}; - Transform(ctx, buf, buf + 4, buf, Scale(10)); + Transform trans; + trans(ctx, buf, buf + 4, buf, Scale(10)); for (int i = 0; i < 4; ++i) { ASSERT_NEAR(buf[i], static_cast(i + 1), 1e-5); } @@ -52,7 +54,8 @@ TEST(Transform, GPUUnary) { float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf)); - Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); + Transform trans; + trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); ctx.Wait(); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf)); Free(gpu0, gpu_buf); @@ -65,7 +68,9 @@ TEST(Transform, CPUBinary) { using namespace paddle::platform; using namespace paddle::memory; int buf[4] = {1, 2, 3, 4}; - Transform(CPUDeviceContext(), buf, buf + 4, buf, buf, Multiply()); + Transform trans; + CPUDeviceContext ctx; + trans(ctx, buf, buf + 4, buf, buf, Multiply()); for (int i = 0; i < 4; ++i) { ASSERT_EQ((i + 1) * (i + 1), buf[i]); } @@ -79,11 +84,12 @@ TEST(Transform, GPUBinary) { CUDADeviceContext ctx(gpu0); int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf)); - Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); + Transform trans; + trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); ctx.Wait(); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf)); Free(gpu0, gpu_buf); for (int i = 0; i < 4; ++i) { ASSERT_EQ((i + 1) * (i + 1), buf[i]); } -} \ No newline at end of file +} diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt index 2245c7d88ca74922f9919db91977dfa6cb3ca468..ccfc0e76020c7b4f54a493cc4048e7571379ec1a 100644 --- a/paddle/pserver/CMakeLists.txt +++ b/paddle/pserver/CMakeLists.txt @@ -45,14 +45,18 @@ add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies}) set(PSERVER_MAIN_SOURCES ParameterServer2Main.cpp) 
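
The transform.h change above turns the free Transform function into a place-templated functor, and the updated transform_test.cu and prelu_op.h call it as a two-step "construct, then invoke" pattern. The sketch below only imitates that calling convention with a local stand-in so it stays self-contained; the Transform<platform::CPUPlace> spelling mentioned in the comments assumes the template argument elided in the flattened diff is the place type, and Scale is a helper defined here for the sketch, not taken from the patch.

#include <algorithm>
#include <cstdio>

// Local stand-in that mimics the shape of the new platform::Transform functor on the CPU path.
// A real call site would read: Transform<platform::CPUPlace> trans;
//                              trans(ctx, first, last, result, op);
// The device-context argument is dropped here to keep the sketch standalone.
struct CpuTransformSketch {
  template <typename InputIter, typename OutputIter, typename UnaryOperation>
  void operator()(InputIter first, InputIter last, OutputIter result, UnaryOperation op) {
    std::transform(first, last, result, op);  // what the CPUPlace specialization delegates to
  }
};

// The same kind of functor the tests pass in (Scale in transform_test.cu); defined here for the sketch.
struct Scale {
  explicit Scale(float s) : scale(s) {}
  float operator()(float v) const { return v * scale; }
  float scale;
};

int main() {
  float buf[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  CpuTransformSketch trans;
  trans(buf, buf + 4, buf, Scale(10));  // in-place scaling, as in TEST(Transform, CPUUnary)
  for (int i = 0; i < 4; ++i) std::printf("buf[%d] = %f\n", i, buf[i]);  // ~1, 2, 3, 4
  return 0;
}
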
-add_executable(paddle_pserver_main - ${PSERVER_MAIN_SOURCES}) -link_paddle_exe(paddle_pserver_main) if(WITH_TESTING) add_subdirectory(test) endif() -install(TARGETS paddle_pserver_main - RUNTIME DESTINATION opt/paddle/bin - PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ - GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ) -set_target_properties(paddle_pserver_main PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) + +if(NOT WITH_C_API) + add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES}) + link_paddle_exe(paddle_pserver_main) + + install(TARGETS paddle_pserver_main + RUNTIME DESTINATION opt/paddle/bin + PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ + GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ) + + set_target_properties(paddle_pserver_main PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) +endif() diff --git a/paddle/scripts/travis/build_ios.sh b/paddle/scripts/travis/build_ios.sh new file mode 100755 index 0000000000000000000000000000000000000000..dee7cf7cbbcccffd727002108ae7f6b6ee2fbba8 --- /dev/null +++ b/paddle/scripts/travis/build_ios.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +# Create the build directory for CMake. +mkdir -p $TRAVIS_BUILD_DIR/build_ios +cd $TRAVIS_BUILD_DIR/build_ios + +# Compile paddle binaries +cmake -DCMAKE_SYSTEM_NAME=iOS \ + -DIOS_PLATFORM=OS \ + -DCMAKE_OSX_ARCHITECTURES="arm64" \ + -DWITH_C_API=ON \ + -DUSE_EIGEN_FOR_BLAS=ON \ + -DWITH_TESTING=OFF \ + -DWITH_SWIG_PY=OFF \ + -DWITH_STYLE_CHECK=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + .. + +make -j 2 diff --git a/paddle/scripts/travis/check_style.sh b/paddle/scripts/travis/check_style.sh index ec499a839ac6593bac788f4cca5e33afbed73010..cb483b0ffc0a1d99978508bc16464a7716d2bac2 100755 --- a/paddle/scripts/travis/check_style.sh +++ b/paddle/scripts/travis/check_style.sh @@ -8,6 +8,12 @@ function abort(){ trap 'abort' 0 set -e +# install glide +curl https://glide.sh/get | bash +eval "$(GIMME_GO_VERSION=1.8.3 gimme)" +go get -u github.com/alecthomas/gometalinter +gometalinter --install + cd $TRAVIS_BUILD_DIR export PATH=/usr/bin:$PATH pre-commit install diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt index eac0584d30958ab78a935d89d217a4876fb07a19..3d471a0c01ca17cb98272159baf6d489c18824d5 100644 --- a/paddle/trainer/CMakeLists.txt +++ b/paddle/trainer/CMakeLists.txt @@ -50,22 +50,22 @@ macro(add_paddle_exe TARGET_NAME) link_paddle_exe(${TARGET_NAME}) endmacro() -add_paddle_exe(paddle_trainer - TrainerMain.cpp) - -add_paddle_exe(paddle_merge_model - MergeModel.cpp) - if(WITH_TESTING) - add_subdirectory(tests) + add_subdirectory(tests) endif() -install(TARGETS paddle_trainer paddle_merge_model - RUNTIME DESTINATION opt/paddle/bin - PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ - GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ) -set_target_properties(paddle_trainer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) -set_target_properties(paddle_merge_model PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) +if(NOT WITH_C_API) + add_paddle_exe(paddle_trainer TrainerMain.cpp) + add_paddle_exe(paddle_merge_model MergeModel.cpp) + + install(TARGETS paddle_trainer paddle_merge_model + RUNTIME DESTINATION opt/paddle/bin + PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ + GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ) + + set_target_properties(paddle_trainer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) + set_target_properties(paddle_merge_model PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) +endif() if(APPLE) set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") @@ -73,6 
+73,8 @@ endif() if(WITH_GOLANG) add_dependencies(paddle_trainer_lib paddle_pserver_cclient) - target_link_libraries(paddle_trainer paddle_pserver_cclient) target_link_libraries(paddle_trainer_lib paddle_pserver_cclient) + if(NOT WITH_C_API) + target_link_libraries(paddle_trainer paddle_pserver_cclient) + endif() endif(WITH_GOLANG) diff --git a/paddle/utils/Excepts.h b/paddle/utils/Excepts.h index 5c2c504f53a586f2991ccfae891991465fdb39b6..0add66da7464293795927431daf0e90359f40b52 100644 --- a/paddle/utils/Excepts.h +++ b/paddle/utils/Excepts.h @@ -17,7 +17,8 @@ limitations under the License. */ #include -#if defined(__APPLE__) || defined(__OSX__) +#if (defined(__APPLE__) || defined(__OSX__)) && !defined(__arm__) && \ + !defined(__aarch64__) int fegetexcept(void); int feenableexcept(unsigned int excepts); diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp index 3a0903d1f268cf0132da3de43396391219edf004..a4e6c8f7b8397adc262588612c250bac5ef5eaa6 100644 --- a/paddle/utils/arch/linux/Locks.cpp +++ b/paddle/utils/arch/linux/Locks.cpp @@ -40,6 +40,8 @@ void Semaphore::wait() { sem_wait(&m->sem); } void Semaphore::post() { sem_post(&m->sem); } +/// SpinLockPrivate + #ifdef PADDLE_USE_PTHREAD_SPINLOCK class SpinLockPrivate { @@ -79,6 +81,8 @@ SpinLock::~SpinLock() { delete m; } void SpinLock::lock() { m->lock(); } void SpinLock::unlock() { m->unlock(); } +/// ThreadBarrierPrivate + #ifdef PADDLE_USE_PTHREAD_BARRIER class ThreadBarrierPrivate { @@ -136,6 +140,8 @@ public: #endif +/// ThreadBarrier + ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {} ThreadBarrier::~ThreadBarrier() { delete m; } void ThreadBarrier::wait() { m->wait(); } diff --git a/paddle/utils/arch/osx/Excepts.cpp b/paddle/utils/arch/osx/Excepts.cpp index c8e904d8f9fe29e51447994af43dc62bf3514306..42ecaa06d256c9d259a20c648626605d77ce0308 100644 --- a/paddle/utils/arch/osx/Excepts.cpp +++ b/paddle/utils/arch/osx/Excepts.cpp @@ -14,7 +14,8 @@ limitations under the License. */ #include "paddle/utils/Excepts.h" -#if defined(__APPLE__) || defined(__OSX__) +#if (defined(__APPLE__) || defined(__OSX__)) && !defined(__arm__) && \ + !defined(__aarch64__) int fegetexcept(void) { static fenv_t fenv; diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 9bdcca1716fca73e87adee861e91b6b90d1ef70d..c97e6c0a36774caaa4fd8f8130220849975451a0 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -781,11 +781,11 @@ class MixedLayerType(LayerOutput): :type size: int :param act: activation type. :type act: BaseActivation - :param bias_attr: The Bias Attribute. If no bias, then pass False or - something not type of ParameterAttribute. None will - get a default Bias. - :type bias_attr: ParameterAttribute or None means has bias. Any other - type means no bias. + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute or None """ @@ -881,10 +881,11 @@ def mixed_layer(size=0, then this function will just return layer's name. :param act: Activation Type. :type act: BaseActivation - :param bias_attr: The Bias Attribute. If no bias, then pass False or - something not type of ParameterAttribute. 
None will get a - default Bias. - :type bias_attr: ParameterAttribute or None or bool + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param layer_attr: The extra layer config. Default is None. :type layer_attr: ExtraLayerAttribute :return: MixedLayerType object can add inputs or layer name. @@ -920,7 +921,7 @@ def data_layer(name, size, depth=None, height=None, width=None, data = data_layer(name="input", size=1000) - :param name: Name of this data layer. + :param name: The name of this layer. It is optional. :type name: basestring :param size: Size of this data layer. :type size: int @@ -960,7 +961,7 @@ def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None): """ Define a embedding Layer. - :param name: Name of this embedding layer. + :param name: The name of this layer. It is optional. :type name: basestring :param input: The input layer for this embedding. NOTE: must be Index Data. :type input: LayerOutput @@ -1015,7 +1016,7 @@ def fc_layer(input, with mixed_layer(size=1024) as fc: fc += full_matrix_projection(input=layer) - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: The input layer. Could be a list/tuple of input layer. :type input: LayerOutput|list|tuple @@ -1025,10 +1026,11 @@ def fc_layer(input, :type act: BaseActivation :param param_attr: The Parameter Attribute|list. :type param_attr: ParameterAttribute - :param bias_attr: The Bias Attribute. If no bias, then pass False or - something not type of ParameterAttribute. None will get a - default Bias. - :type bias_attr: ParameterAttribute|None|Any + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param layer_attr: Extra Layer config. :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. @@ -1065,7 +1067,7 @@ def printer_layer(input, format=None, name=None): """ Print the output value of input layers. This layer is useful for debugging. - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: The input layer. Could be a list/tuple of input layer. :type input: LayerOutput|list|tuple @@ -1103,7 +1105,7 @@ def priorbox_layer(input, """ Compute the priorbox and set the variance. This layer is necessary for ssd. - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: The input layer. :type input: LayerOutput @@ -1152,7 +1154,7 @@ def multibox_loss_layer(input_loc, """ Compute the location loss and the confidence loss for ssd. - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param input_loc: The input predict locations. :type input_loc: LayerOutput | List of LayerOutput @@ -1227,7 +1229,7 @@ def detection_output_layer(input_loc, box location. The output's shape of this layer could be zero if there is no valid bounding box. - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param input_loc: The input predict locations. 
:type input_loc: LayerOutput | List of LayerOutput. @@ -1299,7 +1301,7 @@ def cross_channel_norm_layer(input, name=None, param_attr=None): a conv layer's output and scale the output by a group of trainable factors which dimensions equal to the channel's number. - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: The input layer. :type input: LayerOutput @@ -1364,7 +1366,7 @@ def pooling_layer(input, :param agg_level: AggregateLevel.TO_NO_SEQUENCE or AggregateLevel.TO_SEQUENCE :type agg_level: AggregateLevel - :param name: layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: input layer name. :type input: LayerOutput @@ -1373,8 +1375,11 @@ def pooling_layer(input, :type pooling_type: BasePoolingType|None :param stride: The step size between successive pooling regions. :type stride: Int - :param bias_attr: Bias parameter attribute. False if no bias. - :type bias_attr: ParameterAttribute|None|False + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param layer_attr: The Extra Attributes for layer, such as dropout. :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. @@ -1471,10 +1476,11 @@ def lstmemory(input, :type gate_act: BaseActivation :param state_act: state activation type, TanhActivation by default. :type state_act: BaseActivation - - :param bias_attr: Bias attribute. None means default bias. False means no - bias. - :type bias_attr: ParameterAttribute|None|False + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param param_attr: Parameter Attribute. :type param_attr: ParameterAttribute|None|False :param layer_attr: Extra Layer attribute @@ -1596,9 +1602,11 @@ def grumemory(input, This activation affects the :math:`z_t` and :math:`r_t`. It is the :math:`\\sigma` in the above formula. :type gate_act: BaseActivation - :param bias_attr: Bias attribute. None means default bias. False means no - bias. - :type bias_attr: ParameterAttribute|None|False + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param param_attr: Parameter Attribute. :type param_attr: ParameterAttribute|None|False :param layer_attr: Extra Layer attribute @@ -1657,7 +1665,7 @@ def last_seq(input, seq = last_seq(input=layer) :param agg_level: Aggregated level - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: Input layer name. :type input: LayerOutput @@ -1713,7 +1721,7 @@ def first_seq(input, seq = first_seq(input=layer) :param agg_level: aggregation level - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: Input layer name. :type input: LayerOutput @@ -1792,11 +1800,13 @@ def expand_layer(input, :type input: LayerOutput :param expand_as: Expand as this layer's sequence info. 
:type expand_as: LayerOutput - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring - :param bias_attr: Bias attribute. None means default bias. False means no - bias. - :type bias_attr: ParameterAttribute|None|False + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param expand_level: whether input layer is timestep(default) or sequence. :type expand_level: ExpandLevel :param layer_attr: extra layer attributes. @@ -1849,7 +1859,7 @@ def repeat_layer(input, :type input: LayerOutput :param num_repeats: Repeat the input so many times :type num_repeats: int - :param name: Layer name. + :param name: The name of this layer. It is optional. :param as_row_vector: True for treating input as row vector and repeating in the column direction. This is equivalent to apply concat_layer() with num_repeats same input. @@ -1908,16 +1918,17 @@ def seq_reshape_layer(input, :type input: LayerOutput :param reshape_size: the size of reshaped sequence. :type reshape_size: int - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param act: Activation type. :type act: BaseActivation :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. - :param bias_attr: The Bias Attribute. If no bias, then pass False or - something not type of ParameterAttribute. None will get a - default Bias. - :type bias_attr: ParameterAttribute or None or bool + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :return: LayerOutput object. :rtype: LayerOutput """ @@ -1960,7 +1971,7 @@ def interpolation_layer(input, weight, name=None, layer_attr=None): :type input: list|tuple :param weight: Weight layer. :type weight: LayerOutput - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. @@ -2065,7 +2076,7 @@ def power_layer(input, weight, name=None, layer_attr=None): :type input: LayerOutput :param weight: Weight layer. :type weight: LayerOutput - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. @@ -2109,7 +2120,7 @@ def scaling_layer(input, weight, name=None, layer_attr=None): :type input: LayerOutput :param weight: Weight layer. :type weight: LayerOutput - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. @@ -2147,7 +2158,7 @@ def trans_layer(input, name=None, layer_attr=None): :param input: Input layer. :type input: LayerOutput - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. 
@@ -2187,7 +2198,7 @@ def rotate_layer(input, height, width, name=None, layer_attr=None): :type input: LayerOutput :param height: The height of the sample matrix :type height: int - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. @@ -2232,7 +2243,7 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None): cos = cos_sim(a=layer1, b=layer2, size=3) - :param name: layer name + :param name: The name of this layer. It is optional. :type name: basestring :param a: input layer a :type a: LayerOutput @@ -2299,11 +2310,13 @@ def hsigmoid(input, :type label: LayerOutput :param num_classes: number of classes. :type num_classes: int|None - :param name: layer name + :param name: The name of this layer. It is optional. :type name: basestring - :param bias_attr: Bias attribute. None means default bias. - False means no bias. - :type bias_attr: ParameterAttribute|False + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param param_attr: Parameter Attribute. None means default parameter. :type param_attr: ParameterAttribute|None :param layer_attr: Extra Layer Attribute. @@ -2411,7 +2424,7 @@ def img_conv_layer(input, bias_attr=False, act=ReluActivation()) - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: Layer Input. :type input: LayerOutput @@ -2442,9 +2455,11 @@ def img_conv_layer(input, :type dilation: int|tuple|list :param dilation_y: The y dimension of the dilation. :type dilation_y: int - :param bias_attr: Convolution bias attribute. None means default bias. - False means no bias. - :type bias_attr: ParameterAttribute|False + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param num_channels: number of input channels. If None will be set automatically from previous output. :type num_channels: int @@ -2835,7 +2850,7 @@ def spp_layer(input, num_channels=16, pool_type=MaxPooling()) - :param name: layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: layer's input. :type input: LayerOutput @@ -2929,7 +2944,7 @@ def img_cmrnorm_layer(input, norm = img_cmrnorm_layer(input=net, size=5) - :param name: layer name. + :param name: The name of this layer. It is optional. :type name: None|basestring :param input: layer's input. :type input: LayerOutput @@ -2992,7 +3007,7 @@ def batch_norm_layer(input, norm = batch_norm_layer(input=net, act=ReluActivation()) - :param name: layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: batch normalization input. Better be linear activation. Because there is an activation inside batch_normalization. @@ -3016,7 +3031,7 @@ def batch_norm_layer(input, :type num_channels: int :param bias_attr: :math:`\\beta`, better be zero when initialize. So the initial_std=0, initial_mean=1 is best practice. - :type bias_attr: ParameterAttribute + :type bias_attr: ParameterAttribute|None|Bool|Any :param param_attr: :math:`\\gamma`, better be one when initialize. 
So the initial_std=0, initial_mean=1 is best practice. :type param_attr: ParameterAttribute @@ -3091,7 +3106,7 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None): :param input: Input layer. :type input: LayerOutput - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. @@ -3127,7 +3142,7 @@ def row_l2_norm_layer(input, name=None, layer_attr=None): :param input: Input layer. :type input: LayerOutput - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. @@ -3179,16 +3194,18 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): dropout here. Please refer to dropout_layer for details. - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: Input layers. It could be a LayerOutput or list/tuple of LayerOutput. :type input: LayerOutput|list|tuple :param act: Activation Type, default is tanh. :type act: BaseActivation - :param bias_attr: Bias attribute. If False, means no bias. None is default - bias. - :type bias_attr: ParameterAttribute|bool + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param layer_attr: Extra Layer attribute. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. @@ -3237,7 +3254,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): concat = concat_layer(input=[layer1, layer2]) - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: input layers or projections :type input: list|tuple|collections.Sequence @@ -3330,7 +3347,7 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, concat = seq_concat_layer(a=layer1, b=layer2) - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param a: input sequence layer :type a: LayerOutput @@ -3340,10 +3357,11 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, :type act: BaseActivation :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute - :param bias_attr: The Bias Attribute. If no bias, then pass False or - something not type of ParameterAttribute. None will get a - default Bias. - :type bias_attr: ParameterAttribute or None or bool + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :return: LayerOutput object. :rtype: LayerOutput """ @@ -3506,7 +3524,7 @@ def lstm_step_layer(input, output is :math:`o_t`, whose name is 'state' and can use :code:`get_output_layer` to extract this output. - :param name: Layer's name. + :param name: The name of this layer. It is optional. :type name: basestring :param size: Layer's size. NOTE: lstm layer's size, should be equal to :code:`input.size/4`, and should be equal to @@ -3524,8 +3542,11 @@ def lstm_step_layer(input, :param state_act: State Activation Type. Default is sigmoid, and should be sigmoid only. 
:type state_act: BaseActivation - :param bias_attr: Bias Attribute. - :type bias_attr: ParameterAttribute + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param layer_attr: layer's extra attribute. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. @@ -3576,9 +3597,13 @@ def gru_step_layer(input, :param output_mem: :param size: :param act: - :param name: + :param name: The name of this layer. It is optional. :param gate_act: - :param bias_attr: + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param param_attr: the parameter_attribute for transforming the output_mem from previous step. :param layer_attr: @@ -3632,10 +3657,14 @@ def gru_step_naive_layer(input, :param input: :param output_mem: :param size: - :param name: + :param name: The name of this layer. It is optional. :param act: :param gate_act: - :param bias_attr: + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param param_attr: :param layer_attr: :return: @@ -3691,7 +3720,7 @@ def get_output_layer(input, arg_name, name=None, layer_attr=None): output besides the default one, please use get_output_layer first to get the output from input. - :param name: Layer's name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: get output layer's input. And this layer should contains multiple outputs. @@ -3757,11 +3786,14 @@ def recurrent_layer(input, :type input: LayerOutput :param act: activation. :type act: BaseActivation - :param bias_attr: bias attribute. - :type bias_attr: ParameterAttribute + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param param_attr: parameter attribute. :type param_attr: ParameterAttribute - :param name: name of the layer + :param name: The name of this layer. It is optional. :type name: basestring :param layer_attr: Layer Attribute. :type layer_attr: ExtraLayerAttribute @@ -4000,7 +4032,7 @@ def maxid_layer(input, name=None, layer_attr=None): :param input: Input layer name. :type input: LayerOutput - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. @@ -4033,7 +4065,7 @@ def out_prod_layer(input1, input2, name=None, layer_attr=None): out_prod = out_prod_layer(input1=vec1, input2=vec2) - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param input1: The first input layer name. :type input: LayerOutput @@ -4074,7 +4106,7 @@ def eos_layer(input, eos_id, name=None, layer_attr=None): eos = eos_layer(input=layer, eos_id=id) - :param name: Layer name. + :param name: The name of this layer. It is optional. 
:type name: basestring :param input: Input layer name. :type input: LayerOutput @@ -4265,7 +4297,7 @@ def square_error_cost(input, cost = \\sum_{i=1}^N(t_i-y_i)^2 - :param name: layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: Network prediction. :type input: LayerOutput @@ -4307,7 +4339,7 @@ def classification_cost(input, """ classification cost Layer. - :param name: layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: input layer name. network output. :type input: LayerOutput @@ -4611,7 +4643,7 @@ def pad_layer(input, :type pad_w: list|None :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute - :param name: layer name. + :param name: The name of this layer. It is optional. :type name: basestring :return: LayerOutput object. :rtype: LayerOutput @@ -4679,7 +4711,7 @@ def conv_shift_layer(a, b, name=None, layer_attr=None): conv_shift = conv_shift_layer(a=layer1, b=layer2) - :param name: layer name + :param name: The name of this layer. It is optional. :type name: basestring :param a: Input layer a. :type a: LayerOutput @@ -4735,7 +4767,7 @@ def tensor_layer(a, tensor = tensor_layer(a=layer1, b=layer2, size=1000) - :param name: layer name + :param name: The name of this layer. It is optional. :type name: basestring :param a: Input layer a. :type a: LayerOutput @@ -4747,10 +4779,11 @@ def tensor_layer(a, :type act: BaseActivation :param param_attr: The Parameter Attribute. :type param_attr: ParameterAttribute - :param bias_attr: The Bias Attribute. If no bias, then pass False or - something not type of ParameterAttribute. None will get a - default Bias. - :type bias_attr: ParameterAttribute|None|Any + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param layer_attr: Extra Layer config. :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. @@ -4797,7 +4830,7 @@ def selective_fc_layer(input, sel_fc = selective_fc_layer(input=input, size=128, act=TanhActivation()) - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: The input layer. :type input: LayerOutput|list|tuple @@ -4811,10 +4844,11 @@ def selective_fc_layer(input, :type act: BaseActivation :param param_attr: The Parameter Attribute. :type param_attr: ParameterAttribute - :param bias_attr: The Bias Attribute. If no bias, then pass False or - something not type of ParameterAttribute. None will get a - default Bias. - :type bias_attr: ParameterAttribute|None|Any + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param layer_attr: Extra Layer config. :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. @@ -4870,7 +4904,7 @@ def sampling_id_layer(input, name=None, layer_attr=None): :param input: The input layer. :type input: LayerOutput - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param layer_attr: Extra Layer config. 
:type layer_attr: ExtraLayerAttribute|None @@ -4908,7 +4942,7 @@ def slope_intercept_layer(input, :param input: The input layer. :type input: LayerOutput - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param slope: the scale factor. :type slope: float. @@ -4972,7 +5006,7 @@ def linear_comb_layer(weights, vectors, size=None, name=None, layer_attr=None): :type vectors: LayerOutput :param size: the dimension of this layer. :type size: int - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param layer_attr: Extra Layer config. :type layer_attr: ExtraLayerAttribute|None @@ -5055,7 +5089,7 @@ def block_expand_layer(input, :type padding_x: int :param padding_y: The padding size in vertical direction. :type padding_y: int - :param name: The name of this layer, which can not specify. + :param name: The name of this layer. It is optional. :type name: None|basestring. :param layer_attr: Extra Layer config. :type layer_attr: ExtraLayerAttribute|None @@ -5124,7 +5158,7 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): :type num_channels: int|None :param groups: The group number of input layer. :type groups: int - :param name: The name of this layer, which can not specify. + :param name: The name of this layer. It is optional. :type name: None|basestring. :param layer_attr: Extra Layer attribute. :type layer_attr: ExtraLayerAttribute @@ -5188,7 +5222,7 @@ def ctc_layer(input, :type label: LayerOutput :param size: category numbers + 1. :type size: int - :param name: The name of this layer + :param name: The name of this layer. It is optional. :type name: basestring|None :param norm_by_times: Whether to normalization by times. False by default. :type norm_by_times: bool @@ -5265,7 +5299,7 @@ def warp_ctc_layer(input, :type label: LayerOutput :param size: category numbers + 1. :type size: int - :param name: The name of this layer, which can not specify. + :param name: The name of this layer. It is optional. :type name: basestring|None :param blank: the 'blank' label used in ctc :type blank: int @@ -5329,7 +5363,7 @@ def crf_layer(input, :type weight: LayerOutput :param param_attr: Parameter attribute. None means default attribute :type param_attr: ParameterAttribute - :param name: The name of this layers. It is not necessary. + :param name: The name of this layer. It is optional. :type name: None|basestring :param coeff: The coefficient affects the gradient in the backward. :type coeff: float @@ -5399,7 +5433,7 @@ def crf_decoding_layer(input, :type label: LayerOutput or None :param param_attr: Parameter attribute. None means default attribute :type param_attr: ParameterAttribute - :param name: The name of this layers. It is not necessary. + :param name: The name of this layer. It is optional. :type name: None|basestring :param layer_attr: Extra Layer config. :type layer_attr: ExtraLayerAttribute|None @@ -5458,9 +5492,9 @@ def nce_layer(input, param_attr=[attr1, attr2], weight=layer3, num_classes=3, neg_distribution=[0.1,0.3,0.6]) - :param name: layer name + :param name: The name of this layer. It is optional. :type name: basestring - :param input: input layers. It could be a LayerOutput of list/tuple of LayerOutput. + :param input: The input layers. It could be a LayerOutput of list/tuple of LayerOutput. 
:type input: LayerOutput|list|tuple|collections.Sequence :param label: label layer :type label: LayerOutput @@ -5478,8 +5512,11 @@ def nce_layer(input, A uniform distribution will be used if not provided. If not None, its length must be equal to num_classes. :type neg_distribution: list|tuple|collections.Sequence|None - :param bias_attr: Bias parameter attribute. True if no bias. - :type bias_attr: ParameterAttribute|None|False + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute :return: layer name. @@ -5594,7 +5631,7 @@ def rank_cost(left, :param weight: The weight affects the cost, namely the scale of cost. It is an optional argument. :type weight: LayerOutput - :param name: The name of this layers. It is not necessary. + :param name: The name of this layer. It is optional. :type name: None|basestring :param coeff: The coefficient affects the gradient in the backward. :type coeff: float @@ -5648,7 +5685,7 @@ def lambda_cost(input, :param score: The 2nd input. Score of each sample. :type input: LayerOutput :param NDCG_num: The size of NDCG (Normalized Discounted Cumulative Gain), - e.g., 5 for NDCG@5. It must be less than for equal to the + e.g., 5 for NDCG@5. It must be less than or equal to the minimum size of lists. :type NDCG_num: int :param max_sort_size: The size of partial sorting in calculating gradient. @@ -5659,7 +5696,7 @@ def lambda_cost(input, than the size of a list, the algorithm will sort the entire list of get gradient. :type max_sort_size: int - :param name: The name of this layers. It is not necessary. + :param name: The name of this layer. It is optional. :type name: None|basestring :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute @@ -5703,7 +5740,7 @@ def cross_entropy(input, :type input: LayerOutput. :param label: The input label. :type input: LayerOutput. - :param name: The name of this layers. It is not necessary. + :param name: The name of this layer. It is optional. :type name: None|basestring. :param coeff: The cost is multiplied with coeff. The coefficient affects the gradient in the backward. @@ -5751,7 +5788,7 @@ def cross_entropy_with_selfnorm(input, :type input: LayerOutput. :param label: The input label. :type input: LayerOutput. - :param name: The name of this layers. It is not necessary. + :param name: The name of this layer. It is optional. :type name: None|basestring. :param coeff: The coefficient affects the gradient in the backward. :type coeff: float. @@ -5791,7 +5828,7 @@ def sum_cost(input, name=None, layer_attr=None): :param input: The first input layer. :type input: LayerOutput. - :param name: The name of this layers. It is not necessary. + :param name: The name of this layer. It is optional. :type name: None|basestring. :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute @@ -5836,7 +5873,7 @@ def huber_regression_cost(input, :type input: LayerOutput. :param label: The input label. :type input: LayerOutput. - :param name: The name of this layers. It is not necessary. + :param name: The name of this layer. It is optional. :type name: None|basestring. :param delta: The difference between the observed and predicted values. :type delta: float. 
@@ -5886,7 +5923,7 @@ def huber_classification_cost(input, :type input: LayerOutput. :param label: The input label. :type input: LayerOutput. - :param name: The name of this layers. It is not necessary. + :param name: The name of this layer. It is optional. :type name: None|basestring. :param coeff: The coefficient affects the gradient in the backward. :type coeff: float. @@ -5929,7 +5966,7 @@ def multi_binary_label_cross_entropy(input, :type input: LayerOutput :param label: The input label. :type input: LayerOutput - :param name: The name of this layers. It is not necessary. + :param name: The name of this layer. It is optional. :type name: None|basestring :param coeff: The coefficient affects the gradient in the backward. :type coeff: float @@ -6034,9 +6071,9 @@ def cross_entropy_over_beam(input, name=None): ]) - :param input: input beams for this layer. + :param input: Input beams for this layer. :type input: BeamInput - :param name: input beams for this layer. + :param name: The name of this layer. :type name: basestring :return: LayerOutput object. :rtype: LayerOutput @@ -6097,7 +6134,7 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None): :type input: LayerOutput :param label: The input label. :type input: LayerOutput - :param name: The name of this layers. It is not necessary. + :param name: The name of this layer. It is optional. :type name: None|basestring :param coeff: The coefficient affects the gradient in the backward. :type coeff: float @@ -6145,7 +6182,7 @@ def multiplex_layer(input, name=None, layer_attr=None): :param input: Input layers. :type input: list of LayerOutput - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param layer_attr: extra layer attributes. :type layer_attr: ExtraLayerAttribute. @@ -6176,12 +6213,21 @@ def multiplex_layer(input, name=None, layer_attr=None): @wrap_name_default("dropout") def dropout_layer(input, dropout_rate, name=None): """ - @TODO(yuyang18): Add comments. - :param name: - :param input: - :param dropout_rate: - :return: + The example usage is: + + .. code-block:: python + + dropout = dropout_layer(input=input_layer, dropout_rate=0.5) + + :param name: The name of this layer. It is optional. + :type name: basestring + :param input: The input layer. + :type input: LayerOutput + :param dropout_rate: The probability of dropout. + :type dropout_rate: float + :return: LayerOutput object. + :rtype: LayerOutput """ return addto_layer( name=name, @@ -6204,7 +6250,7 @@ def row_conv_layer(input, """ The row convolution is called lookahead convolution. It is firstly - introduced in paper of `Deep Speech 2: End-toEnd Speech Recognition + introduced in paper of `Deep Speech 2: End-to-End Speech Recognition in English and Mandarin `_ . The bidirectional RNN that learns representation for a sequence by @@ -6212,9 +6258,9 @@ def row_conv_layer(input, However, unlike unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online and low-latency setting. The lookahead convolution incorporates information from future subsequences in a computationally - efficient manner to improve unidirectional recurrent neural networks. + efficient manner to improve unidirectional RNNs. - The connection of row convolution is different form the 1D sequence + The connection of row convolution is different from the 1D sequence convolution. 
Assumed that, the future context-length is k, that is to say, it can get the output at timestep t by using the the input feature from t-th timestep to (t+k+1)-th timestep. Assumed that the hidden dim of input @@ -6243,7 +6289,7 @@ def row_conv_layer(input, :param act: Activation Type. Default is linear activation. :type act: BaseActivation :param param_attr: The Parameter Attribute. If None, the parameter will be - initialized smartly. It's better set it by yourself. + initialized smartly. It's better to set it by yourself. :type param_attr: ParameterAttribute :param layer_attr: Extra Layer config. :type layer_attr: ExtraLayerAttribute|None @@ -6290,7 +6336,7 @@ def prelu_layer(input, prelu = prelu_layer(input=layers, partial_sum=1) - :param name: Name of this layer. + :param name: The name of this layer. It is optional. :type name: basestring :param input: The input layer. :type input: LayerOutput @@ -6343,7 +6389,7 @@ def gated_unit_layer(input, The gated unit layer implements a simple gating mechanism over the input. The input :math:`X` is first projected into a new space :math:`X'`, and it is also used to produce a gate weight :math:`\sigma`. Element-wise - prodict between :match:`X'` and :math:`\sigma` is finally returned. + product between :match:`X'` and :math:`\sigma` is finally returned. Reference: Language Modeling with Gated Convolutional Networks @@ -6363,7 +6409,7 @@ def gated_unit_layer(input, :type size: int :param act: activation type of the projected input. :type act: BaseActivation - :param name: name of this layer. + :param name: The name of this layer. It is optional. :type name: basestring :param gate_attr: Attributes to tune the gate output, for example, error clipping threshold, dropout and so on. See ExtraLayerAttribute for @@ -6439,10 +6485,10 @@ def switch_order_layer(input, :param input: The input layer. :type input: LayerOutput - :param name: Name of this layer. + :param name: The name of this layer. It is optional. :type name: basestring - :param reshape: reshape matrix by axises. - :type reshape: Dict + :param reshape_axis: Specify the axises of 'height'. Its value should be positive and less than 4. + :type reshape_axis: int :return: LayerOutput object. :rtype: LayerOutput """ @@ -6492,7 +6538,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): :type partial_sum: int :param shape: The shape to be cropped. Default is None. :type shape: Sequence | None - :param name: Name of this layer. + :param name: The name of this layer. It is optional. :type name: basestring :return: LayerOutput object. :rtype: LayerOutput @@ -6538,7 +6584,7 @@ def sub_nested_seq_layer(input, selected_indices, name=None): :type input: LayerOutput :param selected_indices: a set of sequence indices in the nested sequence. :type input: LayerOutput - :param name: name of this layer. + :param name: The name of this layer. It is optional. :type name: basestring :return: LayerOutput object. :rtype: LayerOutput @@ -6576,7 +6622,7 @@ def clip_layer(input, min, max, name=None): clip = clip_layer(input=input_layer, min=-10, max=10) - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: The input layer. :type input: LayerOutput. @@ -6621,7 +6667,7 @@ def seq_slice_layer(input, starts, ends, name=None): seq_silce = seq_slice_layer(input=input_seq, starts=start_pos, ends=end_pos) - :param name: name of this layer. + :param name: The name of this layer. It is optional. 
:type name: basestring :param input: input for this layer, it should be a sequence. :type input: LayerOutput @@ -6675,12 +6721,12 @@ def kmax_seq_score_layer(input, name=None, beam_size=1): kmax_indices = kmax_seq_score_layer(input=input_layer, beam_size) - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: The input layer. It stores scores over a sequence or a nested sequence and its size must be 1. :type input: LayerOutput. - :param beam_size: squence indices with top beam_size scores are returned. + :param beam_size: sequence indices with top beam_size scores are returned. :type beam_size: double :return: LayerOutput object. :rtype: LayerOutput @@ -6733,7 +6779,7 @@ def img_conv3d_layer(input, bias_attr=False, act=ReluActivation()) - :param name: Layer name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: Layer Input. :type input: LayerOutput @@ -6752,7 +6798,7 @@ def img_conv3d_layer(input, :type padding: int|tuple|list :param bias_attr: Convolution bias attribute. None means default bias. False means no bias. - :type bias_attr: ParameterAttribute|False + :type bias_attr: ParameterAttribute|None|Bool|Any :param num_channels: number of input channels. If None will be set automatically from previous output. :type num_channels: int @@ -6864,14 +6910,17 @@ def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None): scale_shift = scale_shift_layer(input=input_layer, bias_attr=False) - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: The input layer. :type input: LayerOutput. :param param_attr: The parameter attribute of scaling. :type param_attr: ParameterAttribute - :param bias_attr: The parameter attribute of shifting. - :type bias_attr: ParameterAttribute + :param bias_attr: The Bias Attribute. If the parameter is set to + False or something not type of ParameterAttribute, + no bias is defined. If the parameter is set to + True, the bias is initialized to zero. + :type bias_attr: ParameterAttribute|None|Bool|Any :return: LayerOutput object. :rtype: LayerOutput """ diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0206ca064be87afe204aa99021979b7ddc3c5d63 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py @@ -0,0 +1,89 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestCrossEntropyOp1(OpTest): + """Test standard cross-entropy, with index representation of labels. + """ + + def setUp(self): + self.op_type = "cross_entropy" + batch_size = 30 + class_num = 10 + X = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float32") + label = np.random.randint(0, class_num, (batch_size, 1), dtype="int32") + cross_entropy = np.asmatrix( + [[-np.log(X[i][label[i][0]])] for i in range(X.shape[0])], + dtype="float32") + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {'soft_label': 0} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y") + + +class TestCrossEntropyOp2(OpTest): + """Test soft-label cross-entropy, with vecterized soft labels. 
+ """ + + def setUp(self): + self.op_type = "cross_entropy" + batch_size = 10 + class_num = 5 + X = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float32") + label = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float32") + label /= label.sum(axis=1, keepdims=True) + cross_entropy = (-label * np.log(X)).sum( + axis=1, keepdims=True).astype("float32") + self.inputs = {'X': X, 'Label': label} + self.outputs = {'Y': cross_entropy} + self.attrs = {'soft_label': 1} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y') + + +class TestCrossEntropyOp3(OpTest): + """Test one-hot cross-entropy, with vecterized one-hot representation of + labels. + """ + + def setUp(self): + self.op_type = "cross_entropy" + batch_size = 30 + class_num = 10 + X = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float32") + label_index = np.random.randint( + 0, class_num, (batch_size), dtype="int32") + label = np.zeros(X.shape) + label[np.arange(batch_size), label_index] = 1 + cross_entropy = np.asmatrix( + [[-np.log(X[i][label_index[i]])] for i in range(X.shape[0])], + dtype="float32") + cross_entropy2 = (-label * np.log(X)).sum( + axis=1, keepdims=True).astype("float32") + self.inputs = {'X': X, 'Label': label} + self.outputs = {'Y': cross_entropy} + self.attrs = {'soft_label': 1} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_dropout_op.py b/python/paddle/v2/framework/tests/test_dropout_op.py new file mode 100644 index 0000000000000000000000000000000000000000..3638fee1a1c26195791bc1f5a46dd749da0aee95 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_dropout_op.py @@ -0,0 +1,59 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestDropoutOp(OpTest): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64)).astype("float32")} + self.attrs = {'dropout_prob': 0.0, 'is_training': 1} + self.outputs = {'Out': self.inputs['X'], 'Mask': np.ones((32, 64))} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out', max_relative_error=0.05) + + +class TestDropoutOp2(TestDropoutOp): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64)).astype("float32")} + self.attrs = {'dropout_prob': 1.0, 'is_training': 1} + self.outputs = {'Out': np.zeros((32, 64)), 'Mask': np.zeros((32, 64))} + + +class TestDropoutOp3(TestDropoutOp): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")} + self.attrs = {'dropout_prob': 0.0, 'is_training': 1} + self.outputs = {'Out': self.inputs['X'], 'Mask': np.ones((32, 64, 2))} + + +class TestDropoutOp4(OpTest): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64)).astype("float32")} + self.attrs = {'dropout_prob': 0.35, 'is_training': 0} + self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']} + + def test_check_output(self): + self.check_output() + + +class TestDropoutOp5(OpTest): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")} + self.attrs = {'dropout_prob': 0.75, 'is_training': 0} + self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']} + + def 
test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/framework/tests/test_mnist.py b/python/paddle/v2/framework/tests/test_mnist.py index f6f8f49b797fb6e5016a5e309f12f192d5096431..66452cb3965d28fd15e814833079621410775c17 100644 --- a/python/paddle/v2/framework/tests/test_mnist.py +++ b/python/paddle/v2/framework/tests/test_mnist.py @@ -128,7 +128,7 @@ def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None): def cross_entropy_layer(net, input, label): cost_name = "cross_entropy_%d" % uniq_id() cross_entropy_op = Operator( - "onehot_cross_entropy", X=input, label=label, Y=cost_name) + "cross_entropy", X=input, Label=label, Y=cost_name) net.append_op(cross_entropy_op) scope.new_var(cost_name) net.infer_shape(scope) @@ -181,7 +181,7 @@ def error_rate(predict, label): images = data_layer(name="pixel", dims=[BATCH_SIZE, 784]) -labels = data_layer(name="label", dims=[BATCH_SIZE]) +labels = data_layer(name="label", dims=[BATCH_SIZE, 1]) fc1 = fc_layer(net=forward_net, input=images, size=100, act="sigmoid") fc2 = fc_layer(net=forward_net, input=fc1, size=100, act="sigmoid") predict = fc_layer(net=forward_net, input=fc2, size=10, act="softmax") @@ -215,6 +215,7 @@ def test(cost_name): for data in test_reader(): image_data = numpy.array(map(lambda x: x[0], data)).astype("float32") label_data = numpy.array(map(lambda x: x[1], data)).astype("int32") + label_data = numpy.expand_dims(label_data, axis=1) feed_data(images, image_data) feed_data(labels, label_data) @@ -235,6 +236,7 @@ for pass_id in range(PASS_NUM): for data in train_reader(): image_data = numpy.array(map(lambda x: x[0], data)).astype("float32") label_data = numpy.array(map(lambda x: x[1], data)).astype("int32") + label_data = numpy.expand_dims(label_data, axis=1) feed_data(images, image_data) feed_data(labels, label_data) diff --git a/python/paddle/v2/framework/tests/test_onehot_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_onehot_cross_entropy_op.py deleted file mode 100644 index fd3cbdb80374865ccf113768856096bf49dce643..0000000000000000000000000000000000000000 --- a/python/paddle/v2/framework/tests/test_onehot_cross_entropy_op.py +++ /dev/null @@ -1,30 +0,0 @@ -import unittest -import numpy -from op_test import OpTest - - -class TestOnehotCrossEntropyOp(OpTest): - def setUp(self): - self.op_type = "onehot_cross_entropy" - batch_size = 30 - class_num = 10 - - X = numpy.random.uniform(0.1, 1.0, - [batch_size, class_num]).astype("float32") - labels = numpy.random.randint(0, class_num, batch_size, dtype="int32") - - cross_entropy = numpy.asmatrix( - [[-numpy.log(X[i][labels[i]])] for i in range(X.shape[0])], - dtype="float32") - self.inputs = {"X": X, "label": labels} - self.outputs = {"Y": cross_entropy} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(["X"], "Y") - - -if __name__ == "__main__": - unittest.main()
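Editor's note (not part of the patch): the new cross_entropy op tests above distinguish hard labels (integer class indices of shape [batch_size, 1] with soft_label=0) from soft labels (per-class probability rows with soft_label=1). The NumPy helper below is a minimal sketch of that reference computation under those assumptions; the function name is illustrative and does not exist in the codebase.

    import numpy as np

    def cross_entropy_reference(X, label, soft_label=False):
        """Per-sample cross-entropy, mirroring the expected outputs in the tests.

        X:     (batch_size, class_num) predicted probabilities.
        label: (batch_size, 1) int32 class indices when soft_label is False,
               or (batch_size, class_num) per-class probabilities when True.
        Returns a (batch_size, 1) column of losses.
        """
        if soft_label:
            # Soft labels: weighted negative log-likelihood over all classes.
            return (-label * np.log(X)).sum(axis=1, keepdims=True)
        # Hard labels: take -log of the probability assigned to the true class.
        rows = np.arange(X.shape[0])
        return -np.log(X[rows, label.flatten()]).reshape(-1, 1)

    # Matches the shapes used by TestCrossEntropyOp1.
    X = np.random.uniform(0.1, 1.0, (30, 10)).astype("float32")
    label = np.random.randint(0, 10, (30, 1), dtype="int32")
    assert cross_entropy_reference(X, label).shape == (30, 1)

This is also why test_mnist.py now declares the label layer with dims=[BATCH_SIZE, 1] and expands label_data with numpy.expand_dims(label_data, axis=1): the renamed cross_entropy op expects a column of class indices rather than a flat vector.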
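Editor's note (not part of the patch): the new dropout op tests encode two modes via the is_training attribute. The sketch below only summarizes the behaviour those tests assert, assuming dropout_prob is the probability of zeroing an element during training; the scaling by dropout_prob at inference is what TestDropoutOp4/5 expect. The helper name is illustrative only.

    import numpy as np

    def dropout_reference(X, dropout_prob, is_training):
        """Sketch of the (Out, Mask) pair the dropout op tests expect."""
        if not is_training:
            # Inference: no mask; the tests expect the input scaled by dropout_prob.
            return X * dropout_prob, None
        # Training: zero each element with probability dropout_prob, keep the 0/1 mask.
        mask = (np.random.rand(*X.shape) >= dropout_prob).astype(X.dtype)
        return X * mask, mask

    # dropout_prob=0.0 keeps everything (mask of ones); dropout_prob=1.0 zeroes
    # everything (mask of zeros), matching TestDropoutOp and TestDropoutOp2 above.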