提交 f2cfa324 编写于 作者: Y Yibing Liu

Merge branch 'develop' of upstream into rank_loss_op_dev

......@@ -27,3 +27,4 @@ CMakeFiles
cmake_install.cmake
paddle/.timestamp
python/paddlepaddle.egg-info/
paddle/pybind/pybind.h
......@@ -36,10 +36,6 @@ before_install:
# protobuf version.
- sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
- sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
- curl https://glide.sh/get | bash
- eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
- go get -u github.com/alecthomas/gometalinter
- gometalinter --install
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
......
......@@ -27,7 +27,7 @@ if(NOT CMAKE_CROSSCOMPILING)
endif(NOT CMAKE_CROSSCOMPILING)
find_package(Git REQUIRED)
find_package(Threads REQUIRED)
if(NOT ANDROID)
if(NOT ANDROID AND NOT IOS)
find_package(Boost QUIET)
endif()
......@@ -64,27 +64,29 @@ if(NOT CMAKE_BUILD_TYPE)
FORCE)
endif()
if(ANDROID)
if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
# TODO: support glog for Android api 16 ~ 19 in the future
message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
if(ANDROID OR IOS)
if(ANDROID)
if(AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
# TODO: support glog for Android api 16 ~ 19 in the future
message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
endif()
endif()
set(WITH_GPU OFF CACHE STRING
"Disable GPU when cross-compiling for Android" FORCE)
"Disable GPU when cross-compiling for Android and iOS" FORCE)
set(WITH_AVX OFF CACHE STRING
"Disable AVX when cross-compiling for Android" FORCE)
"Disable AVX when cross-compiling for Android and iOS" FORCE)
set(WITH_PYTHON OFF CACHE STRING
"Disable PYTHON when cross-compiling for Android" FORCE)
"Disable PYTHON when cross-compiling for Android and iOS" FORCE)
set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android" FORCE)
"Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKLDNN OFF CACHE STRING
"Disable MKLDNN when cross-compiling for Android" FORCE)
"Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
set(WITH_MKLML OFF CACHE STRING
"Disable MKLML package when cross-compiling for Android" FORCE)
endif(ANDROID)
"Disable MKLML package when cross-compiling for Android and iOS" FORCE)
endif()
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
......
......@@ -171,3 +171,10 @@ if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
endif()
if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)
set(CBLAS_FOUND ON)
set(CBLAS_PROVIDER vecLib)
set(CBLAS_INC_DIR ${VECLIB_INC_DIR})
add_definitions(-DPADDLE_USE_VECLIB)
endif()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is a toolchain file for cross-compiling for iOS, and the
# configuration largely refers to public toolchain file:
# https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake
# and
# https://github.com/cristeab/ios-cmake
#
# Supports options:
# IOS_PLATFORM = OS (default) or SIMULATOR
# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch.
# IOS_ARCH
# The archectures wanted to support, such "arm64", "armv7;arm64"
# IOS_DEPLOYMENT_TARGET
# The minimum iOS deployment version, such as "7.0"
# IOS_ENABLE_BITCODE = ON (default) or OFF
# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON
# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
# By default this location is automatcially chosen based on the IOS_PLATFORM value above.
# If set manually, it will override the default location and force the user of a particular Developer Platform
# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
# By default this location is automatcially chosen based on the IOS_DEVELOPER_ROOT value.
# In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path.
# If set manually, this will force the use of a specific SDK version
# Macros:
# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
# A convenience macro for setting xcode specific properties on targets
# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
# find_host_package (PROGRAM ARGS)
# A macro used to find executable programs on the host system, not within the iOS environment.
# Thanks to the android-cmake project for providing the command
if(NOT IOS)
return()
endif()
set(CMAKE_SYSTEM_NAME Darwin)
# Get the Xcode version being used.
execute_process(COMMAND xcodebuild -version
OUTPUT_VARIABLE XCODE_VERSION
RESULT_VARIABLE XCODE_VERSION_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT ${XCODE_VERSION_RESULT})
string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}")
string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}")
message(STATUS "Building with Xcode version: ${XCODE_VERSION}")
else()
message(FATAL_ERROR "Cannot execute xcodebuild, please check whether xcode is installed.")
endif()
# Required as of cmake 2.8.10
set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
# Setup iOS platform unless specified manually with IOS_PLATFORM
if(NOT DEFINED IOS_PLATFORM)
set(IOS_PLATFORM "OS")
endif()
set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
# Set the architecture for iOS
if(NOT DEFINED IOS_ARCH)
if(IOS_PLATFORM STREQUAL "OS")
# FIXME(liuyiqun): support "armv7;armv7s;arm64" future
set(IOS_ARCH "arm64")
elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
set(IOS_ARCH "i386;x86_64")
elseif(IOS_PLATFORM STREQUAL "WATCHOS")
set(IOS_ARCH armv7k)
endif()
endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
# Specify minimum iOS deployment version
if(NOT DEFINED IOS_DEPLOYMENT_TARGET)
set(IOS_DEPLOYMENT_TARGET "7.0")
endif()
set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version")
# Whether to enable bitcode
if(NOT DEFINED IOS_ENABLE_BITCODE)
set(IOS_ENABLE_BITCODE ON)
endif()
set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode")
if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS)
set(IOS_USE_VECLIB_FOR_BLAS OFF)
endif()
set(IOS_USE_VECLIB_FOR_BLAS ${IOS_UES_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib")
# Check the platform selection and setup for developer root
if(${IOS_PLATFORM} STREQUAL "OS")
set(IOS_PLATFORM_LOCATION "iPhoneOS.platform")
set(XCODE_IOS_PLATFORM iphoneos)
# This causes the installers to properly locate the output libraries
set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR")
set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
set(XCODE_IOS_PLATFORM iphonesimulator)
# This causes the installers to properly locate the output libraries
set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
elseif(${IOS_PLATFORM} STREQUAL "WATCHOS")
set(IOS_PLATFORM_LOCATION "WatchOS.platform")
set(XCODE_IOS_PLATFORM watchos)
# This causes the installers to properly locate the output libraries
set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos")
else(${IOS_PLATFORM} STREQUAL "OS")
message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please set to\n"
"\t OS, SIMULATOR, or WATCHOS.")
endif()
# Check iOS developer toolchain
if(NOT DEFINED IOS_DEVELOPER_ROOT)
# Setup iOS developer location
execute_process(COMMAND xcode-select -print-path
OUTPUT_VARIABLE XCODE_DEVELOPER_DIR
RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# Xcode 4.3 changed the installation location, choose the most recent one available
if(${XCODE_VERSION} VERSION_LESS "4.3.0")
set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
else()
set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
endif()
endif()
if(EXISTS ${IOS_DEVELOPER_ROOT})
set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")
else()
message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.")
endif()
# Check iOS SDK
if(NOT DEFINED IOS_SDK_ROOT)
# Find and use the most recent iOS sdk
file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*")
if(IOS_SDK_LISTS)
list(SORT IOS_SDK_LISTS)
list(REVERSE IOS_SDK_LISTS)
list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT)
else(IOS_SDK_LISTS)
message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}."
" Please manually set IOS_SDK_ROOT or install the iOS SDK.")
endif(IOS_SDK_LISTS)
endif()
if(EXISTS ${IOS_SDK_ROOT})
set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")
message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}")
else()
message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.")
endif()
# Set the sysroot default to the most recent SDK
set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
# Get version of iOS SDK
execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion
OUTPUT_VARIABLE IOS_SDK_VERSION
RESULT_VARIABLE IOS_SDK_VERSION_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${IOS_SDK_VERSION_RESULT})
string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}")
endif()
if(NOT IOS_SDK_VERSION)
message(WARNING "Cannot get SDK's version.")
set(IOS_SDK_VERSION 1)
endif()
set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION})
# Find the C & C++ compilers for the specified SDK.
if(NOT CMAKE_C_COMPILER)
# Default to use clang
execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang
OUTPUT_VARIABLE IOS_C_COMPILER
RESULT_VARIABLE IOS_C_COMPILER_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${IOS_C_COMPILER_RESULT})
get_filename_component(IOS_C_COMPILER clang PROGRAM)
endif()
else(NOT CMAKE_C_COMPILER)
# User can set it in cmake command
get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
endif(NOT CMAKE_C_COMPILER)
if(NOT EXISTS ${IOS_C_COMPILER})
message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}")
endif()
if(NOT CMAKE_CXX_COMPILER)
# Default to use clang++
execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++
OUTPUT_VARIABLE IOS_CXX_COMPILER
RESULT_VARIABLE IOS_CXX_COMPILER_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${IOS_CXX_COMPILER_RESULT})
get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM)
endif()
else(NOT CMAKE_CXX_COMPILER)
# User can set it in cmake command
get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
endif(NOT CMAKE_CXX_COMPILER)
if(NOT EXISTS ${IOS_CXX_COMPILER})
message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}")
endif()
set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE)
set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
# Set iOS specific C/C++ flags
if(IOS_PLATFORM STREQUAL "OS")
if(XCODE_VERSION VERSION_LESS "7.0")
set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}")
else()
# Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM.
set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}")
endif()
else()
set(XCODE_IOS_FLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
endif()
if(IOS_ENABLE_BITCODE)
set(XCODE_IOS_BITCODE_FLAGS "${IOS_COMPILER_FLAGS} -fembed-bitcode")
else()
set(XCODE_IOS_BITCODE_FLAGS "")
endif()
set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}")
# Hidden visibilty is required for cxx on iOS
set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
if(IOS_USE_VECLIB_FOR_BLAS)
# Find vecLib for iOS
set(VECLIB_SEARCH_DIRS
${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks
${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks
)
find_path(VECLIB_INC_DIR vecLib.h PATHS ${VECLIB_SEARCH_DIRS}/vecLib.framework/Headers)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR)
if(VECLIB_FOUND)
if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*")
set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework vecLib")
message(STATUS "Found standalone vecLib.framework")
else()
set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework Accelerate")
message(STATUS "Found vecLib as part of Accelerate.framework")
endif()
endif()
endif()
set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}")
set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}")
set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
if(NOT IOS_ENABLE_BITCODE)
set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
else()
set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib")
set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle")
endif()
set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree
# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
endif()
# Set the find root to the iOS developer roots and to user defined paths
set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH}
CACHE string "iOS find search path root")
# default to searching for frameworks first
set(CMAKE_FIND_FRAMEWORK FIRST)
# set up the default search directories for frameworks
set(CMAKE_SYSTEM_FRAMEWORK_PATH
${IOS_SDK_ROOT}/System/Library/Frameworks
${IOS_SDK_ROOT}/System/Library/PrivateFrameworks
${IOS_SDK_ROOT}/Developer/Library/Frameworks
)
# only search the iOS sdks, not the remainder of the host filesystem
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', "
"building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'")
message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
# Used in ExternalProject command
string(REPLACE ";" "\\$<SEMICOLON>" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
set(EXTERNAL_OPTIONAL_ARGS
-DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT}
-DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES})
# This little macro lets you set any XCode specific property
macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
endmacro(set_xcode_property)
# This macro lets you find executable programs on the host system
macro(find_host_package)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
set(IOS FALSE)
find_package(${ARGN})
set(IOS TRUE)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
endmacro(find_host_package)
......@@ -39,13 +39,14 @@ ExternalProject_Add(
PREFIX ${GFLAGS_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DBUILD_TESTING=OFF
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=Release
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
......
......@@ -34,16 +34,17 @@ ExternalProject_Add(
PREFIX ${GLOG_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DWITH_GFLAGS=ON
CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
CMAKE_ARGS -DBUILD_TESTING=OFF
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DWITH_GFLAGS=ON
-Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=Release
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
......
......@@ -48,15 +48,16 @@ IF(WITH_TESTING)
PREFIX ${GTEST_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DBUILD_GMOCK=ON
CMAKE_ARGS -Dgtest_disable_pthreads=ON
CMAKE_ARGS -Dgtest_force_shared_crt=ON
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_GMOCK=ON
-Dgtest_disable_pthreads=ON
-Dgtest_force_shared_crt=ON
-DCMAKE_BUILD_TYPE=Release
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
......
......@@ -29,30 +29,41 @@ IF(NOT ${CBLAS_FOUND})
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
IF(APPLE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
ELSE()
SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
ENDIF()
SET(OPENBLAS_CC "${CMAKE_C_COMPILER}")
IF(CMAKE_CROSSCOMPILING)
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
IF(ANDROID)
# arm_soft_fp_abi branch of OpenBLAS to support softfp
# https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(TARGET "ARMV7")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(TARGET "ARMV8")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF()
ELSEIF(IOS)
# FIXME(liuyiqun): support multiple architectures
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
ENDIF()
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(RPI)
# use hardfp
SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
ENDIF()
ELSE()
IF(APPLE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
ENDIF()
SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS "")
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
......@@ -60,6 +71,8 @@ IF(NOT ${CBLAS_FOUND})
ENDIF()
ENDIF()
SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
ExternalProject_Add(
extern_openblas
${EXTERNAL_PROJECT_LOG_ARGS}
......
......@@ -173,7 +173,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
"-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
"-Dprotobuf_WITH_ZLIB=ON"
"-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}")
"-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}"
${EXTERNAL_OPTIONAL_ARGS})
SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
ENDIF()
......
......@@ -12,16 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
IF(NOT WITH_PYTHON)
return()
ENDIF()
INCLUDE(python_module)
FIND_PACKAGE(PythonInterp 2.7)
IF(WITH_PYTHON)
FIND_PACKAGE(PythonLibs 2.7)
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
ENDIF(WITH_PYTHON)
FIND_PACKAGE(PythonLibs 2.7)
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
SET(py_env "")
IF(PYTHONINTERP_FOUND)
......@@ -36,9 +37,5 @@ IF(PYTHONINTERP_FOUND)
ENDIF()
ENDIF(PYTHONINTERP_FOUND)
IF(WITH_PYTHON)
INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
ELSE()
SET(PYTHON_LIBRARIES "")
ENDIF()
INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
......@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT WITH_SWIG_PY)
return()
ENDIF()
FIND_PACKAGE(SWIG)
IF(NOT SWIG_FOUND)
......
......@@ -16,25 +16,14 @@ INCLUDE(ExternalProject)
SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE)
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE)
IF(WIN32)
SET(WARPCTC_LIBRARIES
"${WARPCTC_INSTALL_DIR}/lib/warpctc.dll" CACHE FILEPATH "Warp-ctc Library" FORCE)
ELSE(WIN32)
IF(APPLE)
SET(_warpctc_SHARED_SUFFIX dylib)
ELSE(APPLE)
SET(_warpctc_SHARED_SUFFIX so)
ENDIF(APPLE)
SET(WARPCTC_LIBRARIES
"${WARPCTC_INSTALL_DIR}/lib/libwarpctc.${_warpctc_SHARED_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE)
ENDIF(WIN32)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
# Used in unit test test_WarpCTCLayer
SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
CACHE PATH "Warp-ctc Library Directory" FORCE)
SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
CACHE FILEPATH "Warp-ctc Library" FORCE)
IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
SET(USE_OMP OFF)
......@@ -49,22 +38,26 @@ ExternalProject_Add(
PREFIX ${WARPCTC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
CMAKE_ARGS -DWITH_GPU=${WITH_GPU}
CMAKE_ARGS -DWITH_OMP=${USE_OMP}
CMAKE_ARGS -DWITH_TORCH=OFF
CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
CMAKE_ARGS -DBUILD_SHARED=ON
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-DWITH_GPU=${WITH_GPU}
-DWITH_OMP=${USE_OMP}
-DWITH_TORCH=OFF
-DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-DBUILD_SHARED=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=Release
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
)
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
ADD_DEPENDENCIES(warpctc extern_warpctc)
......
......@@ -34,15 +34,16 @@ ExternalProject_Add(
GIT_TAG "v1.2.8"
PREFIX ${ZLIB_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DCMAKE_MACOSX_RPATH=ON
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
-DBUILD_SHARED_LIBS=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_MACOSX_RPATH=ON
-DCMAKE_BUILD_TYPE=Release
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
......
......@@ -128,8 +128,10 @@ set(GPU_COMMON_FLAGS
)
if (APPLE)
# On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
if(NOT CMAKE_CROSSCOMPILING)
# On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
endif()
else()
set(GPU_COMMON_FLAGS
-Wall
......
......@@ -24,11 +24,10 @@ IF(WIN32)
SET(HOST_SYSTEM "win32")
ELSE(WIN32)
IF(APPLE)
EXEC_PROGRAM (sw_vers ARGS -productVersion OUTPUT_VARIABLE MACOSX_VERSION)
STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}")
SET(MACOS_VERSION ${VERSION})
SET(HOST_SYSTEM "macosx")
IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION)
STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}")
IF(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET})
# Set cache variable - end user may change this during ccmake or cmake-gui configure.
SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
"Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
......@@ -49,6 +48,8 @@ ELSE(WIN32)
ELSEIF(LINUX_ISSUE MATCHES "Fedora")
SET(HOST_SYSTEM "fedora")
ENDIF()
STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}")
ENDIF(EXISTS "/etc/issue")
IF(EXISTS "/etc/redhat-release")
......@@ -70,7 +71,7 @@ CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}")
MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
# configuration for cross-compiling
......@@ -82,6 +83,9 @@ IF(DEFINED CMAKE_SYSTEM_NAME)
ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi")
SET(RPI TRUE)
INCLUDE(cross_compiling/raspberry_pi)
ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
SET(IOS TRUE)
INCLUDE(cross_compiling/ios)
ENDIF()
ENDIF()
......
......@@ -25,7 +25,9 @@ function(target_circle_link_libraries TARGET_NAME)
endif()
endforeach()
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
list(APPEND LIBS "-undefined dynamic_lookup")
if(IOS AND NOT IOS_ENABLE_BITCODE)
list(APPEND LIBS "-undefined dynamic_lookup")
endif()
endif()
list(REVERSE libsInArgn)
target_link_libraries(${TARGET_NAME}
......
# Design Doc: Refactorization Overview
The goal of refactorizaiton include:
1. Make it easy for external contributors to write new elementory computaiton operations.
1. Make the codebase clean and readable.
1. Introduce a new design of computation representation -- a computation graph of operators and variables.
1. The graph representation helps implementing auto-scalable and auto fault recoverable distributed computing.
## Computation Graphs
1. PaddlePaddle represent the computation, training and inference of DL models, by computation graphs.
1. Please dig into [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a solid example.
1. Users write Python programs to describe the graphs and run it (locally or remotely).
1. A graph is composed of *variabels* and *operators*.
1. The description of graphs must be able to be serialized/deserialized, so it
1. could to be sent to the cloud for distributed execution, and
1. be sent to clients for mobile or enterprise deployment.
1. The Python program do
1. *compilation*: runs a Python program to generate a protobuf message representation of the graph and send it to
1. the C++ library `libpaddle.so` for local execution,
1. the master process of a distributed training job for training, or
1. the server process of a Kubernetes serving job for distributed serving.
1. *execution*: according to the protobuf message, constructs instances of class `Variable` and `OperatorBase`, and run them.
## Description and Realization
At compile time, the Python program generates protobuf message representation of the graph, or the description of the graph.
At runtime, the C++ program realizes the graph and run it.
| | Representation (protobuf messages) | Realization (C++ class objects) |
|---|---|---|
|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
|Block|BlockDesc|Block|
The word *graph* is exchangable with *block* in this document. A graph represent computation steps and local variables as a C++/Java program block, or a pair of { and }.
## Compilation and Execution
1. Run an applicaton Python program to describe the graph. In particular,
1. create VarDesc to represent local/intermediate variables,
1. create operators and set attributes,
1. validate attribute values,
1. inference the type and the shape of variables,
1. plan for memory-reuse for variables,
1. generate backward and optimization part of the Graph.
1. possiblly split the graph for distributed training.
1. The invocation of `train` or `infer` in the application Python program:
1. create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
1. realize local variables defined in the BlockDesc message in the new scope,
1. a scope is similar to the stack frame in programming languages,
1. create an instance of class `Block`, in which,
1. realize operators in the BlockDesc message,
1. run the Block by calling
1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
1. `Block::Eval(vector<Operator>* targets)` for optimization.
## Intermediate Representation (IR)
```text
Compile Time -> IR -> Runtime
```
### Benefit
- Optimization
```text
Compile Time -> IR -> Optimized IR -> Runtime
```
- Send automatically partitioned IR to different nodes.
- Automatic data parallel
```text
Compile Time
|-> Single GPU IR
|-> [trainer-IR-0, trainer-IR-1, pserver-IR]
|-> Node-0 (runs trainer-IR-0)
|-> Node-1 (runs trainer-IR-1)
|-> Node-2 (runs pserver-IR)
```
- Automatic model parallel (planned for future)
---
# Operator/OpWithKernel/OpKernel
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
---
# Operator
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
* `Operator` is the fundamental building block as the user interface.
* Operator stores input/output variable name, and attributes.
* The `InferShape` interface is used to infer output variable shapes by its input shapes.
* Use `Run` to compute `input variables` to `output variables`.
---
# OpWithKernel/Kernel
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
* `OpWithKernel` inherits `Operator`.
* `OpWithKernel` contains a Kernel map.
* `OpWithKernel::Run` get device's kernel, and invoke `OpKernel::Compute`.
* `OpKernelKey` is the map key. Only device place now, but may be data type later.
---
# Why separate Kernel and Operator
* Separate GPU and CPU code.
* Make Paddle can run without GPU.
* Make one operator (which is user interface) can contain many implementations.
* Same mul op, different FP16, FP32 Kernel. different MKL, eigen kernel.
---
# Libraries for Kernel development
* `Eigen::Tensor` contains basic math and element-wise functions.
* Note that `Eigen::Tensor` has broadcast implementation.
* Limit number of `tensor.device(dev) = ` in your code.
* `thrust::tranform` and `std::transform`.
* `thrust` has the same API as C++ standard library. Using `transform` can quickly implement a customized elementwise kernel.
* `thrust` has more complex API, like `scan`, `reduce`, `reduce_by_key`.
* Hand-writing `GPUKernel` and `CPU` code
* Do not write `.h`. CPU Kernel should be in `.cc`. CPU kernel should be in `.cu`. (`GCC` cannot compile GPU code.)
---
# Operator Register
## Why register is necessary?
We need a method to build mappings between Op type names and Op classes.
## How to do the register?
Maintain a map, whose key is the type name and value is corresponding Op constructor.
---
# The Registry Map
### `OpInfoMap`
`op_type(string)` -> `OpInfo`
`OpInfo`:
- **`creator`**: The Op constructor.
- **`grad_op_type`**: The type of the gradient Op.
- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes.
- **`checker`**: Used to check attributes.
---
# Related Concepts
### Op_Maker
It's constructor takes `proto` and `checker`. They are compeleted during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
### Register Macros
```cpp
REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
```
### `USE` Macros
make sure the registration process is executed and linked.
---
# Register Process
1. Write Op class, as well as its gradient Op class if there is.
2. Write Op maker class. In the constructor, describe its inputs, outputs, and attributes.
3. Invoke macro `REGISTER_OP`. The macro will
1. call maker class to complete `proto` and `checker`
2. with the completed `proto` and `checker`, build a new key-value pair in the `OpInfoMap`
4. Invoke `USE` macro in where the Op is used to make sure it is linked.
---
# Backward Module (1/2)
### Create Backward Operator
- Mapping from forwarding Op to backward Op
![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
---
# Backward Module (2/2)
### Build Backward Network
- **Input** graph of forwarding operators
- **Output** graph of backward operators
- **corner case in construction**
- shared variable => insert `Add` operator
- no gradient => insert `fill_zero_grad` operator
- recursive netOp => call `Backward` recursively
- RNN Op => recursively call `Backward` on stepnet
---
# Scope, Variable, Tensor
* `Tensor` is an n-dimension array with type.
* Only dims and data pointers are stored in `Tensor`.
* All operators on `Tensor` is written in `Operator` or global functions.
* variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
* `Variable` is the inputs and outputs of an operator. Not just `Tensor`.
* step_scopes in RNN is a variable and not a tensor.
* `Scope` is where variables store at.
* map<string/*var name */, Variable>
* `Scope` has a hierarchical structure. The local scope can get variable from its parent scope.
---
# Block (in design)
## the difference with original RNNOp
- as an operator is more intuitive than `RNNOp`,
- offers new interface `Eval(targets)` to deduce the minimal block to `Run`,
- fits the compile-time/ runtime separation design.
- during the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc`
- when graph executes, a Block with `BlockDesc` passed in creates `Op` and `Var` then `Run`
---
# Milestone
- take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
- model migration
- framework development gives **priority support** to model migration, for example,
- the MNIST demo needs a Python interface,
- the RNN models require the framework to support `LoDTensor`.
- determine some timelines,
- heavily-relied Ops need to be migrated first,
- different models can be migrated parallelly.
- improve the framework at the same time
- accept imperfection, concentrated on solving the specific problem at the right price.
---
# Control the migration quality
- compare the performance of migrated models with old ones.
- follow google C style
- build the automatic workflow of generating Python/C++ documentations
- the documentation of layers and ops should be written inside the code
- take the documentation quality into account when doing PR
- preview the documentations, read and improve them from users' perspective
......@@ -19,7 +19,7 @@ if(Boost_FOUND)
endif()
if(WITH_C_API)
add_subdirectory(capi)
add_subdirectory(capi)
endif()
if(WITH_SWIG_PY)
......
......@@ -28,42 +28,38 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
add_dependencies(paddle_capi paddle_proto)
# combine all paddle static libraries together, into libpaddle_capi_whole.a
# user should use PaddleCAPI as -lpaddle_capi_whole
set(capi_whole_library libpaddle_capi_whole.a)
add_custom_target(paddle_capi_whole ALL
COMMAND mkdir -p o_files/capi && cd o_files/capi/ && ar -x $<TARGET_FILE:paddle_capi>
COMMAND mkdir -p o_files/utils && cd o_files/utils/ && ar -x $<TARGET_FILE:paddle_utils>
COMMAND mkdir -p o_files/parameter && cd o_files/parameter/ && ar -x $<TARGET_FILE:paddle_parameter>
COMMAND mkdir -p o_files/math && cd o_files/math/ && ar -x $<TARGET_FILE:paddle_math>
COMMAND mkdir -p o_files/cuda && cd o_files/cuda/ && ar -x $<TARGET_FILE:paddle_cuda>
COMMAND mkdir -p o_files/function && cd o_files/function/ && ar -x $<TARGET_FILE:paddle_function>
COMMAND mkdir -p o_files/gserver && cd o_files/gserver/ && ar -x $<TARGET_FILE:paddle_gserver>
COMMAND mkdir -p o_files/proto && cd o_files/proto/ && ar -x $<TARGET_FILE:paddle_proto>
COMMAND mkdir -p o_files/network && cd o_files/network/ && ar -x $<TARGET_FILE:paddle_network>
COMMAND mkdir -p o_files/pserver && cd o_files/pserver/ && ar -x $<TARGET_FILE:paddle_pserver>
COMMAND ar crs ${capi_whole_library} `find ./o_files -name '*.o'`
COMMAND rm -rf o_files
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS paddle_capi paddle_utils paddle_parameter paddle_math
paddle_cuda paddle_function paddle_gserver
paddle_proto paddle_pserver paddle_network
)
set_target_properties(paddle_capi_whole
PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library})
set(PADDLE_CAPI_INFER_LIBS
paddle_utils
paddle_parameter
paddle_math
paddle_cuda
paddle_function
paddle_gserver
paddle_proto
paddle_pserver
paddle_network)
cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
# TODO: merge mkl into paddle_capi_shared
add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
link_paddle_exe(paddle_capi_shared)
# No shared library for iOS
if(NOT IOS)
set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
# TODO: merge mkl into paddle_capi_shared
add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
link_paddle_exe(paddle_capi_shared)
endif()
# install library & headers.
install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
if(ANDROID)
install(TARGETS paddle_capi_whole paddle_capi_shared
ARCHIVE DESTINATION lib/${ANDROID_ABI}
LIBRARY DESTINATION lib/${ANDROID_ABI})
execute_process(
COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1
OUTPUT_VARIABLE GIT_COMMITS_LIST
......@@ -72,9 +68,6 @@ if(ANDROID)
if(${GIT_COMMITS_LIST_RESULT})
set(GIT_COMMITS_LIST "No commits.")
endif()
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library}
DESTINATION lib/${ANDROID_ABI})
install(TARGETS paddle_capi_shared DESTINATION lib/${ANDROID_ABI})
install(CODE "FILE(WRITE ${CMAKE_INSTALL_PREFIX}/lib/${ANDROID_ABI}/BUILD.txt
\"Compiler:\n\"
\"\\t${CMAKE_C_COMPILER}\\n\"
......@@ -88,8 +81,11 @@ if(ANDROID)
)"
)
else(ANDROID)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib)
install(TARGETS paddle_capi_shared DESTINATION lib)
install(TARGETS paddle_capi_whole
ARCHIVE DESTINATION lib)
if(NOT IOS)
install(TARGETS paddle_capi_shared DESTINATION lib)
endif()
endif(ANDROID)
# this variable used for unittest
......
......@@ -22,10 +22,10 @@ limitations under the License. */
*/
typedef enum {
HL_POOLING_MAX = 0,
// average includes padded values
HL_POOLING_AVERAGE = 1,
// average does not include padded values
HL_POOLING_AVERAGE_EXCLUDE_PADDING = 2,
HL_POOLING_AVERAGE = 1,
// average includes padded values
HL_POOLING_AVERAGE_INCLUDE_PADDING = 2,
HL_POOLING_END
} hl_pooling_mode_t;
......
......@@ -461,7 +461,7 @@ class add<float32x4_t> {
public:
INLINE float32x4_t operator()(const float32x4_t a,
const float32x4_t b) const {
return vmulq_f32(a, b);
return vaddq_f32(a, b);
}
};
......
......@@ -211,13 +211,11 @@ __global__ void KeAvgPoolForward(const int nthreads,
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int pool_size = (hend - hstart) * (wend - wstart);
int hend = min(hstart + sizeY, height);
int wend = min(wstart + sizeX, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
int pool_size = (hend - hstart) * (wend - wstart);
real aveval = 0;
inputData += (frameNum * channels + c) * height * width;
......@@ -299,12 +297,14 @@ __global__ void KeAvgPoolBackward(const int nthreads,
outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
for (int ph = phstart; ph < phend; ++ph) {
int hstart = ph * strideH - padH;
int hend = min(hstart + sizeY, height);
hstart = max(hstart, 0);
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int wend = min(wstart + sizeX, width);
wstart = max(wstart, 0);
int poolsize = (hend - hstart) * (wend - wstart);
gradient += outGrad[ph * pooledW + pw] / poolsize;
}
......@@ -600,16 +600,13 @@ __global__ void KeAvgPool3DForward(const int nthreads,
int dstart = pd * strideD - padD;
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int dend = min(dstart + sizeZ, depth + padD);
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
int dend = min(dstart + sizeZ, depth);
int hend = min(hstart + sizeY, height);
int wend = min(wstart + sizeX, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dend = min(dend, depth);
hend = min(hend, height);
wend = min(wend, width);
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
real aveval = 0;
inputData += (frameNum * channels + c) * depth * height * width;
......@@ -712,15 +709,18 @@ __global__ void KeAvgPool3DBackward(const int nthreads,
outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
for (int pd = pdstart; pd < pdend; ++pd) {
int dstart = pd * strideD - padD;
int dend = min(dstart + sizeZ, depth);
dstart = max(dstart, 0);
for (int ph = phstart; ph < phend; ++ph) {
int hstart = ph * strideH - padH;
int hend = min(hstart + sizeY, height);
hstart = max(hstart, 0);
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int dstart = pd * strideD - padD;
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int dend = min(dstart + sizeZ, depth + padD);
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int wend = min(wstart + sizeX, width);
wstart = max(wstart, 0);
int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
}
......
......@@ -432,11 +432,11 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
cudnn_mode = CUDNN_POOLING_MAX;
break;
case HL_POOLING_AVERAGE:
cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
break;
case HL_POOLING_AVERAGE_INCLUDE_PADDING:
cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
default:
LOG(FATAL) << "parameter mode error";
}
......
......@@ -22,14 +22,14 @@ namespace framework {
template <>
Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
platform::CPUPlace, Eigen::DefaultDevice>() const {
return *device_context_->get_eigen_device<Eigen::DefaultDevice>();
return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
}
#ifndef PADDLE_ONLY_CPU
template <>
Eigen::GpuDevice&
ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
return *device_context_->get_eigen_device<Eigen::GpuDevice>();
return *device_context_.get_eigen_device<Eigen::GpuDevice>();
}
#endif
......
......@@ -366,7 +366,7 @@ struct EigenDeviceConverter<platform::GPUPlace> {
class ExecutionContext : public InferShapeContext {
public:
ExecutionContext(const OperatorBase& op, const Scope& scope,
const platform::DeviceContext* device_context)
const platform::DeviceContext& device_context)
: InferShapeContext(op, scope), device_context_(device_context) {}
template <typename PlaceType,
......@@ -374,9 +374,9 @@ class ExecutionContext : public InferShapeContext {
typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
DeviceType& GetEigenDevice() const;
platform::Place GetPlace() const { return device_context_->GetPlace(); }
platform::Place GetPlace() const { return device_context_.GetPlace(); }
const platform::DeviceContext* device_context() const {
const platform::DeviceContext& device_context() const {
return device_context_;
}
......@@ -401,7 +401,8 @@ class ExecutionContext : public InferShapeContext {
return res;
}
const platform::DeviceContext* device_context_;
private:
const platform::DeviceContext& device_context_;
};
template <>
......@@ -461,7 +462,7 @@ class OperatorWithKernel : public OperatorBase {
void Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const final {
auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
opKernel->Compute(ExecutionContext(*this, scope, &dev_ctx));
opKernel->Compute(ExecutionContext(*this, scope, dev_ctx));
}
static std::unordered_map<std::string /* op_type */, OpKernelMap>&
......
......@@ -52,7 +52,7 @@ public:
int outputHeight = output[2];
int outputWidth = output[3];
int filterMultiplier = outputChannels / groups_;
CHECK_EQ(inputChannels, groups_);
CHECK_EQ(static_cast<size_t>(inputChannels), groups_);
// only support strideH() == strideW() and filterHeight == filterWidth.
CHECK_EQ(strideH(), strideW());
......
......@@ -22,9 +22,12 @@ limitations under the License. */
#include <type_traits>
#include "paddle/parameter/Argument.h"
#include "paddle/utils/ClassRegistrar.h"
#include "paddle/utils/Logging.h"
#ifdef PADDLE_USE_MKLDNN
#include "MKLDNNActivation.h"
#endif
namespace paddle {
static ClassRegistrar<ActivationFunction> gActivationRegistrar;
......@@ -456,6 +459,12 @@ Error __must_check backward(Argument& act) {
END_DEFINE_ACTIVATION(log)
ActivationFunction* ActivationFunction::create(const std::string& type) {
#ifdef PADDLE_USE_MKLDNN
if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) {
return MKLDNNActivation::create(type);
}
#endif
return gActivationRegistrar.createByType(type);
}
......
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNActivation.h"
#include "mkldnn.hpp"
#include "paddle/utils/ClassRegistrar.h"
namespace paddle {
static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
/**
* @def MKLDNN_ACTIVATION_CLASS_NAME
* @note MKLDNN_ACTIVATION_CLASS_NAME(relu) relu_;
* means mkldnn_reluActivation relu_;
*/
#define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation
/**
* @def DEFINE_MKLDNN_ELTWISE_ACTIVATION
*/
#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(ACT_TYPE, ALPHA, BWD_ALPHA) \
class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) \
: public MKLDNNEltwiseActivation { \
private: \
static const std::string name; \
static const float alpha; \
static const float bwdAlpha; \
\
public: \
const std::string& getName() const { return name; } \
float getAlpha() const { return alpha; } \
float getBwdAlpha() const { return bwdAlpha; } \
}; \
const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name = \
"mkldnn_" #ACT_TYPE; \
const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA; \
const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA; \
static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] { \
gMKLDNNActivationRegistrar \
.registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>( \
"mkldnn_" #ACT_TYPE); \
});
/**
* @brief MKLDNN Relu Activation.
* Actually mkldnn_relu is Leaky Relu.
* f(x) = x (x >= 0)
* f(x) = negative_slope * x (x < 0)
* @note the negative_slope should be -0.f in forward
*/
DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, -0.f, 0.f)
/**
* @brief MKLDNN Tanh Activation.
*/
DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, 0.f, 0.f)
/**
* @brief MKLDNN ELU(Exponential Linear Unit) Activation.
* f(x) = x (x >= 0)
* f(x) = negative_slope * (exp(x) - 1) (x < 0)
*/
DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, 0.f, 0.f)
ActivationFunction* MKLDNNActivation::create(const std::string& type) {
return gMKLDNNActivationRegistrar.createByType(type);
}
std::vector<std::string> MKLDNNActivation::getAllRegisteredTypes() {
std::vector<std::string> types;
gMKLDNNActivationRegistrar.forEachType(
[&](const std::string& type) { types.push_back(type); });
return types;
}
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "ActivationFunction.h"
#include "mkldnn.hpp"
#include "paddle/gserver/layers/MKLDNNBase.h"
#include "paddle/math/MKLDNNMatrix.h"
#include "paddle/parameter/Argument.h"
namespace paddle {
/**
* @brief Base class of MKLDNN Activation.
* Common activation function are provieded,
* including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax
*/
class MKLDNNActivation : public ActivationFunction {
protected:
// input value element count
size_t cnt_;
// should not merge the resetBwd into resetFwd,
// because the grad data would be changing before backward.
bool needResetBwd_;
// mkldnn matrix, primitive, stream and pipeline
MKLDNNMatrixPtr val_;
MKLDNNMatrixPtr grad_;
std::shared_ptr<MKLDNNStream> stream_;
std::shared_ptr<mkldnn::primitive> fwd_;
std::shared_ptr<mkldnn::primitive> bwd_;
std::vector<mkldnn::primitive> pipelineFwd_;
std::vector<mkldnn::primitive> pipelineBwd_;
public:
MKLDNNActivation() : cnt_(0), needResetBwd_(true) {}
~MKLDNNActivation() {}
static ActivationFunction* create(const std::string& type);
static std::vector<std::string> getAllRegisteredTypes();
virtual const std::string& getName() const = 0;
virtual Error __must_check forward(Argument& act) = 0;
virtual Error __must_check backward(Argument& act) = 0;
};
/**
* @brief Base class of MKLDNN Eltwise Activation,
* includes mkldnn_relu, mkldnn_elu and mkldnn_tanh.
*/
class MKLDNNEltwiseActivation : public MKLDNNActivation {
typedef mkldnn::eltwise_forward eltwise_fwd;
typedef mkldnn::eltwise_backward eltwise_bwd;
protected:
// save the forward primitive desc, which can be used backward
std::shared_ptr<eltwise_fwd::primitive_desc> fwdPD_;
// eltwise_bwd need src input value
MKLDNNMatrixPtr inVal_;
// use for copy data
std::shared_ptr<mkldnn::reorder> copyInVal_;
public:
MKLDNNEltwiseActivation() {}
~MKLDNNEltwiseActivation() {}
virtual const std::string& getName() const = 0;
// in common, the alpha of forward and backward should be equal.
// but for relu, to avoid negative value, they should be opposite
virtual float getAlpha() const = 0;
virtual float getBwdAlpha() const = 0;
virtual float getBeta() const { return 0.f; }
virtual mkldnn::algorithm getAlgo(const std::string& type) const {
if (type == "mkldnn_relu") {
return mkldnn::algorithm::eltwise_relu;
} else if (type == "mkldnn_tanh") {
return mkldnn::algorithm::eltwise_tanh;
} else if (type == "mkldnn_elu") {
return mkldnn::algorithm::eltwise_elu;
} else {
LOG(FATAL) << "Unkown eltwise activation type: " << type;
}
return (mkldnn::algorithm)0;
}
/**
* reshape and reset the forward primitives
*/
void resetFwd(Argument& act) {
if (cnt_ == act.value->getElementCnt()) {
return;
}
cnt_ = act.value->getElementCnt();
stream_.reset(new MKLDNNStream());
auto eng = CPUEngine::Instance().getEngine();
// get algo setting
mkldnn::algorithm algo = getAlgo(this->getName());
// note: alpha represents the NegativeSlope when used in relu.
float alpha = getAlpha();
float beta = getBeta();
/// forward
pipelineFwd_.clear();
val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
if (val_ == nullptr) {
int bs = act.getBatchSize();
int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
int ic = cnt_ / bs / ih / iw;
CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
val_ = MKLDNNMatrix::create(
act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, eng);
CHECK(val_);
}
auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
algo,
val_->getMemoryDesc(),
alpha,
beta);
fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, eng));
// use inplace for forward but save input value before submit
inVal_ = val_;
if (act.grad) {
// only copy when need do backward
inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
CHECK(copyInVal_) << "should not be emptry";
pipelineFwd_.push_back(*copyInVal_);
}
fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
pipelineFwd_.push_back(*fwd_);
needResetBwd_ = true;
}
/**
* reset the backward primitives, can not merge into resetFwd as the grad data
* would be changing before backward.
*/
void resetBwd(Argument& act) {
if (!needResetBwd_) {
return;
}
needResetBwd_ = false;
mkldnn::algorithm algo = getAlgo(this->getName());
float alpha = getBwdAlpha();
float beta = getBeta();
grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
auto eng = CPUEngine::Instance().getEngine();
auto bwdDesc = eltwise_bwd::desc(
algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
CHECK(inVal_);
bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
pipelineBwd_.clear();
pipelineBwd_.push_back(*bwd_);
}
Error __must_check forward(Argument& act) {
resetFwd(act);
stream_->submit(pipelineFwd_);
return Error();
}
Error __must_check backward(Argument& act) {
resetBwd(act);
stream_->submit(pipelineBwd_);
return Error();
}
};
} // namespace paddle
......@@ -29,9 +29,9 @@ bool CudnnPoolLayer::typeCheck(const std::string &poolType,
if (mode) {
*mode = HL_POOLING_AVERAGE;
}
} else if (poolType == "cudnn-avg-excl-pad-pool") {
} else if (poolType == "cudnn-avg-incl-pad-pool") {
if (mode) {
*mode = HL_POOLING_AVERAGE_EXCLUDE_PADDING;
*mode = HL_POOLING_AVERAGE_INCLUDE_PADDING;
}
} else {
return false;
......
......@@ -143,7 +143,7 @@ void DetectionOutputLayer::forward(PassType passType) {
resetOutput(numKept, 7);
} else {
MatrixPtr outV = getOutputValue();
outV = NULL;
if (outV) outV->resize(0, 0);
return;
}
MatrixPtr outV = getOutputValue();
......
......@@ -294,12 +294,9 @@ void MKLDNNConvLayer::resetOutValue(
std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& out) {
out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc());
// change original output value from cpu matrix to mkldnn matrix
output_.value = std::dynamic_pointer_cast<Matrix>(out);
// create reorder if output value has cpu device and pd do not match
cpuOutVal_ = nullptr;
cpuOutVal_ = nullptr;
cvtOutVal_ = nullptr;
if (!outputIsOnlyMKLDNN()) {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
......
......@@ -172,12 +172,10 @@ void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
// change original output value to mkldnn output value
output_.value = std::dynamic_pointer_cast<Matrix>(out);
if (!outputIsOnlyMKLDNN()) {
// fc cpu output value do not need create convert
// just share point
getOutput(CPU_DEVICE).value->setData(output_.value->getData());
getOutput(CPU_DEVICE).value->setData(out->getData());
}
}
......
......@@ -119,6 +119,10 @@ public:
inputElemenCnt_ = elemenCnt;
reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
if (outVal_) {
// change original output value to mkldnn output value
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
}
convertWeightsFromPaddle();
needResetBwd_ = true;
}
......
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNPoolLayer.h"
#include "paddle/math/MathUtils.h"
#include "paddle/utils/Logging.h"
using namespace mkldnn; // NOLINT
typedef memory::format format;
namespace paddle {
REGISTER_LAYER(mkldnn_pool, MKLDNNPoolLayer);
bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
return false;
}
/* the size of inputs for pool-layer is 1 */
CHECK_EQ(config_.inputs_size(), 1);
const PoolConfig& conf = config_.inputs(0).pool_conf();
ic_ = conf.channels();
ih_ = conf.img_size_y();
iw_ = conf.img_size();
oc_ = ic_;
oh_ = conf.output_y();
ow_ = conf.output_x();
fh_ = conf.size_y();
fw_ = conf.size_x();
ph_ = conf.padding_y();
pw_ = conf.padding();
sh_ = conf.stride_y();
sw_ = conf.stride();
const std::string& type = conf.pool_type();
if (type == "max-projection") {
poolAlgo_ = algorithm::pooling_max;
} else if (type == "avg-projection") {
// paddle only use exclude_padding
poolAlgo_ = algorithm::pooling_avg_exclude_padding;
} else {
LOG(FATAL) << "unknow pooling type!";
}
return true;
}
void MKLDNNPoolLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
// ic_ and oc can not be changed
CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
<< "Input channel can not be changed";
// cal output sizes
// paddle used false caffeMode for pooling
oh = outputSize(ih, fh_, ph_, sh_, false);
ow = outputSize(iw, fw_, pw_, sw_, false);
reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow);
printSizeInfo();
}
void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetFwdBuffers(in, out);
resetFwdPD(fwdPD_, in, out);
resetFwdPipeline(pipeline, fwdPD_, in, out);
printValueFormatFlow();
}
void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
std::shared_ptr<pool_bwd::primitive_desc> pd;
resetBwdBuffers(in, out);
resetBwdPD(pd, in, out);
resetBwdPipeline(pipeline, pd, in, out);
printGradFormatFlow();
}
void MKLDNNPoolLayer::updateInputData() {
inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
}
void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
resetInValue(in);
resetOutValue(out);
}
void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {
if (inputIsOnlyMKLDNN()) {
const MatrixPtr& dnnIn = getInputValue(0);
in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
CHECK(in) << "Input should be MKLDNNMatrix";
} else {
CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
in = MKLDNNMatrix::create(
cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
}
}
void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
CHECK(inVal_) << "Should reset input value first";
memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
out = MKLDNNMatrix::create(
output_.value, outDims, inVal_->getFormat(), engine_);
// create reorder if output value has cpu device and pd do not match
cpuOutVal_ = nullptr;
cvtOutVal_ = nullptr;
if (!outputIsOnlyMKLDNN()) {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
CHECK(cvtOutVal_) << "should not be emptry";
} else {
// CPU output share the same data of MKLDNN output
cpuOut->setData(out->getData());
cpuOutVal_ = out;
}
}
}
void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
MKLDNNMatrixPtr out) {
memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
memory::dims kernels = memory::dims{fh_, fw_};
memory::dims strides = memory::dims{sh_, sw_};
memory::dims padL = memory::dims{ph_, pw_};
memory::dims padR = getPaddingR();
padding_kind padKind = padding_kind::zero;
prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
: prop_kind::forward_training;
auto fwdDesc = pool_fwd::desc(pk,
poolAlgo_,
in->getMemoryDesc(),
out->getMemoryDesc(),
strides,
kernels,
padL,
padR,
padKind);
pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_));
// prepare workspace if necessary
workspace_ =
(passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max)
? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
: nullptr;
}
void MKLDNNPoolLayer::resetFwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
pipeline.clear();
fwd_ = workspace_
? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
: std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
pipeline.push_back(*fwd_);
if (cvtOutVal_) {
pipeline.push_back(*cvtOutVal_);
}
}
void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
resetOutGrad(out);
resetInGrad(in);
}
void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
CHECK(outVal_) << "Should have output value";
out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
// create reorder if output value has cpu device and pd do not match
cpuOutGrad_ = nullptr;
cvtOutGrad_ = nullptr;
if (!outputIsOnlyMKLDNN()) {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
cpuOutGrad_ = MKLDNNMatrix::create(
cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
if (cpuOutGrad_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
CHECK(cvtOutGrad_) << "should not be emptry";
} else {
// share the same data of CPU output
output_.grad->setData(cpuOut->getData());
out = cpuOutGrad_;
}
}
}
void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
in = nullptr;
const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
if (inGrad == nullptr) {
return;
}
CHECK(inVal_);
in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
}
void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
memory::dims kernels = memory::dims{fh_, fw_};
memory::dims strides = memory::dims{sh_, sw_};
memory::dims padL = memory::dims{ph_, pw_};
memory::dims padR = getPaddingR();
CHECK(in);
CHECK(out);
auto bwdDesc = pool_bwd::desc(poolAlgo_,
in->getMemoryDesc(),
out->getMemoryDesc(),
strides,
kernels,
padL,
padR,
padding_kind::zero);
pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
}
void MKLDNNPoolLayer::resetBwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
pipeline.clear();
if (cvtOutGrad_) {
pipeline.push_back(*cvtOutGrad_);
}
bwdData_ =
workspace_
? std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *workspace_, *in))
: std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *in));
pipeline.push_back(*bwdData_);
}
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLDNNLayer.h"
#include "mkldnn.hpp"
namespace paddle {
typedef mkldnn::pooling_forward pool_fwd;
typedef mkldnn::pooling_backward pool_bwd;
/**
* @brief A subclass of MKLDNNLayer pool layer.
*
* The config file api is mkldnn_pool
*/
class MKLDNNPoolLayer : public MKLDNNLayer {
protected:
// padding height and width
int ph_, pw_;
// stride height and width
int sh_, sw_;
// filter(kenerl) height and width
int fh_, fw_;
// pooling_avg or pooling_max
mkldnn::algorithm poolAlgo_;
// MKLDNNMatrixPtr which should be created from CPU Device
MKLDNNMatrixPtr cpuOutVal_;
MKLDNNMatrixPtr cpuOutGrad_;
// convert handle between CPU device and MKLDNN device
std::shared_ptr<mkldnn::reorder> cvtOutVal_;
std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
// save forward primitive_desc, which can be used backward
std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
// according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
// test_pooling_forward.cpp, pool need workspace for backward
std::shared_ptr<mkldnn::memory> workspace_;
public:
explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
~MKLDNNPoolLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateInputData() override;
void printSizeInfo() override {
MKLDNNLayer::printSizeInfo();
VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
<< ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
<< ", sw: " << sw_;
}
protected:
/**
* Forward functions: reset buffers(input, output),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetInValue(MKLDNNMatrixPtr& in);
void resetOutValue(MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(input, output),
* reset primitive descriptor,
* reset pipeline.
*/
void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetOutGrad(MKLDNNMatrixPtr& out);
void resetInGrad(MKLDNNMatrixPtr& in);
void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out);
void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out);
/**
* get padding_r according to
* https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
* test_pooling_forward.cpp
*/
mkldnn::memory::dims getPaddingR() const {
mkldnn::memory::dims padR = {ph_, pw_};
for (int i = 0; i < 2; ++i) {
if ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) {
++padR[0];
}
if ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) {
++padR[1];
}
}
return padR;
}
};
} // namespace paddle
......@@ -73,9 +73,10 @@ void SequenceSliceLayer::checkInputs() {
CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer "
<< "must be a sequence.";
const MatrixPtr indices1 = getInputValue(1);
CHECK_EQ(static_cast<size_t>(indices1->getHeight()),
inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
: inputSeq.getNumSequences())
CHECK_EQ(
indices1->getHeight(),
static_cast<size_t>(inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
: inputSeq.getNumSequences()))
<< "Height of the second input should be equal to number of sequence "
<< "in the first input.";
if (inputLayers_.size() == 3) {
......@@ -151,7 +152,7 @@ void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k);
int seqLen = endPos - begPos + 1;
CHECK_GT(seqLen, 0U);
CHECK_GT(seqLen, 0);
for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
hasSubseq
? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)
......
......@@ -64,15 +64,17 @@ void MKLDNNTester::reset(const TestConfig& dnn,
configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i]));
}
refLayer_ = testLayers_[REF];
dnnLayer_ = std::dynamic_pointer_cast<MKLDNNLayer>(testLayers_[DNN]);
CHECK(dnnLayer_);
// for comparison with Paddle reference results,
// need manually add cpu device output for test
dnnLayer_->addOutputArgument(CPU_DEVICE);
dnnLayer_ = testLayers_[DNN];
EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size());
EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
setInputImgSize();
// for comparison with Paddle reference results,
// need manually add cpu device output for test
MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
if (dnnLayer) {
dnnLayer->addOutputArgument(CPU_DEVICE);
}
}
void MKLDNNTester::setInputImgSize() {
......@@ -122,7 +124,7 @@ void MKLDNNTester::randomTopDiffs() {
void MKLDNNTester::checkForward() {
VLOG(MKLDNN_ALL) << "Check Forward";
printTopDatas();
double delta = compareMatrix(dnnLayer_->getOutput(-1).value,
double delta = compareMatrix(dnnLayer_->getOutput(CPU_DEVICE).value,
refLayer_->getOutputValue());
EXPECT_LE(fabs(delta), eps_);
}
......@@ -155,7 +157,10 @@ void MKLDNNTester::checkBackwardWgts() {
vector<VectorPtr> dnnWgts; // used to temply save mkldnn weights
saveWgt(parameters_[DNN], dnnWgts);
dnnLayer_->convertWeightsToPaddle();
MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
if (dnnLayer) {
dnnLayer->convertWeightsToPaddle();
}
for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
......@@ -322,6 +327,10 @@ void MKLDNNTester::runOnce() {
// and clearTopDatas(REF) should be coverd by ref layers
clearBotDiffs(REF);
clearWgtDiffs(REF);
// it is necessary to clear bottom diffs when only activation is dnn type
if (configs_[DNN].layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) {
clearBotDiffs(DNN);
}
}
void MKLDNNTester::run(const TestConfig& dnn,
......@@ -333,8 +342,19 @@ void MKLDNNTester::run(const TestConfig& dnn,
float epsilon,
bool log,
int level) {
VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: " << dnn.layerConfig.type()
<< " vs " << ref.layerConfig.type();
CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
<< "should be MKLDNN layer or MKLDNN activation";
if (dnn.layerConfig.type() == ref.layerConfig.type()) {
VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
<< dnn.layerConfig.active_type() << " vs "
<< ref.layerConfig.active_type();
} else {
VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
<< dnn.layerConfig.type() << " vs "
<< ref.layerConfig.type();
}
ih_ = inputImgH;
iw_ = inputImgW;
iter_ = iter;
......
......@@ -41,8 +41,7 @@ protected:
vector<LayerMap> layerMaps_;
vector<vector<ParameterPtr>> parameters_;
vector<LayerPtr> testLayers_;
LayerPtr refLayer_;
MKLDNNLayerPtr dnnLayer_;
LayerPtr refLayer_, dnnLayer_;
/// run some iterations, all the result should pass
size_t iter_;
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <vector>
#include "MKLDNNTester.h"
#include "ModelConfig.pb.h"
#include "paddle/gserver/activations/MKLDNNActivation.h"
#include "paddle/math/MathUtils.h"
using namespace paddle; // NOLINT
......@@ -141,6 +142,110 @@ TEST(MKLDNNLayer, ConvLayer) {
testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1});
}
struct testPoolDesc {
int bs, ch; // input channel and output channel are the same
int ih, iw;
int oh, ow;
int fh, fw;
int ph, pw;
int sh, sw;
};
void testPoolLayer(const testPoolDesc& pm) {
const std::string compareTypes[] = {"mkldnn_pool", "pool"};
TestConfig cfg;
cfg.layerConfig.set_type(compareTypes[0]);
cfg.layerConfig.set_size(pm.ch * pm.oh * pm.ow);
cfg.inputDefs.push_back(
{INPUT_DATA,
"layer_0",
/* size of input layer= */ size_t(pm.ch * pm.ih * pm.iw),
0});
LayerInputConfig* input = cfg.layerConfig.add_inputs();
PoolConfig* pool = input->mutable_pool_conf();
pool->set_channels(pm.ch);
pool->set_img_size(pm.iw);
pool->set_img_size_y(pm.ih);
pool->set_output_x(pm.ow);
pool->set_output_y(pm.oh);
pool->set_size_x(pm.fw);
pool->set_size_y(pm.fh);
pool->set_padding(pm.pw);
pool->set_padding_y(pm.ph);
pool->set_stride(pm.sw);
pool->set_stride_y(pm.sh);
int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false);
int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false);
CHECK_EQ(ow, pm.ow) << "output size check failed";
CHECK_EQ(oh, pm.oh) << "output size check failed";
MKLDNNTester tester;
for (auto type : {"max-projection", "avg-projection"}) {
pool->set_pool_type(type);
TestConfig ref = cfg;
ref.layerConfig.set_type(compareTypes[1]);
for (auto bs : {pm.bs, 1}) {
tester.run(cfg, ref, bs, pm.ih, pm.iw);
}
}
}
TEST(MKLDNNLayer, PoolLayer) {
/* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw*/
testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2});
testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2});
testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2});
testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2});
testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2});
testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1});
testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1});
testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2});
}
struct testActDesc {
int bs, ch;
int ih, iw;
};
static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
cfg.biasSize = 0;
cfg.layerConfig.set_type("addto");
cfg.layerConfig.set_size(pm.ch * pm.ih * pm.iw);
cfg.inputDefs.push_back(
{INPUT_DATA,
"layer_0",
/* size of input layer= */ size_t(pm.ch * pm.ih * pm.iw),
0});
cfg.layerConfig.add_inputs();
}
void testActivation(std::string& type, const testActDesc& pm) {
const std::string compareTypes[] = {type, type.erase(0, 7)};
TestConfig cfg;
getAddtoConfig(cfg, pm);
TestConfig ref = cfg;
cfg.layerConfig.set_active_type(compareTypes[0]);
ref.layerConfig.set_active_type(compareTypes[1]);
MKLDNNTester tester;
for (auto bs : {pm.bs, 1}) {
tester.run(cfg, ref, bs, pm.ih, pm.iw);
}
}
TEST(MKLDNNActivation, Activations) {
auto types = MKLDNNActivation::getAllRegisteredTypes();
// TODO(TJ): mkldnn_softmax not implemented, paddle do not have elu activation
std::set<string> excluded{"mkldnn_softmax", "mkldnn_elu"};
for (auto type : types) {
if (excluded.count(type)) {
continue;
}
testActivation(type, {16, 64, 32, 32});
}
}
// TODO(TJ): add branch test
int main(int argc, char** argv) {
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <cmath>
#include "BaseMatrix.h"
#include "MathFunctions.h"
#include "NEONFunctions.h"
#include "SIMDFunctions.h"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_base.cuh"
......@@ -666,6 +667,13 @@ void BaseMatrixT<T>::relu(BaseMatrixT& b) {
applyBinary(binary::Relu<T>(), b);
}
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <>
void BaseMatrixT<float>::relu(BaseMatrixT& b) {
neon::relu(data_, b.data_, height_ * width_);
}
#endif
DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
template <class T>
void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {
......
......@@ -26,7 +26,7 @@ limitations under the License. */
#include <mkl_lapacke.h>
#endif
#ifdef PADDLE_USE_ATLAS
#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB)
extern "C" {
#include <cblas.h>
#include <clapack.h>
......
此差异已折叠。
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "NEONFunctions.h"
#include <arm_neon.h>
namespace paddle {
namespace neon {
// b[i] = a[i] > 0.0f ? a[i] : 0.0f
void relu(const float* a, float* b, int len) {
int offset = len % 16;
float32x4_t ma0, ma1, ma2, ma3;
float32x4_t mb0, mb1, mb2, mb3;
float32x4_t zero = vdupq_n_f32(0.f);
for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
ma0 = vld1q_f32(a);
ma1 = vld1q_f32(a + 4);
ma2 = vld1q_f32(a + 8);
ma3 = vld1q_f32(a + 12);
mb0 = vmaxq_f32(ma0, zero);
mb1 = vmaxq_f32(ma1, zero);
mb2 = vmaxq_f32(ma2, zero);
mb3 = vmaxq_f32(ma3, zero);
vst1q_f32(b, mb0);
vst1q_f32(b + 4, mb1);
vst1q_f32(b + 8, mb2);
vst1q_f32(b + 12, mb3);
}
for (int i = 0; i < offset; i++) {
b[i] = a[i] > 0.0f ? a[i] : 0.0f;
}
}
} // namespace neon
} // namespace paddle
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace neon {
void relu(const float* a, float* b, int len);
} // namespace neon
} // namespace paddle
......@@ -825,9 +825,8 @@ void testMaxPoolFwdBwd(int numSamples,
int strideW,
int padH,
int padW) {
int outH = 0, outW = 0;
outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
int inWidth = imgSizeH * imgSizeW * channels;
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
......@@ -927,9 +926,8 @@ void testAvgPoolFwdBwd(int numSamples,
int strideW,
int padH,
int padW) {
int outH = 0, outW = 0;
outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
int inWidth = imgSizeH * imgSizeW * channels;
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
......
......@@ -12,26 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
#include "paddle/operators/accuracy_op.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
__global__ void AccuracySingleKernel(const int N, const int D, const int top_k,
const int* Xdata, const int* labelData,
float* accuracy) {
int correct = 0;
for (int row = 0; row < N; row++) {
const int label = labelData[row];
for (int col = 0; col < D; col++) {
const int pred = Xdata[row * D + col];
if (pred == label) {
++correct;
template <int BlockSize>
__global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
const int* labeldata, float* accuracy) {
int count = 0;
__shared__ int total[BlockSize];
// support only 1 block
for (int i = threadIdx.x; i < (N); i += BlockSize) {
for (int j = 0; j < D; ++j) {
if (Xdata[i * D + j] == labeldata[i]) {
++count;
break;
}
}
}
*accuracy = static_cast<float>(correct) / static_cast<float>(N);
total[threadIdx.x] = count;
__syncthreads();
// reduce the count with init value 0, and output accuracy.
int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
if (threadIdx.x == 0) {
*accuracy = static_cast<float>(result) / static_cast<float>(N);
}
}
template <typename T>
......@@ -57,8 +69,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
return;
}
AccuracySingleKernel<<<1, 1>>>(num_samples, infer_width, 1, inference_data,
label_data, accuracy_data);
AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS>>>(
num_samples, infer_width, inference_data, label_data, accuracy_data);
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/cross_entropy_op.h"
namespace paddle {
namespace operators {
using framework::LoDTensor;
class CrossEntropyOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
"Input(Label) must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) must not be null.");
auto x = ctx.Input<Tensor>("X");
auto label = ctx.Input<Tensor>("Label");
PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
PADDLE_ENFORCE_EQ(label->dims().size(), 2,
"Input(Label)'s rank must be 2.");
// TODO(xinghai-sun): remove this check after swtiching to bool
PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
ctx.Attr<int>("soft_label") == 1);
PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
"The 1st dimension of Input(X) and Input(Label) must "
"be equal.");
if (ctx.Attr<int>("soft_label") == 1) {
PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
"If Attr(soft_label) == 1, The 2nd dimension of "
"Input(X) and Input(Label) must be equal.");
} else {
PADDLE_ENFORCE_EQ(label->dims()[1], 1,
"If Attr(soft_label) == 0, The 2nd dimension of "
"Input(Label) must be 1.");
}
ctx.Output<LoDTensor>("Y")->Resize({x->dims()[0], 1});
}
};
class CrossEntropyGradientOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
"Input(Label) must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")),
"Input(Y@GRAD) must not be null.");
auto x = ctx.Input<Tensor>("X");
auto label = ctx.Input<Tensor>("Label");
auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
PADDLE_ENFORCE_EQ(dy->dims().size(), 2, "Input(Y@Grad)'s rank must be 2.");
PADDLE_ENFORCE_EQ(label->dims().size(), 2,
"Input(Label)'s rank must be 2.");
// TODO(xinghai-sun): remove this check after swtiching to bool
PADDLE_ENFORCE(ctx.Attr<int>("soft_label") == 0 ||
ctx.Attr<int>("soft_label") == 1);
PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
"The 1st dimension of Input(X) and Input(Label) must "
"be equal.");
PADDLE_ENFORCE_EQ(x->dims()[0], dy->dims()[0],
"The 1st dimension of Input(X) and Input(Y@Grad) must "
"be equal.");
PADDLE_ENFORCE_EQ(dy->dims()[1], 1,
"The 2nd dimension of Input(Y@Grad) must be 1.");
if (ctx.Attr<int>("soft_label") == 1) {
PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
"If Attr(soft_label) == 1, The 2nd dimension of "
"Input(X) and Input(Label) must be equal.");
} else {
PADDLE_ENFORCE_EQ(label->dims()[1], 1,
"If Attr(soft_label) == 0, The 2nd dimension of "
"Input(Label) must be 1.");
}
auto dx = ctx.Output<LoDTensor>(framework::GradVarName("X"));
dx->Resize(x->dims());
}
};
class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
CrossEntropyOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of CrossEntropyOp");
AddInput("Label", "The second input of CrossEntropyOp");
AddOutput("Y", "The output of CrossEntropyOp");
AddAttr<int>("soft_label", "Is soft label. Default zero.").SetDefault(0);
AddComment(R"DOC(
CrossEntropy Operator.
It supports both standard cross-entropy and soft-label cross-entropy loss
computation.
1) One-hot cross-entropy:
soft_label = 0, Label[i, 0] indicates the class index for sample i:
Y[i] = -log(X[i, Label[i]])
2) Soft-label cross-entropy:
soft_label = 1, Label[i, j] indicates the soft label of class j
for sample i:
Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
Please make sure that in this case the summuation of each row of Label
equals one.
3) One-hot cross-entropy with vecterized Input(Label):
As a special case of 2), when each row of Input(Label) has only one
non-zero element (equals 1), soft-label cross-entropy degenerates to a
one-hot cross-entropy with one-hot label representation.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
cross_entropy_grad, ops::CrossEntropyGradientOp);
REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>);
REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
ops::CrossEntropyGradientOpKernel<float>);
......@@ -13,27 +13,13 @@
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/cross_entropy_op.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
__host__ __device__ T clipping_log(const T x) {
PADDLE_ASSERT(std::is_floating_point<T>::value);
const T kApproInf = 1e20;
T v = log(x);
if (v == INFINITY) {
return kApproInf;
}
if (v == -INFINITY) {
return -kApproInf;
}
return v;
}
template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
const int N, const int D) {
......@@ -42,7 +28,20 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
Y[i] = -clipping_log(X[i * D + label[i]]);
Y[i] = -tolerable_value(log(X[i * D + label[i]]));
}
}
template <typename T>
__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
const int N, const int D) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
T sum = static_cast<T>(0);
for (int j = 0; j < D; j++) {
sum += label[i * D + j] * tolerable_value(log(X[i * D + j]));
}
Y[i] = -sum;
}
}
......@@ -69,57 +68,84 @@ __global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
}
template <typename T>
class OnehotCrossEntropyOpCUDAKernel : public framework::OpKernel {
__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
const T* label, const int N,
const int D) {
// TOOD(qingqing): optimize for this kernel
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
for (int j = 0; j < D; ++j) {
int idx = i * D + j;
dX[idx] = -label[idx] * dY[i] / X[idx];
}
}
}
template <typename T>
class CrossEntropyOpCUDAKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto X = ctx.Input<Tensor>("X");
const T* Xdata = X->data<T>();
const int* label_data = ctx.Input<Tensor>("label")->data<int>();
auto Y = ctx.Output<Tensor>("Y");
Y->mutable_data<T>(ctx.GetPlace());
T* Ydata = Y->data<T>();
auto x = ctx.Input<Tensor>("X");
auto y = ctx.Output<Tensor>("Y");
auto label = ctx.Input<Tensor>("Label");
int N = X->dims()[0];
int D = X->dims()[1];
auto* x_data = x->data<T>();
y->mutable_data<T>(ctx.GetPlace());
auto* y_data = y->data<T>();
int n = x->dims()[0];
int d = x->dims()[1];
int block = 512;
int grid = (N + block - 1) / block;
int grid = (n + block - 1) / block;
// TODO(qingqing) launch kernel on specified stream
// base on ExecutionContext.
CrossEntropyKernel<T><<<grid, block>>>(Ydata, Xdata, label_data, N, D);
if (ctx.Attr<int>("soft_label") == 1) {
auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
SoftCrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n,
d);
} else {
auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
CrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n, d);
}
}
};
template <typename T>
class OnehotCrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto X = ctx.Input<Tensor>("X");
auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto label = ctx.Input<Tensor>("label");
auto x = ctx.Input<Tensor>("X");
auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto label = ctx.Input<Tensor>("Label");
auto* dXdata = dX->template mutable_data<T>(ctx.GetPlace());
auto* dYdata = dY->template data<T>();
auto* Xdata = X->template data<T>();
auto* label_data = label->data<int>();
auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
auto* dy_data = dy->data<T>();
auto* x_data = x->data<T>();
int N = X->dims()[0];
int D = X->dims()[1];
int n = x->dims()[0];
int d = x->dims()[1];
int block = 512;
int grid = (N * D + block - 1) / block;
zero<T><<<grid, block>>>(dXdata, N * D);
grid = (N + block - 1) / block;
int grid = (n * d + block - 1) / block;
zero<T><<<grid, block>>>(dx_data, n * d);
grid = (n + block - 1) / block;
// TODO(qingqing): launch kernel on specified stream
// base on ExecutionContext.
CrossEntropyGradientKernel<T><<<grid, block>>>(dXdata, dYdata, Xdata,
label_data, N, D);
if (ctx.Attr<int>("soft_label") == 1) {
auto* label_data = label->data<T>();
SoftCrossEntropyGradientKernel<T><<<grid, block>>>(
dx_data, dy_data, x_data, label_data, n, d);
} else {
auto* label_data = label->data<int>();
CrossEntropyGradientKernel<T><<<grid, block>>>(dx_data, dy_data, x_data,
label_data, n, d);
}
}
};
......@@ -127,7 +153,6 @@ class OnehotCrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
ops::OnehotCrossEntropyOpCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(onehot_cross_entropy_grad,
ops::OnehotCrossEntropyGradientOpCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(cross_entropy_grad,
ops::CrossEntropyGradientOpCUDAKernel<float>);
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/platform/hostdevice.h"
namespace paddle {
namespace operators {
......@@ -21,75 +22,93 @@ namespace operators {
using Tensor = framework::Tensor;
template <typename T>
inline T tolerable_value(const T x) {
static_assert(std::is_floating_point<T>::value,
"tolerable_value works only on float, "
"double and double double.");
HOSTDEVICE T tolerable_value(const T x) {
PADDLE_ASSERT(std::is_floating_point<T>::value);
const T kApproInf = 1e20;
if (x == INFINITY) {
return kApproInf;
}
if (x == -INFINITY) {
return -kApproInf;
}
return x;
}
template <typename T>
class OnehotCrossEntropyOpKernel : public framework::OpKernel {
class CrossEntropyOpKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto X = ctx.Input<Tensor>("X");
const T* Xdata = X->data<T>();
const int* label_data = ctx.Input<Tensor>("label")->data<int>();
auto Y = ctx.Output<Tensor>("Y");
Y->mutable_data<T>(ctx.GetPlace());
T* Ydata = Y->data<T>();
int batch_size = X->dims()[0];
int class_num = X->dims()[1];
for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i];
Ydata[i] = -tolerable_value(std::log(Xdata[index]));
auto x = ctx.Input<Tensor>("X");
auto y = ctx.Output<Tensor>("Y");
auto* x_data = x->data<T>();
y->mutable_data<T>(ctx.GetPlace());
auto* y_data = y->data<T>();
int batch_size = x->dims()[0];
int class_num = x->dims()[1];
if (ctx.Attr<int>("soft_label") == 1) {
auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
int index = 0;
for (int i = 0; i < batch_size; ++i) {
T sum = static_cast<T>(0);
for (int j = 0; j < class_num; ++j) {
sum += label_data[index] * tolerable_value(std::log(x_data[index]));
y_data[i] = -sum;
index++;
}
}
} else {
auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i];
y_data[i] = -tolerable_value(std::log(x_data[index]));
}
}
}
};
template <typename T>
class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel {
class CrossEntropyGradientOpKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto X = ctx.Input<Tensor>("X");
auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto label = ctx.Input<Tensor>("label");
auto x = ctx.Input<Tensor>("X");
auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto label = ctx.Input<Tensor>("Label");
auto* dXdata = dX->template mutable_data<T>(ctx.GetPlace());
auto* dYdata = dY->template data<T>();
auto* Xdata = X->template data<T>();
auto* label_data = label->data<int>();
auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
auto* dy_data = dy->data<T>();
auto* x_data = x->data<T>();
const int batch_size = X->dims()[0];
const int class_num = X->dims()[1];
int batch_size = x->dims()[0];
int class_num = x->dims()[1];
// TODO(qingqing): make zero setting an common function.
memset(dXdata, 0, sizeof(T) * batch_size * class_num);
for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i];
dXdata[index] = -tolerable_value(dYdata[i] / Xdata[index]);
if (ctx.Attr<int>("soft_label") == 1) {
auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
int index = 0;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < class_num; ++j) {
dx_data[index] = -label_data[index] * dy_data[i] / x_data[index];
index++;
}
}
} else {
auto* label_data = label->data<int>();
memset(dx_data, 0, sizeof(T) * batch_size * class_num);
for (int i = 0; i < batch_size; ++i) {
PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
int index = i * class_num + label_data[i];
dx_data[index] = -dy_data[i] / x_data[index];
}
}
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/dropout_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
using framework::LoDTensor;
class DropoutOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
PADDLE_ENFORCE_GE(ctx.Attr<float>("dropout_prob"), 0);
PADDLE_ENFORCE_LE(ctx.Attr<float>("dropout_prob"), 1);
// TODO(xinghai-sun): remove this check after swtiching to bool
PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
ctx.Attr<int>("is_training") == 1);
auto dims = ctx.Input<Tensor>("X")->dims();
ctx.Output<LoDTensor>("Out")->Resize(dims);
if (ctx.Attr<int>("is_training") == 1) {
ctx.Output<LoDTensor>("Mask")->Resize(dims);
}
}
};
template <typename AttrType>
class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
public:
DropoutOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddAttr<AttrType>("dropout_prob", "Probability of setting units to zero.")
.SetDefault(.5f);
// TODO(xinghai-sun): use bool for is_training after bool is supported.
AddAttr<int>("is_training", "Whether in training phase.").SetDefault(1);
AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
AddInput("X", "The input of dropout op.");
AddOutput("Out", "The output of dropout op.");
AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();
AddComment(R"DOC(
Dropout Operator.
"Dropout" refers to randomly dropping out units in a nerual network. It is a
regularization technique for reducing overfitting by preventing neuron
co-adaption during training. The dropout operator randomly set (according to
the given dropout probability) the outputs of some units to zero, while others
being set to their inputs.
)DOC");
}
};
template <typename AttrType>
class DropoutOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_EQ(ctx.Attr<int>("is_training"), 1,
"GradOp is only callable when is_training is true");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Mask"), "Mask must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
"Input(Out@GRAD) must not be null.");
PADDLE_ENFORCE_GE(ctx.Attr<AttrType>("dropout_prob"), 0);
PADDLE_ENFORCE_LE(ctx.Attr<AttrType>("dropout_prob"), 1);
// TODO(xinghai-sun): remove this check after swtiching to bool
PADDLE_ENFORCE(ctx.Attr<int>("is_training") == 0 ||
ctx.Attr<int>("is_training") == 1);
auto x_dims = ctx.Input<Tensor>("X")->dims();
auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
PADDLE_ENFORCE_EQ(x_dims, out_dims,
"Dimensions of Input(X) and Out@Grad must be the same.");
auto mask_dims = ctx.Input<Tensor>("Mask")->dims();
PADDLE_ENFORCE_EQ(x_dims, mask_dims,
"Dimensions of Input(X) and Mask must be the same.");
auto *x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
x_grad->Resize(x_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
ops::DropoutOpGrad<float>);
REGISTER_OP_CPU_KERNEL(
dropout, ops::CPUDropoutKernel<paddle::platform::CPUPlace, float, float>);
REGISTER_OP_CPU_KERNEL(
dropout_grad, ops::DropoutGradKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include <thrust/device_ptr.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/operators/dropout_op.h"
namespace paddle {
namespace operators {
template <typename T, typename AttrType>
struct MaskGenerator {
AttrType dropout_prob;
int seed;
__host__ __device__ MaskGenerator(AttrType dropout_prob, int seed)
: dropout_prob(dropout_prob), seed(seed) {}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed);
thrust::uniform_real_distribution<AttrType> dist(0, 1);
rng.discard(n);
if (dist(rng) < dropout_prob) {
return static_cast<T>(0);
} else {
return static_cast<T>(1);
}
}
};
// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random.
template <typename Place, typename T, typename AttrType>
class GPUDropoutKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* y = context.Output<Tensor>("Out");
y->mutable_data<T>(context.GetPlace());
AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
auto place = context.GetEigenDevice<Place>();
if (context.Attr<int>("is_training") == 1) {
auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace());
int size = framework::product(mask->dims());
int seed = context.Attr<int>("seed");
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(mask_data),
MaskGenerator<T, AttrType>(dropout_prob, seed));
auto M = EigenMatrix<T>::Reshape(*mask, 1);
Y.device(place) = X * M;
} else {
Y.device(place) = X * dropout_prob;
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
dropout, ops::GPUDropoutKernel<paddle::platform::GPUPlace, float, float>);
REGISTER_OP_GPU_KERNEL(
dropout_grad, ops::DropoutGradKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <random>
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T, typename AttrType>
class CPUDropoutKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* y = context.Output<Tensor>("Out");
const auto* x_data = x->data<T>();
auto* y_data = y->mutable_data<T>(context.GetPlace());
AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
if (context.Attr<int>("is_training") == 1) {
auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace());
int seed = context.Attr<int>("seed");
std::minstd_rand engine;
engine.seed(seed);
std::uniform_real_distribution<AttrType> dist(0, 1);
size_t size = framework::product(mask->dims());
for (size_t i = 0; i < size; ++i) {
if (dist(engine) < dropout_prob) {
mask_data[i] = 0;
y_data[i] = 0;
} else {
mask_data[i] = 1;
y_data[i] = x_data[i];
}
}
} else {
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
auto place = context.GetEigenDevice<Place>();
Y.device(place) = X * dropout_prob;
}
}
};
template <typename Place, typename T>
class DropoutGradKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE_EQ(context.Attr<int>("is_training"), 1,
"GradOp is only callable when is_training is true");
auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
auto* mask = context.Input<Tensor>("Mask");
grad_x->mutable_data<T>(context.GetPlace());
auto M = EigenMatrix<T>::Reshape(*mask, 1);
auto dX = EigenMatrix<T>::Reshape(*grad_x, 1);
auto dY = EigenMatrix<T>::Reshape(*grad_y, 1);
auto place = context.GetEigenDevice<Place>();
dX.device(place) = dY * M;
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/net_op.h"
namespace paddle {
namespace operators {
class FCOp : public NetOp {
public:
FCOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
PADDLE_ENFORCE(!Inputs("X").empty(),
"Inputs(X) of FCOp should not be null.");
PADDLE_ENFORCE(!Inputs("W").empty(),
"Inputs(W) of FCOp should not be null.");
PADDLE_ENFORCE(!Outputs("MulOut").empty(),
"Outputs(MulOut) of FCOp should not be null.");
PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
"Output(Out) of FCOp should not be null.");
auto x = Inputs("X");
auto w = Inputs("W");
auto mul_out = Outputs("MulOut");
PADDLE_ENFORCE_EQ(
x.size(), w.size(),
"The size of inputs X(%d) should be the same as that of weights W(%d).",
x.size(), w.size());
PADDLE_ENFORCE_EQ(mul_out.size(), x.size(),
"The size of intermediate mul_out(%d) should be the same "
"as that of inputs X(%d).",
mul_out.size(), x.size());
size_t n = x.size();
PADDLE_ENFORCE_GE(n, static_cast<size_t>(1),
"The size of inputs X(%d) should be no less than 1.", n);
auto x_num_col_dims = Attr<std::vector<int>>("xNumColDims");
// Set all values or set no values (use the default value)
if (!x_num_col_dims.empty()) {
PADDLE_ENFORCE_EQ(x_num_col_dims.size(), n,
"The size of attribute xNumColDims(%d) should be the "
"same as that of inputs X(%d).",
x_num_col_dims.size(), n);
} else {
x_num_col_dims.resize(n);
for (size_t i = 0; i < n; i++) {
x_num_col_dims[i] = 1;
}
}
// mul_out[i] = X[i] * W[i]
for (size_t i = 0; i < n; i++) {
framework::AttributeMap mul_attr;
mul_attr["x_num_col_dims"] = static_cast<int>(x_num_col_dims[i]);
mul_attr["y_num_col_dims"] = static_cast<int>(1);
AppendOp(
framework::OpRegistry::CreateOp("mul", {{"X", {x[i]}}, {"Y", {w[i]}}},
{{"Out", {mul_out[i]}}}, mul_attr));
}
// sum_out = X[0] * W[0] + ... + X[n-1] * W[n-1]
auto sum_out = mul_out[0];
if (n > 1) {
PADDLE_ENFORCE_NE(Output("SumOut"), framework::kEmptyVarName,
"Output(SumOut) of FCOp should not be null when the "
"size of Inputs(X) > 1.");
sum_out = Output("SumOut");
AppendOp(framework::OpRegistry::CreateOp("sum", {{"X", {mul_out}}},
{{"Out", {sum_out}}}, {}));
} else {
if (Output("SumOut") != framework::kEmptyVarName) {
this->Rename(Output("SumOut"), framework::kEmptyVarName);
}
}
// add_out = sum_out + b
auto b = Input("B");
auto add_out = sum_out;
if (b != framework::kEmptyVarName) {
PADDLE_ENFORCE_NE(
Output("AddOut"), framework::kEmptyVarName,
"Output(AddOut) of FCOp should not be null when Input(B) is set.");
add_out = Output("AddOut");
AppendOp(framework::OpRegistry::CreateOp(
"rowwise_add", {{"X", {sum_out}}, {"b", {Input("B")}}},
{{"Out", {add_out}}}, {}));
} else {
if (Output("AddOut") != framework::kEmptyVarName) {
this->Rename(Output("AddOut"), framework::kEmptyVarName);
}
}
auto activation = Attr<std::string>("activation");
AppendOp(framework::OpRegistry::CreateOp(activation, {{"X", {add_out}}},
{{"Y", {Output("Out")}}}, {}));
CompleteAddOp(false);
}
};
class FCOpMaker : public framework::OpProtoAndCheckerMaker {
public:
FCOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(A vector of Tensors) each input Tensor can be of arbitrary "
"dimension, and will be reshaped to a 2-D matrix of size "
"(minibatch, number_of_input_features) according to attribute "
"xNumColDims.")
.AsDuplicable();
AddInput("W",
"(A vector of Tensors) the weights of FC operator, a "
"vector of 2-D matrix of size "
"(number_of_input_features, number_of_neurons).")
.AsDuplicable();
AddInput("B",
"(Tensor) the bias of FC operator, a 1-D vector of size "
"number_of_neurons.");
AddOutput("Out",
"(Tensor) the activated output matrix of FC operator, a 2-D "
"matrix of size (minibatch, number_of_neurons).");
AddOutput("MulOut",
"(A vector of Tensors) the intermediate outputs of FC operator, "
"each Tensor saving the product of X_i * W_i.")
.AsIntermediate()
.AsDuplicable();
AddOutput(
"SumOut",
"(Tensor) the intermediate output of FC operator, "
"saving the sum of the products of X and W, that is sum{X_i * W_i}.")
.AsIntermediate();
AddOutput("AddOut",
"(Tensor) the non-actived output of FC operator, "
"saving sum{X_i * W_i} + B.")
.AsIntermediate();
AddAttr<std::string>(
"activation",
"(string, default identity) the activation type of FC operator.")
.SetDefault("identity")
.InEnum({"identity", "sigmoid", "softmax"});
AddAttr<std::vector<int>>(
"xNumColDims",
"(std::vector<int>) The inputs Tensors of FC operator can be of "
"more than 2 dimensions. In that case, each input Tensor `X_i` will be "
"reshaped to a 2-D matrix. The matrix's first dimension "
"(the length of column) will be the product of `X_i`'s last "
"`xNumColDims_i` dimensions, that is "
"`X_i.dims[0] x ... x X_i.dims[xNumColDims_i - 1]`. "
"The matrix's second dimension (the length of row) will be the product "
"of `X_i`'s first `rank - xNumColDims_i` dimensions, that is "
"`X_i.dims[xNumColDims_i] x ... x X_i.dims[rank - 1]`)")
.SetDefault(std::vector<int>{});
AddComment(R"DOC(
Fully Connected Operator, known as Fully Connected Layer or Inner Product Layer
in Convolutional Neural Networks. Neurons in a fully connected layer have
full connections to all activations in the previous layer.
It computes an inner product of a set of
learned weights with a matrix multiplication followed by a bias offset
(optionally).
Equation:
Out = Act(sum_n{X_i * W_i} + B)
where X_i is Tensor that will be reshaped to a 2-D matrix of size (M x K),
usually M is the minibatch size and K is the number of input features.
W_i is a 2-D matrix of size (K x N), where N means the number of neurons
in the fully connected layer. B is a 1-D vector of size N.
Thus, the output Out is a 2-D matrix of size (M x N).
Activation type can be set to `identity` (default), `sigmoid` or `softmax`.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(fc, ops::FCOp, ops::FCOpMaker);
......@@ -27,7 +27,7 @@ class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input tensor of identity operator.");
AddOutput("Out", "The output tensor of identity operator.");
AddOutput("Y", "The output tensor of identity operator.");
AddComment(R"DOC(
The identity operator is an alias of the scale operator
with the attribute scale fixed to 1.0.
......@@ -44,12 +44,13 @@ class IdentityOp : public NetOp {
: NetOp(type, inputs, outputs, attrs) {
PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
"Input(X) of IdentityOp should not be null.");
PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
"Output(Out) of IdentityOp should not be null.");
PADDLE_ENFORCE_NE(Output("Y"), framework::kEmptyVarName,
"Output(Y) of IdentityOp should not be null.");
AppendOp(framework::OpRegistry::CreateOp(
"scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
"scale", {{"X", {Input("X")}}}, {{"Out", {Output("Y")}}},
{{"scale", static_cast<AttrType>(1)}}));
CompleteAddOp(false);
}
};
......
......@@ -19,12 +19,13 @@ namespace operators {
namespace math {
template <>
void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
void gemm<platform::CPUPlace, float>(const platform::DeviceContext& context,
const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M,
const int N, const int K,
const float alpha, const float* A,
const float* B, const float beta, float* C,
platform::DeviceContext* context) {
const float* B, const float beta,
float* C) {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
......@@ -33,13 +34,13 @@ void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
}
template <>
void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M,
const int N, const int K,
const double alpha, const double* A,
const double* B, const double beta,
double* C,
platform::DeviceContext* context) {
double* C) {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
......@@ -48,13 +49,10 @@ void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
}
template <>
void matmul<platform::CPUPlace, float>(const framework::Tensor& matrix_a,
bool trans_a,
const framework::Tensor& matrix_b,
bool trans_b, float alpha,
framework::Tensor* matrix_out,
float beta,
platform::DeviceContext* context) {
void matmul<platform::CPUPlace, float>(
const platform::DeviceContext& context, const framework::Tensor& matrix_a,
bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
framework::Tensor* matrix_out, float beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -74,18 +72,15 @@ void matmul<platform::CPUPlace, float>(const framework::Tensor& matrix_a,
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CPUPlace, float>(
transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>(), context);
context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>());
}
template <>
void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
bool trans_a,
const framework::Tensor& matrix_b,
bool trans_b, double alpha,
framework::Tensor* matrix_out,
double beta,
platform::DeviceContext* context) {
void matmul<platform::CPUPlace, double>(
const platform::DeviceContext& context, const framework::Tensor& matrix_a,
bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
framework::Tensor* matrix_out, double beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -105,8 +100,8 @@ void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CPUPlace, double>(
transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>());
}
} // namespace math
......
......@@ -19,12 +19,13 @@ namespace operators {
namespace math {
template <>
void gemm<platform::GPUPlace, float>(const CBLAS_TRANSPOSE transA,
void gemm<platform::GPUPlace, float>(const platform::DeviceContext& context,
const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M,
const int N, const int K,
const float alpha, const float* A,
const float* B, const float beta, float* C,
platform::DeviceContext* context) {
const float* B, const float beta,
float* C) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -35,18 +36,19 @@ void gemm<platform::GPUPlace, float>(const CBLAS_TRANSPOSE transA,
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
PADDLE_ENFORCE(platform::dynload::cublasSgemm(
reinterpret_cast<platform::CUDADeviceContext*>(context)->cublas_handle(),
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.cublas_handle(),
cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
}
template <>
void gemm<platform::GPUPlace, double>(const CBLAS_TRANSPOSE transA,
void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M,
const int N, const int K,
const double alpha, const double* A,
const double* B, const double beta,
double* C,
platform::DeviceContext* context) {
double* C) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
......@@ -56,18 +58,16 @@ void gemm<platform::GPUPlace, double>(const CBLAS_TRANSPOSE transA,
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
PADDLE_ENFORCE(platform::dynload::cublasDgemm(
reinterpret_cast<platform::CUDADeviceContext*>(context)->cublas_handle(),
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.cublas_handle(),
cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
}
template <>
void matmul<platform::GPUPlace, float>(const framework::Tensor& matrix_a,
bool trans_a,
const framework::Tensor& matrix_b,
bool trans_b, float alpha,
framework::Tensor* matrix_out,
float beta,
platform::DeviceContext* context) {
void matmul<platform::GPUPlace, float>(
const platform::DeviceContext& context, const framework::Tensor& matrix_a,
bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
framework::Tensor* matrix_out, float beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -87,18 +87,15 @@ void matmul<platform::GPUPlace, float>(const framework::Tensor& matrix_a,
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::GPUPlace, float>(
transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>(), context);
context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
matrix_b.data<float>(), beta, matrix_out->data<float>());
}
template <>
void matmul<platform::GPUPlace, double>(const framework::Tensor& matrix_a,
bool trans_a,
const framework::Tensor& matrix_b,
bool trans_b, double alpha,
framework::Tensor* matrix_out,
double beta,
platform::DeviceContext* context) {
void matmul<platform::GPUPlace, double>(
const platform::DeviceContext& context, const framework::Tensor& matrix_a,
bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
framework::Tensor* matrix_out, double beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -118,8 +115,8 @@ void matmul<platform::GPUPlace, double>(const framework::Tensor& matrix_a,
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::GPUPlace, double>(
transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
matrix_b.data<double>(), beta, matrix_out->data<double>());
}
} // namespace math
......
......@@ -66,16 +66,16 @@ namespace math {
// For more detailed info, please refer to
// http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
template <typename Place, typename T>
void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
const int M, const int N, const int K, const T alpha, const T* A,
const T* B, const T beta, T* C, platform::DeviceContext* context);
void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const T alpha, const T* A, const T* B, const T beta, T* C);
// matrix multiply with continuous memory
template <typename Place, typename T>
void matmul(const framework::Tensor& matrix_a, bool trans_a,
void matmul(const platform::DeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, T alpha,
framework::Tensor* matrix_out, T beta,
platform::DeviceContext* context);
framework::Tensor* matrix_out, T beta);
} // namespace math
} // namespace operators
......
......@@ -15,8 +15,7 @@ TEST(math_function, notrans_mul_trans) {
memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::GPUPlace(0);
paddle::platform::DeviceContext* context =
new paddle::platform::CUDADeviceContext(*gpu_place);
paddle::platform::CUDADeviceContext context(*gpu_place);
input1_gpu.CopyFrom<float>(input1, *gpu_place);
input2_gpu.CopyFrom<float>(input1, *gpu_place);
......@@ -24,7 +23,7 @@ TEST(math_function, notrans_mul_trans) {
out_gpu.mutable_data<float>({2, 2}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0, context);
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
out.CopyFrom<float>(out_gpu, *cpu_place);
......@@ -33,6 +32,7 @@ TEST(math_function, notrans_mul_trans) {
EXPECT_EQ(out_ptr[1], 14);
EXPECT_EQ(out_ptr[2], 14);
EXPECT_EQ(out_ptr[3], 50);
delete gpu_place;
}
TEST(math_function, trans_mul_notrans) {
......@@ -48,8 +48,7 @@ TEST(math_function, trans_mul_notrans) {
memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::GPUPlace(0);
paddle::platform::DeviceContext* context =
new paddle::platform::CUDADeviceContext(*gpu_place);
paddle::platform::CUDADeviceContext context(*gpu_place);
input1_gpu.CopyFrom<float>(input1, *gpu_place);
input2_gpu.CopyFrom<float>(input1, *gpu_place);
......@@ -57,7 +56,7 @@ TEST(math_function, trans_mul_notrans) {
out_gpu.mutable_data<float>({3, 3}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0, context);
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
out.CopyFrom<float>(out_gpu, *cpu_place);
......@@ -71,5 +70,6 @@ TEST(math_function, trans_mul_notrans) {
EXPECT_EQ(out_ptr[6], 15);
EXPECT_EQ(out_ptr[7], 22);
EXPECT_EQ(out_ptr[8], 29);
delete gpu_place;
}
#endif
......@@ -71,7 +71,7 @@ class MinusGradOp : public NetOp {
// x_grad = out_grad
AppendOp(framework::OpRegistry::CreateOp("identity", {{"X", {out_grad}}},
{{"Out", {x_grad}}}, {}));
{{"Y", {x_grad}}}, {}));
framework::AttributeMap scale_attr;
scale_attr["scale"] = static_cast<AttrType>(-1);
......
......@@ -46,10 +46,8 @@ class MulKernel : public framework::OpKernel {
: *y;
z->mutable_data<T>(context.GetPlace());
auto* device_context =
const_cast<platform::DeviceContext*>(context.device_context_);
math::matmul<Place, T>(x_matrix, false, y_matrix, false, 1, z, 0,
device_context);
math::matmul<Place, T>(context.device_context(), x_matrix, false, y_matrix,
false, 1, z, 0);
}
};
......@@ -71,16 +69,14 @@ class MulGradKernel : public framework::OpKernel {
Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto* device_context =
const_cast<platform::DeviceContext*>(ctx.device_context_);
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
*dx, x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
math::matmul<Place, T>(*dout, false, y_matrix, true, 1, &dx_matrix, 0,
device_context);
math::matmul<Place, T>(ctx.device_context(), *dout, false, y_matrix, true,
1, &dx_matrix, 0);
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
......@@ -88,8 +84,8 @@ class MulGradKernel : public framework::OpKernel {
*dy, y_num_col_dims)
: *dy;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
math::matmul<Place, T>(x_matrix, true, *dout, false, 1, &dy_matrix, 0,
device_context);
math::matmul<Place, T>(ctx.device_context(), x_matrix, true, *dout, false,
1, &dy_matrix, 0);
}
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/onehot_cross_entropy_op.h"
namespace paddle {
namespace operators {
class OnehotCrossEntropyOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_NOT_NULL(
ctx.InputVar("X"),
"Input(X) of OnehotCrossEntropyOp should not be null.");
PADDLE_ENFORCE_NOT_NULL(
ctx.InputVar("label"),
"Input(label) of OnehotCrossEntropyOp should not be null.");
PADDLE_ENFORCE_NOT_NULL(
ctx.OutputVar("Y"),
"Output(Y) of OnehotCrossEntropyOp should not be null.");
auto *X = ctx.Input<Tensor>("X");
auto *label = ctx.Input<Tensor>("label");
PADDLE_ENFORCE_EQ(X->dims().size(), 2, "X's dimension must be 2.");
PADDLE_ENFORCE_EQ(label->dims().size(), 1, "label's dimension must be 1.");
PADDLE_ENFORCE_EQ(X->dims()[0], label->dims()[0]);
ctx.Output<framework::LoDTensor>("Y")->Resize({X->dims()[0], 1});
}
};
class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
auto dX = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto X = ctx.Input<Tensor>("X");
dX->Resize(X->dims());
}
};
class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
OnehotCrossEntropyOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of OnehotCrossEntropyOp");
AddInput("label", "The second input of OnehotCrossEntropyOp");
AddOutput("Y", "The output of OnehotCrossEntropyOp");
AddComment(R"DOC(
OnehotCrossEntropy Operator.
Y[i] = -log(X[i][j])
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp,
ops::OnehotCrossEntropyOpMaker, onehot_cross_entropy_grad,
ops::OnehotCrossEntropyGradientOp);
REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
ops::OnehotCrossEntropyOpKernel<float>);
REGISTER_OP_CPU_KERNEL(onehot_cross_entropy_grad,
ops::OnehotCrossEntropyGradientOpKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/prelu_op.h"
#include "paddle/operators/net_op.h"
namespace paddle {
namespace operators {
class PReluOp : public framework::OperatorWithKernel {
public:
PReluOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
auto *in = ctx.Input<framework::Tensor>("X");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Alpha"),
"Input(Alpha) should not be null");
auto *alpha = ctx.Input<framework::Tensor>("Alpha");
PADDLE_ENFORCE(alpha->numel() == 1, "Size of weight Alpha must be one.");
PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
"Output(Out) should not be null");
auto *out = ctx.Output<framework::LoDTensor>("Out");
out->Resize(in->dims());
}
};
class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
public:
PReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input tensor of prelu operator.");
AddInput("Alpha", "The alpha weight of PRelu operator.");
AddOutput("Out", "The output tensor of PRelu operator.");
AddComment(R"DOC(PRelu operator
The equation is:
f(x) = alpha * x , for x < 0
f(x) = x , for x >= 0
)DOC");
}
};
// The operator to calculate gradients of a prelu operator.
class PReluGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
auto *dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
auto *x = ctx.Input<framework::Tensor>("X");
auto *dalpha =
ctx.Output<framework::LoDTensor>(framework::GradVarName("Alpha"));
auto *alpha = ctx.Input<framework::Tensor>("Alpha");
dx->Resize(x->dims());
dalpha->Resize(alpha->dims());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad,
ops::PReluGradOp);
REGISTER_OP_CPU_KERNEL(prelu,
ops::PReluKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(prelu_grad,
ops::PReluGradKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/prelu_op.h"
REGISTER_OP_GPU_KERNEL(
prelu, paddle::operators::PReluKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
prelu_grad,
paddle::operators::PReluGradKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/platform/transform.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using platform::Transform;
template <typename T>
class PReluFunctor {
public:
explicit PReluFunctor(const T* alpha) : alpha_(alpha) {}
HOSTDEVICE T operator()(const T& x) const {
if (x > 0)
return x;
else
return x * (*alpha_);
}
private:
const T* alpha_;
};
template <typename Place, typename T>
class PReluKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* alpha = context.Input<Tensor>("Alpha");
auto* out = context.Output<Tensor>("Out");
const T* x_ptr = x->data<T>();
T* o_ptr = out->mutable_data<T>(context.GetPlace());
auto* alpha_ptr = alpha->data<T>();
int numel = x->numel();
Transform<Place> trans;
trans(context.device_context(), x_ptr, x_ptr + numel, o_ptr,
PReluFunctor<T>(alpha_ptr));
}
};
template <typename T>
class PReluGradFunctor {
public:
explicit PReluGradFunctor(const T* alpha) : alpha_(alpha) {}
HOSTDEVICE T operator()(const T& out, const T& dout) const {
if (out > 0)
return dout;
else
return dout * (*alpha_);
}
private:
const T* alpha_;
};
template <typename Place, typename T>
class PReluGradKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
auto* out = context.Input<Tensor>("Out");
auto* alpha = context.Input<Tensor>("Alpha");
auto* alpha_ptr = alpha->data<T>();
T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
const T* dout_ptr = dout->data<T>();
const T* out_ptr = out->data<T>();
int numel = dx->numel();
Transform<Place> trans;
trans(context.device_context(), out_ptr, out_ptr + numel, dout_ptr, dx_ptr,
PReluGradFunctor<T>(alpha_ptr));
// TODO (Zhuoyuan): add dalpha upgrade when GPU kernels ready
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/split_op.h"
#include "paddle/operators/net_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class SplitOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
// infershape
auto *in = ctx.Input<framework::Tensor>("X");
auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
size_t num = static_cast<size_t>(ctx.Attr<int>("num"));
std::vector<int> sections =
static_cast<std::vector<int>>(ctx.Attr<std::vector<int>>("sections"));
const size_t n = outs.size();
if (num > 0) {
int64_t in_axis_dim = in->dims()[axis];
PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
"tensor split does not result"
" in an equal division");
size_t out_axis_dim = in_axis_dim / num;
for (size_t i = 0; i < n; ++i) {
auto dim = in->dims();
dim[axis] = out_axis_dim;
outs[i]->Resize(dim);
}
} else if (sections.size() > 0) {
PADDLE_ENFORCE_EQ(sections.size(), n,
"tensor split sections size"
"should be equal to output size.");
for (size_t i = 0; i < n; ++i) {
auto dim = in->dims();
dim[axis] = sections[i];
outs[i]->Resize(dim);
}
} else {
PADDLE_ENFORCE_NOT_NULL(nullptr, "split operator should",
" specify indices or sections.");
}
}
};
class SplitOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SplitOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "the input tensor of split operator.");
AddOutput("Out", "the output tensors of split operator.").AsDuplicable();
AddComment(R"DOC(
Split the input tensor into multiple sub-tensors.
Example:
Input = [[1,2],
[3,4],
[5,6]]
sections = [2,1]
axis = 0
Output[0] = [[1,2],
[3,4]]
Output[1] = [[5,6]]
)DOC");
AddAttr<std::vector<int>>("sections",
"the length for each"
"output along with the specify axis.")
.SetDefault(std::vector<int>{});
AddAttr<int>("num",
"number of the sub-tensors, it must evenly divide "
"Input.dims()[axis]")
.SetDefault(0);
AddAttr<int>("axis", "The axis which the input will be splited on.")
.SetDefault(0);
}
};
class SplitOpGrad : public NetOp {
public:
SplitOpGrad(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
auto out_grad = Inputs(framework::GradVarName("Out"));
auto x_grad = Output(framework::GradVarName("X"));
AppendOp(framework::OpRegistry::CreateOp("concat", {{"X", out_grad}},
{{"Out", {x_grad}}}, attrs));
CompleteAddOp(false);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
USE_CPU_ONLY_OP(concat);
REGISTER_OP(split, ops::SplitOp, ops::SplitOpMaker, split_grad,
ops::SplitOpGrad);
REGISTER_OP_CPU_KERNEL(split,
ops::SplitKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename Place, typename T>
class SplitKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
size_t before = 1, after = 1;
const size_t n = outs.size();
size_t input_axis_dim = in->dims()[axis];
for (int64_t i = 0; i < in->dims().size(); ++i) {
if (i == axis) {
continue;
}
if (i < axis) {
before *= in->dims()[i];
} else {
after *= in->dims()[i];
}
}
size_t input_offset = 0;
for (size_t i = 0; i < n; i++) {
auto& out = outs[i];
size_t axis_dim = out->dims()[axis];
for (size_t j = 0; j < before; j++) {
size_t len = axis_dim * after * sizeof(T);
T* dest =
out->mutable_data<T>(platform::CPUPlace()) + axis_dim * after * j;
const T* src =
in->data<T>() + input_offset + input_axis_dim * after * j;
memcpy(dest, src, len);
}
input_offset += axis_dim * after;
}
}
};
} // namespace operators
} // namespace paddle
......@@ -24,4 +24,4 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
......@@ -24,6 +24,11 @@ namespace platform {
#define USE_CUDA_ATOMIC(op, T) \
CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
// Default thread count per block(or block size).
// TODO(typhoonzero): need to benchmark against setting this value
// to 1024.
constexpr int PADDLE_CUDA_NUM_THREADS = 512;
// For atomicAdd.
USE_CUDA_ATOMIC(Add, float);
......
......@@ -101,19 +101,17 @@ CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place);
eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
}
CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
Wait();
if (cublas_handle_) {
PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
}
if (cudnn_handle_) {
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
}
PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
eigen_stream_.reset();
eigen_device_.reset();
PADDLE_ENFORCE(cudaStreamDestroy(stream_));
......@@ -129,25 +127,13 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get();
}
cublasHandle_t CUDADeviceContext::cublas_handle() {
if (!cublas_handle_) {
SetDeviceId(place_.device);
PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
}
cublasHandle_t CUDADeviceContext::cublas_handle() const {
return cublas_handle_;
}
cudnnHandle_t CUDADeviceContext::cudnn_handle() {
if (!cudnn_handle_) {
SetDeviceId(place_.device);
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
}
return cudnn_handle_;
}
cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
cudaStream_t CUDADeviceContext::stream() { return stream_; }
cudaStream_t CUDADeviceContext::stream() const { return stream_; }
#endif // PADDLE_ONLY_CPU
......
......@@ -67,16 +67,14 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
// clang-format off
/*! \brief Return cublas handle in the device context. */
cublasHandle_t cublas_handle();
cublasHandle_t cublas_handle() const;
/*! \brief Return cudnn handle in the device context. */
cudnnHandle_t cudnn_handle();
cudnnHandle_t cudnn_handle() const;
/*! \brief Return cuda stream in the device context. */
cudaStream_t stream();
// clang-format on
cudaStream_t stream() const;
private:
GPUPlace place_;
......@@ -84,11 +82,9 @@ class CUDADeviceContext : public DeviceContext {
std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
// clang-format off
cudaStream_t stream_{nullptr};
cudnnHandle_t cudnn_handle_{nullptr};
cublasHandle_t cublas_handle_{nullptr};
// clang-format on
cudaStream_t stream_;
cudnnHandle_t cudnn_handle_;
cublasHandle_t cublas_handle_;
};
#endif
......
......@@ -14,6 +14,7 @@
#pragma once
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/hostdevice.h"
#include "paddle/platform/place.h"
......@@ -21,46 +22,78 @@
#include <algorithm>
#include <type_traits>
#ifdef __NVCC__
#include <thrust/execution_policy.h>
#include <thrust/transform.h>
#include "paddle/platform/details/device_ptr_cast.h"
#endif
namespace paddle {
namespace platform {
// Transform on host or device. It provides the same API in std library.
template <typename Place, typename InputIter, typename OutputIter,
typename UnaryOperation>
void Transform(Place place, InputIter first, InputIter last, OutputIter result,
UnaryOperation op) {
if (is_cpu_place(place)) {
template <typename Place>
struct Transform {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const DeviceContext& context, InputIter first, InputIter last,
OutputIter result, UnaryOperation op);
template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation>
void operator()(const DeviceContext& context, InputIter1 first1,
InputIter1 last1, InputIter2 first2, OutputIter result,
BinaryOperation op);
};
template <>
struct Transform<platform::CPUPlace> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const DeviceContext& context, InputIter first, InputIter last,
OutputIter result, UnaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place.");
std::transform(first, last, result, op);
} else {
#ifdef __NVCC__
using namespace details;
thrust::transform(DevPtrCast(first), DevPtrCast(last), DevPtrCast(result),
op);
#else
PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
#endif
}
}
template <typename Place, typename InputIter1, typename InputIter2,
typename OutputIter, typename BinaryOperation>
void Transform(Place place, InputIter1 first1, InputIter1 last1,
InputIter2 first2, OutputIter result, BinaryOperation op) {
if (is_cpu_place(place)) {
template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation>
void operator()(const DeviceContext& context, InputIter1 first1,
InputIter1 last1, InputIter2 first2, OutputIter result,
BinaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place.");
std::transform(first1, last1, first2, result, op);
} else {
}
};
#ifdef __NVCC__
using namespace details;
thrust::transform(DevPtrCast(first1), DevPtrCast(last1), DevPtrCast(first2),
DevPtrCast(result), op);
#else
PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
#endif
template <>
struct Transform<platform::GPUPlace> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const DeviceContext& context, InputIter first, InputIter last,
OutputIter result, UnaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
thrust::transform(thrust::cuda::par.on(ctx.stream()),
details::DevPtrCast(first), details::DevPtrCast(last),
details::DevPtrCast(result), op);
}
template <typename InputIter1, typename InputIter2, typename OutputIter,
typename BinaryOperation>
void operator()(const DeviceContext& context, InputIter1 first1,
InputIter1 last1, InputIter2 first2, OutputIter result,
BinaryOperation op) {
auto place = context.GetPlace();
PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
thrust::transform(thrust::cuda::par.on(ctx.stream()),
details::DevPtrCast(first1), details::DevPtrCast(last1),
details::DevPtrCast(first2), details::DevPtrCast(result),
op);
}
};
#endif
} // namespace platform
} // namespace paddle
此差异已折叠。
......@@ -45,14 +45,18 @@ add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies})
set(PSERVER_MAIN_SOURCES
ParameterServer2Main.cpp)
add_executable(paddle_pserver_main
${PSERVER_MAIN_SOURCES})
link_paddle_exe(paddle_pserver_main)
if(WITH_TESTING)
add_subdirectory(test)
endif()
install(TARGETS paddle_pserver_main
RUNTIME DESTINATION opt/paddle/bin
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
set_target_properties(paddle_pserver_main PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
if(NOT WITH_C_API)
add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES})
link_paddle_exe(paddle_pserver_main)
install(TARGETS paddle_pserver_main
RUNTIME DESTINATION opt/paddle/bin
PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
set_target_properties(paddle_pserver_main PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
endif()
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册