提交 83f8c403 编写于 作者: N nhzlx

Merge branch 'develop' of https://github.com/paddlepaddle/paddle into fix_avg_pool_trt_bug

test=develop
...@@ -26,6 +26,11 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " ...@@ -26,6 +26,11 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
if(WIN32) if(WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX lib) set(CMAKE_STATIC_LIBRARY_PREFIX lib)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
endif(WIN32) endif(WIN32)
if(NOT CMAKE_CROSSCOMPILING) if(NOT CMAKE_CROSSCOMPILING)
...@@ -66,6 +71,8 @@ option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) ...@@ -66,6 +71,8 @@ option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(ON_INFER "Turn on inference optimization." OFF) option(ON_INFER "Turn on inference optimization." OFF)
...@@ -308,7 +315,6 @@ endif() ...@@ -308,7 +315,6 @@ endif()
if (ON_INFER) if (ON_INFER)
message(STATUS "On inference mode, will take place some specific optimization.") message(STATUS "On inference mode, will take place some specific optimization.")
add_definitions(-DPADDLE_ON_INFERENCE)
else() else()
#TODO(luotao), combine this warning with `make inference_lib_dist` command. #TODO(luotao), combine this warning with `make inference_lib_dist` command.
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
......
...@@ -218,3 +218,7 @@ endif(WITH_GRPC) ...@@ -218,3 +218,7 @@ endif(WITH_GRPC)
if(WITH_BRPC_RDMA) if(WITH_BRPC_RDMA)
add_definitions(-DPADDLE_WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA)
endif(WITH_BRPC_RDMA) endif(WITH_BRPC_RDMA)
if(ON_INFER)
add_definitions(-DPADDLE_ON_INFERENCE)
endif(ON_INFER)
...@@ -157,6 +157,9 @@ list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) ...@@ -157,6 +157,9 @@ list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO) if(NOT WITH_DSO)
# TODO(panyx0718): CUPTI only allows DSO? # TODO(panyx0718): CUPTI only allows DSO?
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
if(WIN32)
set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
endif(WIN32)
endif(NOT WITH_DSO) endif(NOT WITH_DSO)
# setting nvcc arch flags # setting nvcc arch flags
...@@ -196,10 +199,12 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") ...@@ -196,10 +199,12 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif() endif()
else(NOT WIN32) else(NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Release") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS "-g -G")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else() else()
message(FATAL "Windows only support Release build now. Please set visual studio build type to Release, x64 build.") message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
endif() endif()
endif(NOT WIN32) endif(NOT WIN32)
......
...@@ -2,7 +2,12 @@ if(NOT WITH_GPU) ...@@ -2,7 +2,12 @@ if(NOT WITH_GPU)
return() return()
endif() endif()
set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") if(WIN32)
set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
else(WIN32)
set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
endif(WIN32)
find_path(CUDNN_INCLUDE_DIR cudnn.h find_path(CUDNN_INCLUDE_DIR cudnn.h
PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
$ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
......
...@@ -58,19 +58,21 @@ ExternalProject_Add( ...@@ -58,19 +58,21 @@ ExternalProject_Add(
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
-DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER} -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
-DBUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN}
-DBUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM}
${EXTERNAL_OPTIONAL_ARGS} ${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
) )
message(STATUS "Anakin for inference is enabled") message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
add_dependencies(extern_anakin protobuf mklml)
add_library(anakin_shared SHARED IMPORTED GLOBAL) add_library(anakin_shared SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB}) set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB})
add_dependencies(anakin_shared extern_anakin protobuf mklml) add_dependencies(anakin_shared extern_anakin)
add_library(anakin_saber SHARED IMPORTED GLOBAL) add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB}) set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin protobuf mklml) add_dependencies(anakin_saber extern_anakin)
list(APPEND external_project_dependencies anakin_shared anakin_saber) list(APPEND external_project_dependencies anakin_shared anakin_saber)
...@@ -28,34 +28,28 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL)) ...@@ -28,34 +28,28 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
endif() endif()
IF (WIN32)
MESSAGE(WARNING, "In windows, boost can not be downloaded automaticlly, please build it manually and put it at " ${THIRD_PARTY_PATH}install/boost) MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
else()
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
ENDIF(WIN32)
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE)
set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
include_directories(${BOOST_INCLUDE_DIR}) include_directories(${BOOST_INCLUDE_DIR})
if (NOT WIN32)
ExternalProject_Add( ExternalProject_Add(
${BOOST_PROJECT} ${BOOST_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz URL ${BOOST_URL}
&& tar zxf ${BOOST_TAR}.tar.gz
DOWNLOAD_NO_PROGRESS 1 DOWNLOAD_NO_PROGRESS 1
PREFIX ${BOOST_SOURCES_DIR} PREFIX ${BOOST_SOURCES_DIR}
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
INSTALL_COMMAND "" INSTALL_COMMAND ""
UPDATE_COMMAND "" UPDATE_COMMAND ""
) )
endif(NOT WIN32)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
......
...@@ -35,7 +35,12 @@ ExternalProject_Add( ...@@ -35,7 +35,12 @@ ExternalProject_Add(
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DBUILD_STATIC_LIBS=ON
-DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF -DBUILD_TESTING=OFF
...@@ -48,8 +53,8 @@ ExternalProject_Add( ...@@ -48,8 +53,8 @@ ExternalProject_Add(
IF(WIN32) IF(WIN32)
IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib") IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib")
add_custom_command(TARGET extern_gflags POST_BUILD add_custom_command(TARGET extern_gflags POST_BUILD
COMMAND cmake -E rename ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib COMMAND cmake -E copy ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib
) )
ENDIF() ENDIF()
ENDIF(WIN32) ENDIF(WIN32)
ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
......
...@@ -46,7 +46,11 @@ ExternalProject_Add( ...@@ -46,7 +46,11 @@ ExternalProject_Add(
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
...@@ -63,7 +67,7 @@ ExternalProject_Add( ...@@ -63,7 +67,7 @@ ExternalProject_Add(
IF(WIN32) IF(WIN32)
IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib") IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib")
add_custom_command(TARGET extern_glog POST_BUILD add_custom_command(TARGET extern_glog POST_BUILD
COMMAND cmake -E rename ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib COMMAND cmake -E copy ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib
) )
ENDIF() ENDIF()
ENDIF(WIN32) ENDIF(WIN32)
......
...@@ -17,12 +17,8 @@ IF(USE_EIGEN_FOR_BLAS) ...@@ -17,12 +17,8 @@ IF(USE_EIGEN_FOR_BLAS)
ENDIF(USE_EIGEN_FOR_BLAS) ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas) INCLUDE(cblas)
# IF(WIN32 AND NOT ${CBLAS_FOUND})
IF(NOT ${CBLAS_FOUND}) IF(NOT ${CBLAS_FOUND})
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas) SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
...@@ -34,6 +30,7 @@ IF(NOT ${CBLAS_FOUND}) ...@@ -34,6 +30,7 @@ IF(NOT ${CBLAS_FOUND})
CACHE FILEPATH "openblas library." FORCE) CACHE FILEPATH "openblas library." FORCE)
ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS) ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
IF (WIN32) IF (WIN32)
SET(CBLAS_FOUND true) SET(CBLAS_FOUND true)
MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR}) MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR})
......
...@@ -140,7 +140,6 @@ endmacro() ...@@ -140,7 +140,6 @@ endmacro()
set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
IF (WIN32) IF (WIN32)
SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf) SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
MESSAGE(WARNING, "In windows, protobuf only support msvc build, please build it manually and put it at " ${PROTOBUF_ROOT})
ENDIF(WIN32) ENDIF(WIN32)
if (NOT "${PROTOBUF_ROOT}" STREQUAL "") if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
...@@ -188,13 +187,20 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ...@@ -188,13 +187,20 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
SET(OPTIONAL_ARGS SET(OPTIONAL_ARGS
"-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
"-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
"-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
"-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}"
"-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}"
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
"-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}"
"-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}"
"-Dprotobuf_WITH_ZLIB=ON" "-Dprotobuf_WITH_ZLIB=ON"
"-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}"
${EXTERNAL_OPTIONAL_ARGS}) ${EXTERNAL_OPTIONAL_ARGS})
SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
ENDIF() ENDIF()
IF(WIN32)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
ENDIF()
SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
......
...@@ -21,6 +21,48 @@ INCLUDE(python_module) ...@@ -21,6 +21,48 @@ INCLUDE(python_module)
FIND_PACKAGE(PythonInterp ${PY_VERSION}) FIND_PACKAGE(PythonInterp ${PY_VERSION})
FIND_PACKAGE(PythonLibs ${PY_VERSION}) FIND_PACKAGE(PythonLibs ${PY_VERSION})
if(WIN32)
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"from distutils import sysconfig as s;import sys;import struct;
print(sys.prefix);
print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE _PYTHON_VALUES
ERROR_VARIABLE _PYTHON_ERROR_VALUE)
if(NOT _PYTHON_SUCCESS MATCHES 0)
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# Convert the process output into a list
string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
list(GET _PYTHON_VALUES 0 PYTHON_PREFIX)
list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX)
# Make sure all directory separators are '/'
string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX})
set(PYTHON_LIBRARY
"${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
# when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the
# original python installation. They may be found relative to PYTHON_INCLUDE_DIR.
if(NOT EXISTS "${PYTHON_LIBRARY}")
get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
set(PYTHON_LIBRARY
"${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
endif()
# raise an error if the python libs are still not found.
if(NOT EXISTS "${PYTHON_LIBRARY}")
message(FATAL_ERROR "Python libraries not found")
endif()
SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
endif(WIN32)
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY(python SHARED IMPORTED GLOBAL) ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
......
...@@ -14,23 +14,52 @@ ELSE() ...@@ -14,23 +14,52 @@ ELSE()
ENDIF(APPLE) ENDIF(APPLE)
ENDIF() ENDIF()
ExternalProject_Add( if(WIN32)
extern_xxhash ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS} extern_xxhash
GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_TAG "v0.6.5" GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
PREFIX ${XXHASH_SOURCE_DIR} GIT_TAG "v0.6.5"
DOWNLOAD_NAME "xxhash" PREFIX ${XXHASH_SOURCE_DIR}
UPDATE_COMMAND "" DOWNLOAD_NAME "xxhash"
CONFIGURE_COMMAND "" UPDATE_COMMAND ""
BUILD_IN_SOURCE 1 BUILD_IN_SOURCE 1
PATCH_COMMAND PATCH_COMMAND
BUILD_COMMAND ${BUILD_CMD} CONFIGURE_COMMAND
INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/src/extern_xxhash/cmake_unofficial
TEST_COMMAND "" -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR}
) -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DBUILD_XXHSUM=OFF
-DCMAKE_GENERATOR_PLATFORM=x64
-DBUILD_SHARED_LIBS=OFF
${OPTIONAL_CACHE_ARGS}
TEST_COMMAND ""
)
else()
ExternalProject_Add(
extern_xxhash
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
GIT_TAG "v0.6.5"
PREFIX ${XXHASH_SOURCE_DIR}
DOWNLOAD_NAME "xxhash"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
PATCH_COMMAND
BUILD_COMMAND ${BUILD_CMD}
INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install
TEST_COMMAND ""
)
endif()
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") if (WIN32)
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib")
else()
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
endif ()
INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
add_library(xxhash STATIC IMPORTED GLOBAL) add_library(xxhash STATIC IMPORTED GLOBAL)
......
...@@ -266,7 +266,11 @@ function(cc_library TARGET_NAME) ...@@ -266,7 +266,11 @@ function(cc_library TARGET_NAME)
if("${cc_library_DEPS};" MATCHES "python;") if("${cc_library_DEPS};" MATCHES "python;")
list(REMOVE_ITEM cc_library_DEPS python) list(REMOVE_ITEM cc_library_DEPS python)
add_dependencies(${TARGET_NAME} python) add_dependencies(${TARGET_NAME} python)
target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") if(WIN32)
target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
else()
target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup")
endif(WIN32)
endif() endif()
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
...@@ -288,6 +292,45 @@ function(cc_library TARGET_NAME) ...@@ -288,6 +292,45 @@ function(cc_library TARGET_NAME)
endif(cc_library_SRCS) endif(cc_library_SRCS)
endfunction(cc_library) endfunction(cc_library)
# The link operation under windows may exceeds the maximum characters limit, simply break the link command
# into multiple link opeartion can fix that, say
# original:
# lib /out:target.lib a.lib b.lib c.lib d.lib
# after:
# 1. lib /out:dummy_lib_1.lib a.lib b.lib
# 2. lib /out:dummy_lib_2.lib c.lib d.lib
# 1. lib /out:target.lib dummy_lib_1.lib dummy_lib_2.lib
function(sep_library TARGET_NAME)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(dummy_index 1)
set(dummy_offset 1)
# the dummy target would be consisted of limit size libraries
set(dummy_limit 50)
list(LENGTH sep_library_DEPS sep_all_len)
foreach(v ${sep_library_DEPS})
list(APPEND dummy_list ${v})
list(LENGTH dummy_list listlen )
if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len}))
message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}")
cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} STATIC DEPS ${dummy_list})
foreach(i ${dummy_list})
list(REMOVE_AT dummy_list 0)
endforeach()
list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_lib_${dummy_index})
MATH(EXPR dummy_index "${dummy_index}+1")
endif()
MATH(EXPR dummy_offset "${dummy_offset}+1")
endforeach()
if(${sep_library_SHARED})
cc_library(${TARGET_NAME} SHARED SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
else(${sep_library_SHARED})
cc_library(${TARGET_NAME} STATIC SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
endif(${sep_library_SHARED})
endfunction(sep_library)
function(cc_binary TARGET_NAME) function(cc_binary TARGET_NAME)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
......
...@@ -22,144 +22,165 @@ function(copy TARGET) ...@@ -22,144 +22,165 @@ function(copy TARGET)
list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) if (NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers") message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
endif() endif ()
math(EXPR len "${copy_lib_SRCS_len} - 1") math(EXPR len "${copy_lib_SRCS_len} - 1")
add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS}) add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
foreach(index RANGE ${len}) foreach (index RANGE ${len})
list(GET copy_lib_SRCS ${index} src) list(GET copy_lib_SRCS ${index} src)
list(GET copy_lib_DSTS ${index} dst) list(GET copy_lib_DSTS ${index} dst)
add_custom_command(TARGET ${TARGET} PRE_BUILD if (WIN32)
COMMAND mkdir -p "${dst}" # windows cmd shell will not expand wildcard automatically.
COMMAND cp -r "${src}" "${dst}" # below expand the files,libs and copy them by rules.
COMMENT "copying ${src} -> ${dst}") file(GLOB header_files ${src} "*.h")
endforeach() file(GLOB static_lib_files ${src} "*.lib")
file(GLOB dll_lib_files ${src} "*.dll")
set(src_files ${header_files} ${static_lib_files} ${dll_lib_files})
if (NOT "${src_files}" STREQUAL "")
list(REMOVE_DUPLICATES src_files)
endif ()
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
)
foreach (src_file ${src_files})
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
COMMENT "copying ${src_file} -> ${dst}")
endforeach ()
else (WIN32) # not windows
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND mkdir -p "${dst}"
COMMAND cp -r "${src}" "${dst}"
COMMENT "copying ${src} -> ${dst}")
endif (WIN32) # not windows
endforeach ()
endfunction() endfunction()
# third party # third party
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
copy(eigen3_lib copy(eigen3_lib
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
DEPS eigen3 DEPS eigen3
) )
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags")
copy(gflags_lib copy(gflags_lib
SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS gflags DEPS gflags
) )
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog")
copy(glog_lib copy(glog_lib
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS glog DEPS glog
) )
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/")
copy(boost_lib copy(boost_lib
SRCS ${BOOST_INCLUDE_DIR}/boost SRCS ${BOOST_INCLUDE_DIR}/boost
DSTS ${dst_dir} DSTS ${dst_dir}
DEPS boost DEPS boost
) )
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
copy(xxhash_lib copy(xxhash_lib
SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS xxhash DEPS xxhash
) )
if(NOT PROTOBUF_FOUND) if (NOT PROTOBUF_FOUND)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
copy(protobuf_lib copy(protobuf_lib
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS extern_protobuf DEPS extern_protobuf
) )
endif() endif ()
if(NOT CBLAS_FOUND) if (NOT CBLAS_FOUND)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
copy(openblas_lib copy(openblas_lib
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir} DSTS ${dst_dir} ${dst_dir}
DEPS extern_openblas DEPS extern_openblas
) )
elseif (WITH_MKLML) elseif (WITH_MKLML)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
copy(mklml_lib copy(mklml_lib
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
DEPS mklml DEPS mklml
) )
endif() endif ()
if(WITH_MKLDNN) if (WITH_MKLDNN)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
copy(mkldnn_lib copy(mkldnn_lib
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS mkldnn DEPS mkldnn
) )
endif() endif ()
if (NOT WIN32) if (NOT WIN32)
if(NOT MOBILE_INFERENCE AND NOT RPI) if (NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
copy(snappy_lib copy(snappy_lib
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappy) DEPS snappy)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
copy(snappystream_lib copy(snappystream_lib
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappystream) DEPS snappystream)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS zlib) DEPS zlib)
endif() endif ()
endif(NOT WIN32) endif (NOT WIN32)
# paddle fluid module # paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework") set(module "framework")
if (NOT WIN32) if (NOT WIN32)
set(framework_lib_deps framework_py_proto) set(framework_lib_deps framework_py_proto)
endif(NOT WIN32) endif (NOT WIN32)
copy(framework_lib DEPS ${framework_lib_deps} copy(framework_lib DEPS ${framework_lib_deps}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
${src_dir}/${module}/ir/*.h ${src_dir}/${module}/ir/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
) )
set(module "memory") set(module "memory")
copy(memory_lib copy(memory_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
) )
set(inference_deps paddle_fluid_shared paddle_fluid) set(inference_deps paddle_fluid_shared paddle_fluid)
set(module "inference/api") set(module "inference/api")
if (WITH_ANAKIN AND WITH_MKL) if (WITH_ANAKIN AND WITH_MKL)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${ANAKIN_INSTALL_DIR} # anakin release ${ANAKIN_INSTALL_DIR} # anakin release
DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin) DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
list(APPEND inference_deps anakin_inference_lib) list(APPEND inference_deps anakin_inference_lib)
endif() endif ()
set(module "inference") set(module "inference")
copy(inference_lib DEPS ${inference_deps} copy(inference_lib DEPS ${inference_deps}
...@@ -167,30 +188,30 @@ copy(inference_lib DEPS ${inference_deps} ...@@ -167,30 +188,30 @@ copy(inference_lib DEPS ${inference_deps}
${src_dir}/${module}/api/paddle_*.h ${src_dir}/${module}/api/paddle_*.h
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
) )
set(module "platform") set(module "platform")
copy(platform_lib DEPS profiler_py_proto copy(platform_lib DEPS profiler_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
) )
set(module "string") set(module "string")
copy(string_lib copy(string_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
) )
set(module "pybind") set(module "pybind")
copy(pybind_lib copy(pybind_lib
SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
DSTS ${dst_dir}/${module} DSTS ${dst_dir}/${module}
) )
# CMakeCache Info # CMakeCache Info
copy(cmake_cache copy(cmake_cache
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INSTALL_DIR}) DSTS ${FLUID_INSTALL_DIR})
# This command generates a complete fluid library for both train and inference # This command generates a complete fluid library for both train and inference
add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
...@@ -198,9 +219,9 @@ add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) ...@@ -198,9 +219,9 @@ add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
# Following commands generate a inference-only fluid library # Following commands generate a inference-only fluid library
# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR} # third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
copy(third_party DEPS fluid_lib_dist copy(third_party DEPS fluid_lib_dist
SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR} DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
) )
# only need libpaddle_fluid.so/a and paddle_*.h for inference-only library # only need libpaddle_fluid.so/a and paddle_*.h for inference-only library
copy(inference_api_lib DEPS fluid_lib_dist copy(inference_api_lib DEPS fluid_lib_dist
...@@ -213,20 +234,20 @@ add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib) ...@@ -213,20 +234,20 @@ add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)
# paddle fluid version # paddle fluid version
function(version version_file) function(version version_file)
execute_process( execute_process(
COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_GIT_COMMIT) OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
file(WRITE ${version_file} file(WRITE ${version_file}
"GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
"WITH_MKL: ${WITH_MKL}\n" "WITH_MKL: ${WITH_MKL}\n"
"WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_MKLDNN: ${WITH_MKLDNN}\n"
"WITH_GPU: ${WITH_GPU}\n") "WITH_GPU: ${WITH_GPU}\n")
if(WITH_GPU) if (WITH_GPU)
file(APPEND ${version_file} file(APPEND ${version_file}
"CUDA version: ${CUDA_VERSION}\n" "CUDA version: ${CUDA_VERSION}\n"
"CUDNN version: v${CUDNN_MAJOR_VERSION}\n") "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
endif() endif ()
endfunction() endfunction()
version(${FLUID_INSTALL_DIR}/version.txt) version(${FLUID_INSTALL_DIR}/version.txt)
version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt) version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
../../../CONTRIBUTING.md ../../../CONTRIBUTING.md
\ No newline at end of file
...@@ -93,7 +93,7 @@ paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', ...@@ -93,7 +93,7 @@ paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized',
paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)) paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)) paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None))
paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False)) paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False))
paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
...@@ -128,6 +128,7 @@ paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', ...@@ -128,6 +128,7 @@ paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates',
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.selu ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
......
...@@ -4,11 +4,12 @@ add_subdirectory(framework) ...@@ -4,11 +4,12 @@ add_subdirectory(framework)
add_subdirectory(operators) add_subdirectory(operators)
add_subdirectory(string) add_subdirectory(string)
if (NOT WIN32)
add_subdirectory(pybind) add_subdirectory(pybind)
if (NOT WIN32)
add_subdirectory(recordio) add_subdirectory(recordio)
endif(NOT WIN32) endif(NOT WIN32)
# NOTE: please add subdirectory inference at last. # NOTE: please add subdirectory inference at last.
add_subdirectory(inference) add_subdirectory(inference)
add_subdirectory(train) add_subdirectory(train)
...@@ -136,20 +136,32 @@ cc_library(version SRCS version.cc) ...@@ -136,20 +136,32 @@ cc_library(version SRCS version.cc)
cc_test(version_test SRCS version_test.cc DEPS version) cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
if(NOT WIN32)
cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
endif(NOT WIN32)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
if (NOT WIN32)
py_proto_compile(framework_py_proto SRCS framework.proto) py_proto_compile(framework_py_proto SRCS framework.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module. # Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init) add_dependencies(framework_py_proto framework_py_proto_init)
add_custom_command(TARGET framework_py_proto POST_BUILD if (NOT WIN32)
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
COMMENT "Copy generated python proto into directory paddle/fluid/proto." COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
else(NOT WIN32)
string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
COMMAND copy /Y *.py ${proto_dstpath}
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif(NOT WIN32) endif(NOT WIN32)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
...@@ -163,10 +175,14 @@ if(WITH_DISTRIBUTE) ...@@ -163,10 +175,14 @@ if(WITH_DISTRIBUTE)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else() else()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) if(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
else(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
endif(NOT WIN32)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif() endif()
if (NOT WIN32) if (NOT WIN32)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
......
...@@ -79,9 +79,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -79,9 +79,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
BuildStrategy strategy_; BuildStrategy strategy_;
}; };
std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy() std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
const { bool finalize_strategy) const {
if (is_finalized_) {
return pass_builder_;
}
pass_builder_.reset(new ParallelExecutorPassBuilder(*this)); pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
if (finalize_strategy) {
is_finalized_ = true;
}
return pass_builder_; return pass_builder_;
} }
...@@ -95,10 +101,8 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -95,10 +101,8 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
#else #else
const bool use_cuda) const { const bool use_cuda) const {
#endif #endif
// Create a default one if not initialized by user. // Create a default one if not finalized by user.
if (!pass_builder_) { CreatePassesFromStrategy(false);
CreatePassesFromStrategy();
}
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program)); std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
......
...@@ -75,12 +75,20 @@ struct BuildStrategy { ...@@ -75,12 +75,20 @@ struct BuildStrategy {
bool remove_unnecessary_lock_{false}; bool remove_unnecessary_lock_{false};
// NOTE:
// Before you add new options, think if it's a general strategy that works
// with other strategy. If not, the strategy should be created through
// CreatePassesFromStrategy and the pass can be managed separately.
// User normally doesn't need to call this API. // User normally doesn't need to call this API.
// The PassBuilder allows for more customized insert, remove of passes // The PassBuilder allows for more customized insert, remove of passes
// from python side. // from python side.
// A new PassBuilder is created based on configs defined above and // A new PassBuilder is created based on configs defined above and
// passes are owned by the PassBuilder. // passes are owned by the PassBuilder.
std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const; std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy(
bool finalize_strategy) const;
bool IsFinalized() const { return is_finalized_; }
// Apply the passes built by the pass_builder_. The passes will be // Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph. // applied to the Program and output an ir::Graph.
...@@ -97,6 +105,7 @@ struct BuildStrategy { ...@@ -97,6 +105,7 @@ struct BuildStrategy {
#endif #endif
private: private:
mutable bool is_finalized_ = false;
mutable std::shared_ptr<ir::PassBuilder> pass_builder_; mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
}; };
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
...@@ -25,6 +26,7 @@ limitations under the License. */ ...@@ -25,6 +26,7 @@ limitations under the License. */
DECLARE_bool(benchmark); DECLARE_bool(benchmark);
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -81,6 +83,24 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, ...@@ -81,6 +83,24 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
} }
} }
static void EnableFusedOp(ExecutorPrepareContext* ctx) {
#ifdef PADDLE_WITH_NGRAPH
VLOG(3) << "use_ngraph=True";
auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_);
for (auto& interval : intervals) {
auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_,
interval.at(0), interval.at(1));
*interval[0] = std::unique_ptr<OperatorBase>(fused_op);
}
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
ctx->ops_.erase(it->at(0) + 1, it->at(1));
}
#else
LOG(WARNING)
<< "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
#endif
}
Executor::Executor(const platform::Place& place) : place_(place) {} Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() { void Executor::Close() {
...@@ -338,6 +358,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare( ...@@ -338,6 +358,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
} }
if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
return ctx; return ctx;
} }
...@@ -486,6 +507,5 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) { ...@@ -486,6 +507,5 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
<< "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
#endif #endif
} }
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -29,7 +29,7 @@ template <typename T> ...@@ -29,7 +29,7 @@ template <typename T>
class GarbageCollector { class GarbageCollector {
public: public:
GarbageCollector(const platform::Place &place, size_t max_memory_size) GarbageCollector(const platform::Place &place, size_t max_memory_size)
: max_memory_size_(std::max(max_memory_size, static_cast<size_t>(1))) { : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
garbages_.reset(new std::deque<T *>()); garbages_.reset(new std::deque<T *>());
dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
} }
......
...@@ -41,6 +41,7 @@ pass_library(seq_concat_fc_fuse_pass inference) ...@@ -41,6 +41,7 @@ pass_library(seq_concat_fc_fuse_pass inference)
pass_library(multi_batch_merge_pass base) pass_library(multi_batch_merge_pass base)
pass_library(conv_bn_fuse_pass inference) pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library(is_test_pass base)
if(WITH_MKLDNN) if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base) pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base) pass_library(depthwise_conv_mkldnn_pass base)
...@@ -62,6 +63,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r ...@@ -62,6 +63,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN) if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
......
...@@ -211,12 +211,12 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, ...@@ -211,12 +211,12 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
VLOG(30) << "LSTMWeight resized to " << out->dims(); VLOG(30) << "LSTMWeight resized to " << out->dims();
float* out_data = out->mutable_data<float>(platform::CPUPlace()); float* out_data = out->mutable_data<float>(platform::CPUPlace());
std::array<const float*, 4> tensors( std::array<const float*, 4> tensors{
{{W_forget_w0.data<float>(), W_input_w0.data<float>(), W_forget_w0.data<float>(), W_input_w0.data<float>(),
W_output_w0.data<float>(), W_cell_w0.data<float>()}}); W_output_w0.data<float>(), W_cell_w0.data<float>()};
std::array<const float*, 4> tensors1( std::array<const float*, 4> tensors1{
{{W_forget_w1.data<float>(), W_input_w1.data<float>(), W_forget_w1.data<float>(), W_input_w1.data<float>(),
W_output_w1.data<float>(), W_cell_w1.data<float>()}}); W_output_w1.data<float>(), W_cell_w1.data<float>()};
for (int row = 0; row < D; row++) { for (int row = 0; row < D; row++) {
for (int col = 0; col < 4; col++) { for (int col = 0; col < 4; col++) {
...@@ -238,9 +238,9 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, ...@@ -238,9 +238,9 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
const LoDTensor& B_output, const LoDTensor& B_cell, const LoDTensor& B_output, const LoDTensor& B_cell,
LoDTensor* out) { LoDTensor* out) {
std::array<const float*, 4> tensors( std::array<const float*, 4> tensors{
{{B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(), B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
B_cell.data<float>()}}); B_cell.data<float>()};
PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
int D = B_forget.dims()[0]; int D = B_forget.dims()[0];
......
...@@ -57,6 +57,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( ...@@ -57,6 +57,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
desc.SetInput("W", std::vector<std::string>({fc_Y_in})); desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
desc.SetInput("Bias", std::vector<std::string>({fc_bias_in})); desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
desc.SetOutput("Out", std::vector<std::string>({fc_out_out})); desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims"));
desc.SetType("fc"); desc.SetType("fc");
auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out}); GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});
......
...@@ -29,6 +29,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, ...@@ -29,6 +29,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
if (type == "mul") { if (type == "mul") {
op->SetInput("X", {inputs[0]}); op->SetInput("X", {inputs[0]});
op->SetInput("Y", {inputs[1]}); op->SetInput("Y", {inputs[1]});
op->SetAttr("x_num_col_dims", {1});
} else if (type == "elementwise_add") { } else if (type == "elementwise_add") {
op->SetInput("X", inputs); op->SetInput("X", inputs);
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include <string>
#include <utility>
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it "
"for activations and pooling.";
auto op_list = {"pool2d", "sigmoid", "logsigmoid",
"softshrink", "exp", "brelu",
"pow", "leaky_relu", "stanh",
"relu", "tanh", "tanh_shrink",
"sqrt", "abs", "ceil",
"elu", "floor", "cos",
"sin", "round", "reciprocal",
"hard_shrink", "hard_sigmoid", "relu6",
"soft_relu", "swish", "thresholded_relu",
"log", "square", "softplus",
"softsign"};
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
auto* op = n->Op();
if (op->HasAttr("is_test")) {
op->SetAttr("is_test", true);
} else if (std::find(begin(op_list), end(op_list), op->Type()) !=
end(op_list)) {
op->MutableAttrMap()->insert(
std::pair<std::string, Attribute>("is_test", true));
}
}
}
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(is_test_pass, paddle::framework::ir::IsTestPass);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class IsTestPass : public Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
enum class ISTEST_STATE { FALSE, TRUE, UNSET };
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn = false,
ISTEST_STATE is_test = ISTEST_STATE::UNSET) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("name", name);
op->SetInput("X", inputs);
op->SetOutput("Out", outputs);
op->SetAttr("use_mkldnn", use_mkldnn);
if (is_test == ISTEST_STATE::UNSET)
op->MutableAttrMap()->erase("is_test");
else if (is_test == ISTEST_STATE::FALSE)
op->SetAttr("is_test", false);
else
op->SetAttr("is_test", true);
}
// a->pool2d->b
// b->relu->c
// c,weights1)->conv2d->d
//
// d->pool2d->e
// e->hard_sigmoid->f
// (f,weights2)->conv2d->g
//
// g->pool2d->h
// h->tanh->i
// (i,weights3)->conv2d->j
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v :
std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h", "i",
"j", "weights1", "weights2", "weights3"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "weights1" || v == "weights2" || v == "weights3") {
var->SetPersistable(true);
}
}
SetOp(&prog, "pool2d", "pooling1", std::vector<std::string>({"a"}),
std::vector<std::string>({"b"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "relu", "activation1", std::vector<std::string>({"b"}),
std::vector<std::string>({"c"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "conv2d", "conv1", std::vector<std::string>({"c", "weights1"}),
std::vector<std::string>({"d"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "pool2d", "pooling2", std::vector<std::string>({"d"}),
std::vector<std::string>({"e"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "hard_sigmoid", "activation2", std::vector<std::string>({"e"}),
std::vector<std::string>({"f"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "conv2d", "conv2", std::vector<std::string>({"f", "weights2"}),
std::vector<std::string>({"g"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "pool2d", "pooling3", std::vector<std::string>({"g"}),
std::vector<std::string>({"h"}), false, ISTEST_STATE::UNSET);
SetOp(&prog, "tanh", "activation3", std::vector<std::string>({"h"}),
std::vector<std::string>({"i"}), true, ISTEST_STATE::UNSET);
SetOp(&prog, "conv2d", "conv3", std::vector<std::string>({"i", "weights3"}),
std::vector<std::string>({"j"}), false, ISTEST_STATE::UNSET);
return prog;
}
TEST(IsTestPass, basic) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("is_test_pass");
graph = pass->Apply(std::move(graph));
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "conv3") {
ASSERT_FALSE(op->HasAttr("is_test"));
} else {
ASSERT_TRUE(op->HasAttr("is_test"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
}
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(is_test_pass);
...@@ -17,7 +17,12 @@ limitations under the License. */ ...@@ -17,7 +17,12 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
// msvc15 don't support constexpr in correct way.
#if !defined(_WIN32)
constexpr char Node::kControlDepVarName[]; constexpr char Node::kControlDepVarName[];
#else
const char Node::kControlDepVarName[] = "__control_var";
#endif
std::unique_ptr<Node> CreateNodeForTest(const std::string& name, std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
Node::Type type) { Node::Type type) {
......
...@@ -55,7 +55,11 @@ class Node { ...@@ -55,7 +55,11 @@ class Node {
} }
enum class Type { kOperation, kVariable }; enum class Type { kOperation, kVariable };
#if !defined(_WIN32) // msvc not support constexpr correctly.
static constexpr char kControlDepVarName[] = "__control_var"; static constexpr char kControlDepVarName[] = "__control_var";
#else
static const char kControlDepVarName[];
#endif
Type NodeType() const { return type_; } Type NodeType() const { return type_; }
......
...@@ -197,26 +197,26 @@ struct PassRegistrar : public Registrar { ...@@ -197,26 +197,26 @@ struct PassRegistrar : public Registrar {
msg) msg)
// Register a new pass that can be applied on the IR. // Register a new pass that can be applied on the IR.
#define REGISTER_PASS(pass_type, pass_class) \ #define REGISTER_PASS(pass_type, pass_class) \
STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \
__reg_pass__##pass_type, \ __reg_pass__##pass_type, \
"REGISTER_PASS must be called in global namespace"); \ "REGISTER_PASS must be called in global namespace"); \
static ::paddle::framework::ir::PassRegistrar<pass_class> \ static ::paddle::framework::ir::PassRegistrar<pass_class> \
__pass_registrar_##pass_type##__(#pass_type); \ __pass_registrar_##pass_type##__(#pass_type); \
int TouchPassRegistrar_##pass_type() { \ int TouchPassRegistrar_##pass_type() { \
__pass_registrar_##pass_type##__.Touch(); \ __pass_registrar_##pass_type##__.Touch(); \
return 0; \ return 0; \
} \ } \
static ::paddle::framework::ir::PassRegistrar<pass_class> \ static ::paddle::framework::ir::PassRegistrar<pass_class> \
&__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \ &__pass_tmp_registrar_##pass_type##__ UNUSED = \
__pass_registrar_##pass_type##__ __pass_registrar_##pass_type##__
#define USE_PASS(pass_type) \ #define USE_PASS(pass_type) \
STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \
__use_pass_itself_##pass_type, \ __use_pass_itself_##pass_type, \
"USE_PASS must be called in global namespace"); \ "USE_PASS must be called in global namespace"); \
extern int TouchPassRegistrar_##pass_type(); \ extern int TouchPassRegistrar_##pass_type(); \
static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \ static int use_pass_itself_##pass_type##_ UNUSED = \
TouchPassRegistrar_##pass_type() TouchPassRegistrar_##pass_type()
} // namespace ir } // namespace ir
......
...@@ -70,6 +70,16 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, ...@@ -70,6 +70,16 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
} }
void NaiveExecutor::Run() { void NaiveExecutor::Run() {
#ifndef PADDLE_ON_INFERENCE
LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the "
"cmake flag ON_INFER is not set.";
LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and "
"variables will be reused to save the allocation "
"overhead.";
LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by "
"setting the cmake flag ON_INFER=ON if you are "
"running Paddle Inference";
#endif // PADDLE_ON_INFERENCE
for (auto &op : ops_) { for (auto &op : ops_) {
VLOG(3) << std::this_thread::get_id() << " run " << op->Type() VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
<< " on scope " << scope_; << " on scope " << scope_;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <functional>
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
std::map<std::string,
std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {};
void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
auto& op_type = op->Type();
NG_NODE_MAP[op_type](op, ngb_node_map);
}
} // namespace framework
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
class NgraphBridge {
public:
static std::map<
std::string,
std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NG_NODE_MAP;
explicit NgraphBridge(
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map)
: ngb_node_map(var_node_map) {}
void build_graph(const std::shared_ptr<OperatorBase>& op);
private:
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map;
};
} // namespace framework
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <glog/logging.h>
#include <algorithm>
#include <map>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h"
namespace paddle {
namespace framework {
static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
{proto::VarType::FP32, ngraph::element::f32},
{proto::VarType::FP64, ngraph::element::f64},
{proto::VarType::INT32, ngraph::element::i32},
{proto::VarType::INT64, ngraph::element::i64},
{proto::VarType::BOOL, ngraph::element::boolean},
};
typedef enum { /* nGraph support state on ops */
FULL_TRAIN, /* Support full ops for train */
PARTIAL_TRAIN, /* Support partial ops for train */
FULL_TEST, /* Support full list of ops for test */
PARTIAL_TEST /* Support partial list of ops for test */
} op_state;
class NgraphOperator {
public:
explicit NgraphOperator(const Scope& scope, const platform::Place& place,
const std::vector<std::shared_ptr<OperatorBase>>& ops,
const std::unordered_map<
std::string, ngraph::element::Type>& var_type_map,
const std::unordered_set<std::string>& persist,
const std::unordered_set<std::string>& fetches,
const std::unordered_set<std::string>& post_op_inputs,
op_state ng_op_state)
: scope_(scope),
place_(place),
fused_ops_(ops),
var_type_map_(var_type_map),
persistables_(persist),
fetches_(fetches),
post_op_inputs_(post_op_inputs),
ng_op_state_(ng_op_state) {}
void Run(const Scope& scope, const platform::Place& place) const;
private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache;
const Scope& scope_;
const platform::Place& place_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
op_state ng_op_state_;
};
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
FusedOperator::FusedOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) {
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
intervals;
if (ops->empty()) {
return intervals;
}
size_t size = ops->size();
size_t left = 0;
while (left < size && ops.at(left)->Type() != kFeedOpType) {
++left;
}
if (left == size) {
return intervals;
}
while (left < size && ops->at(left)->Type() == kFeedOpType) {
++left;
}
size_t right = left;
while (right < size && ops->at(right)->Type() != kFetchOpType) {
++right;
}
if (right == size) {
return intervals;
}
if (left >= right) return intervals;
// (left, right - 1) represents indices between feed and fetch
size_t pivot = left;
while (pivot < right) {
auto op_type = ops->at(pivot)->Type();
if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) ==
paddle::framework::NgraphBridge::NG_NODE_MAP.end()) {
++pivot;
} else {
size_t start = pivot, end = start;
while (pivot < right &&
(paddle::framework::NgraphBridge::NG_NODE_MAP.find(
ops.at(pivot)->Type()) !=
paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
++pivot;
++end;
}
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>
interval = {ops->begin() + start, ops->begin() + end};
intervals.push_back(interval);
}
} // end while
return intervals;
}
FusedOperator::FusedOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) {
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
it != end; ++it) {
fused_ops_.push_back(std::move(*it));
}
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = end;
(*it)->Type() != kFetchOpType; ++it) {
for (auto& var_name_item : (*it)->Inputs()) {
for (auto& var_name : var_name_item.second) {
post_op_inputs_.insert(var_name);
}
}
}
if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
is_complete = true;
}
Process();
}
void FusedOperator::Process() {
auto& bdesc = pdesc_.Block(block_);
for (auto& var : bdesc.AllVars()) {
if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
var->GetType() == proto::VarType::LOD_TENSOR ||
var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) {
continue;
}
auto var_name = var->Name();
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var_name != "fetch" && var_name != "feed") {
auto pd_type = var->GetDataType();
if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) {
PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
var_name);
}
var_type_map_[var_name] = pd2ng_type_map[pd_type];
}
if (var->Persistable()) {
persistables_.insert(var->Name());
}
}
for (auto* op : bdesc.AllOps()) {
if (op->Type() == kFetchOpType) {
std::string fetch_target_name = op->Input("X")[0];
fetches_.insert(fetch_target_name);
}
}
}
void FusedOperator::RunImpl(const Scope& scope,
const platform::Place& place) const {
op_state ng_op_state = PARTIAL_TEST;
auto& bdesc = pdesc_.Block(block_);
for (auto* op : bdesc.AllOps()) {
if (op->Type().find("_grad") != std::string::npos) {
ng_op_state = PARTIAL_TRAIN;
break;
}
}
if (is_full) {
ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
}
NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_,
persistables_, fetches_, post_op_inputs_,
ng_op_state);
ngraph_op.Run(scope, place);
}
} // namespace framework
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <atomic>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/variant.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
class FusedOperator : public OperatorBase {
public:
static std::vector<
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
FusedOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
explicit FusedOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type = "fused_op", const VariableNameMap& inputs = {},
const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
void RunImpl(const Scope& scope, const platform::Place& place) const final;
private:
const ProgramDesc pdesc_;
size_t block_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
bool is_full_ = false;
void Process();
};
} // namespace framework
} // namespace paddle
#endif
...@@ -63,6 +63,8 @@ struct OpKernelType { ...@@ -63,6 +63,8 @@ struct OpKernelType {
place_(dev_ctx.GetPlace()), place_(dev_ctx.GetPlace()),
library_type_(library_type) {} library_type_(library_type) {}
size_t hash_key() const { return Hash()(*this); }
bool operator==(const OpKernelType& o) const { bool operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) && return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
......
...@@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false, ...@@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// Combine two hash values to a single hash.
inline size_t CombineHash(size_t seed, size_t a) {
return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = { std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
...@@ -150,14 +155,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -150,14 +155,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#endif #endif
} }
// The profile has a process-wide mutex, results in serious performance issue // The profile has a process-wide mutex, results in serious performance issue
// in concurrency scenerio. Here use an `if` to fix this issue. // in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern. // Please not remove the `if`, ask @Superjomn if there are any concern.
#ifndef _WIN32
if (platform::IsProfileEnabled()) { if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place)); platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place); RunImpl(scope, place);
} else { } else
#endif
{
RunImpl(scope, place); RunImpl(scope, place);
} }
VLOG(30) << place << " " << DebugStringEx(&scope); VLOG(30) << place << " " << DebugStringEx(&scope);
...@@ -791,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack( ...@@ -791,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack(
Scope* OperatorWithKernel::TryTransferData( Scope* OperatorWithKernel::TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key, const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const { std::vector<std::string>* transfered_inplace_vars) const {
// In the inference scenerio, the scopes will be reused across the batches, so
// the `new_scope` here will result in GPU memroy explosion over the running of
// operators.
// We use a thread_local cache to fix that issue, the key in the cache is the
// combination of the `scope` argument, from_kernel_type, target_kernel_type.
// Have a discussion with @Superjomn or the inference developers if some changes
// on this logic for this macro might not tested on the other scenerios.
#ifdef PADDLE_ON_INFERENCE
thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
#endif
Scope* new_scope = nullptr; Scope* new_scope = nullptr;
for (auto& var_name_item : Inputs()) { for (auto& var_name_item : Inputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
...@@ -821,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData( ...@@ -821,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData(
VLOG(30) << "Transform Variable " << var_name << " from " VLOG(30) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key; << kernel_type_for_var << " to " << expected_kernel_key;
#ifdef PADDLE_ON_INFERENCE
size_t infer_cache_key =
CombineHash(OpKernelType::Hash()(kernel_type_for_var),
OpKernelType::Hash()(expected_kernel_key));
infer_cache_key =
CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
auto it = infer_transfer_scope_cache.find(infer_cache_key);
if (it != infer_transfer_scope_cache.end()) {
new_scope = infer_transfer_scope_cache[infer_cache_key];
} else {
new_scope = &scope.NewScope();
infer_transfer_scope_cache[infer_cache_key] = new_scope;
}
#endif
if (new_scope == nullptr) { if (new_scope == nullptr) {
new_scope = &scope.NewScope(); new_scope = &scope.NewScope();
} }
auto* trans_var = new_scope->Var(var_name); auto* trans_var = new_scope->Var(var_name);
Tensor out; Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
SetTensorToVariable(*var, out, trans_var); SetTensorToVariable(*var, out, trans_var);
......
...@@ -42,7 +42,7 @@ DEFINE_double( ...@@ -42,7 +42,7 @@ DEFINE_double(
// a mean time, but a scope may be read by multiple threads concurrently, and // a mean time, but a scope may be read by multiple threads concurrently, and
// the mutex will cause serious performance issue. // the mutex will cause serious performance issue.
// So the mutex is disabled when `ON_INFER`. // So the mutex is disabled when `ON_INFER`.
#ifdef ON_INFER #ifdef PADDLE_ON_INFERENCE
#define SCOPE_LOCK_GUARD #define SCOPE_LOCK_GUARD
#else #else
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_); #define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
......
...@@ -16,9 +16,21 @@ cc_library(paddle_fluid_api ...@@ -16,9 +16,21 @@ cc_library(paddle_fluid_api
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS)
if (WIN32)
list(APPEND fluid_third_partys gflags glog protobuf cblas)
endif(WIN32)
# paddle_fluid_origin exclude inference api interface # paddle_fluid_origin exclude inference api interface
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) if(WIN32)
sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid_origin ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
endif(WIN32)
add_subdirectory(api) add_subdirectory(api)
...@@ -28,8 +40,16 @@ set(SHARED_INFERENCE_SRCS ...@@ -28,8 +40,16 @@ set(SHARED_INFERENCE_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
# Create static library if(WIN32)
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
analysis_config paddle_pass_builder)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
analysis_config paddle_pass_builder)
endif(WIN32)
if(NOT APPLE) if(NOT APPLE)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
...@@ -38,11 +58,20 @@ if(NOT APPLE) ...@@ -38,11 +58,20 @@ if(NOT APPLE)
endif() endif()
# Create shared library # Create shared library
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} if(WIN32)
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
target_link_libraries(paddle_fluid_shared shlwapi)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid_origin ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
endif()
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(NOT APPLE) if(NOT APPLE AND NOT WIN32)
# TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map") set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map")
set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
......
...@@ -26,6 +26,7 @@ limitations under the License. */ ...@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -124,20 +125,6 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) { ...@@ -124,20 +125,6 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) {
return *var->GetMutable<T>(); return *var->GetMutable<T>();
} }
static void ExecShellCommand(const std::string &cmd, std::string *message) {
char buffer[128];
std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
if (!pipe) {
LOG(ERROR) << "error running command: " << cmd;
return;
}
while (!feof(pipe.get())) {
if (fgets(buffer, 128, pipe.get()) != nullptr) {
*message += buffer;
}
}
}
static framework::proto::ProgramDesc LoadProgramDesc( static framework::proto::ProgramDesc LoadProgramDesc(
const std::string &model_path) { const std::string &model_path) {
std::ifstream fin(model_path, std::ios::in | std::ios::binary); std::ifstream fin(model_path, std::ios::in | std::ios::binary);
......
...@@ -412,7 +412,7 @@ void DetachDeletedNodes(framework::ir::Graph *graph) { ...@@ -412,7 +412,7 @@ void DetachDeletedNodes(framework::ir::Graph *graph) {
void SubGraphFuser::ReplaceNodesWithSubGraphs() { void SubGraphFuser::ReplaceNodesWithSubGraphs() {
auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)(); auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)();
for (auto &subgraph : subgraphs) { for (auto &subgraph : subgraphs) {
if (subgraph.size() <= min_subgraph_size_) continue; if (subgraph.size() <= (size_t)min_subgraph_size_) continue;
LOG(INFO) << "detect a subgraph size " << subgraph.size(); LOG(INFO) << "detect a subgraph size " << subgraph.size();
std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end()); std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
// replace this sub-graph with the first node. Two steps: 1. Create a Block // replace this sub-graph with the first node. Two steps: 1. Create a Block
......
...@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, ...@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// it is either an OP's input or an OP's output. // it is either an OP's input or an OP's output.
auto &subgraph_nodes = *Agent(node).subgraph(); auto &subgraph_nodes = *Agent(node).subgraph();
for (int index = 0; index < block_desc.OpSize(); index++) { for (size_t index = 0; index < block_desc.OpSize(); index++) {
framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
auto correspond_node = subgraph_nodes[index]; auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
......
...@@ -45,7 +45,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { ...@@ -45,7 +45,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
std::unordered_set<std::string> teller_set( std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "dropout", "split"}); "elementwise_add", "dropout", "split", "prelu", "conv2d_transpose"});
if (!node->IsOp()) return false; if (!node->IsOp()) return false;
if (teller_set.count(node->Op()->Type())) { if (teller_set.count(node->Op()->Type())) {
......
...@@ -549,4 +549,6 @@ USE_TRT_CONVERTER(concat); ...@@ -549,4 +549,6 @@ USE_TRT_CONVERTER(concat);
USE_TRT_CONVERTER(dropout); USE_TRT_CONVERTER(dropout);
USE_TRT_CONVERTER(pad); USE_TRT_CONVERTER(pad);
USE_TRT_CONVERTER(split); USE_TRT_CONVERTER(split);
USE_TRT_CONVERTER(prelu);
USE_TRT_CONVERTER(conv2d_transpose);
#endif #endif
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <thread> #include <thread> // NOLINT
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
......
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <thread> //NOLINT #include <thread> //NOLINT
#include "utils.h" #include "utils.h" // NOLINT
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_bool(use_gpu, false, "Whether use gpu."); DEFINE_bool(use_gpu, false, "Whether use gpu.");
......
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); ...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
......
...@@ -15,9 +15,14 @@ ...@@ -15,9 +15,14 @@
#pragma once #pragma once
#include <glog/logging.h> #include <glog/logging.h>
#if !defined(_WIN32)
#include <sys/time.h> #include <sys/time.h>
#else
#endif
#include <algorithm> #include <algorithm>
#include <chrono> // NOLINT #include <chrono> // NOLINT
#include <iterator>
#include <numeric> #include <numeric>
#include <sstream> #include <sstream>
#include <string> #include <string>
......
...@@ -49,6 +49,8 @@ struct AnalysisConfig : public NativeConfig { ...@@ -49,6 +49,8 @@ struct AnalysisConfig : public NativeConfig {
void EnableTensorRtEngine(int workspace_size = 1 << 20, void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1); int max_batch_size = 1);
bool use_tensorrt() const { return use_tensorrt_; }
// NOTE this is just for internal development, please not use it. // NOTE this is just for internal development, please not use it.
// NOT stable yet. // NOT stable yet.
void EnableMKLDNN(); void EnableMKLDNN();
......
...@@ -86,12 +86,13 @@ class CpuPassStrategy : public PassStrategy { ...@@ -86,12 +86,13 @@ class CpuPassStrategy : public PassStrategy {
"fc_fuse_pass", // "fc_fuse_pass", //
"conv_bn_fuse_pass", // "conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
}); });
} }
virtual ~CpuPassStrategy() = default; virtual ~CpuPassStrategy() = default;
virtual void EnableMKLDNN() override { void EnableMKLDNN() override {
// TODO(Superjomn) Consider the way to mix CPU with GPU. // TODO(Superjomn) Consider the way to mix CPU with GPU.
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
passes_.insert(passes_.begin(), "mkldnn_placement_pass"); passes_.insert(passes_.begin(), "mkldnn_placement_pass");
...@@ -123,7 +124,7 @@ class GpuPassStrategy : public PassStrategy { ...@@ -123,7 +124,7 @@ class GpuPassStrategy : public PassStrategy {
GpuPassStrategy(const GpuPassStrategy &other) GpuPassStrategy(const GpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {} : PassStrategy(other.AllPasses()) {}
virtual void EnableMKLDNN() override; void EnableMKLDNN() override;
virtual ~GpuPassStrategy() = default; virtual ~GpuPassStrategy() = default;
}; };
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
nv_library(tensorrt_converter nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
pad_op.cc split_op.cc pad_op.cc split_op.cc prelu_op.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS nv_test(test_op_converter SRCS test_op_converter.cc DEPS
...@@ -16,7 +16,7 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc ...@@ -16,7 +16,7 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op conv_transpose_op SERIAL)
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op tensorrt_plugin SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op tensorrt_plugin SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
...@@ -33,4 +33,7 @@ nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc ...@@ -33,4 +33,7 @@ nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL) DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
split_op concat_op SERIAL) split_op concat_op SERIAL)
nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
prelu_op SERIAL)
...@@ -18,92 +18,139 @@ namespace paddle { ...@@ -18,92 +18,139 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
bool to_skip_merging_optimize(TensorRTEngine* engine_, bool to_skip_merging_optimize(TensorRTEngine* engine,
const std::vector<int>& filters, const std::vector<int>& filters,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
std::string input_name) { std::string input_name) {
if (engine_->itensor_quote_num[input_name] > 0) { if (engine->itensor_quote_num[input_name] > 0) {
return true; return true;
} }
if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 && if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0) strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
engine_->itensor_quote_num[input_name] += 1; engine->itensor_quote_num[input_name] += 1;
return false; return false;
} }
template <typename RegistFunc, typename SetDilationFunc>
void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode,
RegistFunc fadd_layer, SetDilationFunc fset_dilation,
const std::string& name) {
VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias";
framework::OpDesc op_desc(op, nullptr);
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
PADDLE_ENFORCE(engine != nullptr);
auto* X = engine->GetITensor(op_desc.Input("Input").front());
// Declare weights
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
platform::CPUPlace cpu_place;
std::unique_ptr<framework::LoDTensor> weight_tensor(
new framework::LoDTensor());
weight_tensor->Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
const int n_output = weight_tensor->dims()[0];
const int n_input = weight_tensor->dims()[1];
const int filter_h = weight_tensor->dims()[2];
const int filter_w = weight_tensor->dims()[3];
const int groups = boost::get<int>(op_desc.GetAttr("groups"));
const std::vector<int> dilations =
boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
static_cast<size_t>(weight_tensor->numel())};
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input,
nv_ksize, weight, bias);
PADDLE_ENFORCE(layer != nullptr);
layer->setStride(nv_strides);
layer->setPadding(nv_paddings);
layer->setNbGroups(groups);
// set dilations
fset_dilation(layer, nv_dilations);
auto output_name = op_desc.Output("Output").front();
layer->setName((name + " (Output: " + output_name + ")").c_str());
engine->weight_map[op_desc.Input("Filter").front()] =
std::move(weight_tensor);
layer->getOutput(0)->setName(output_name.c_str());
engine->SetITensor(output_name, layer->getOutput(0));
if (test_mode ||
to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
op_desc.Input("Input").front())) {
engine->DeclareOutput(output_name);
}
}
class Conv2dOpConverter : public OpConverter { class Conv2dOpConverter : public OpConverter {
public: public:
void operator()(const framework::proto::OpDesc& op, void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override { const framework::Scope& scope, bool test_mode) override {
VLOG(3) << "convert a fluid conv2d op to tensorrt conv layer without bias"; ConvertConv2d(
engine_, op, scope, test_mode,
framework::OpDesc op_desc(op, nullptr); [&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */
PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1); int n_input, /* Conv input maps */
PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1); TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* {
auto* layer =
auto* X = engine_->GetITensor(op_desc.Input("Input").front()); TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output,
ksize, weight.get(), bias.get());
// Declare weights return layer;
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); },
PADDLE_ENFORCE_NOT_NULL(Y_v); [](nvinfer1::IConvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>(); layer->setDilation(dilations);
},
platform::CPUPlace cpu_place; "conv2d");
std::unique_ptr<framework::LoDTensor> weight_tensor( }
new framework::LoDTensor()); };
weight_tensor->Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); class Deconv2dOpConverter : public OpConverter {
public:
auto* weight_data = void operator()(const framework::proto::OpDesc& op,
weight_tensor->mutable_data<float>(platform::CPUPlace()); const framework::Scope& scope, bool test_mode) override {
ConvertConv2d(
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); engine_, op, scope, test_mode,
const int n_output = weight_tensor->dims()[0]; [&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */
const int filter_h = weight_tensor->dims()[2]; int n_input, /* Deconv output maps */
const int filter_w = weight_tensor->dims()[3]; nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* {
const int groups = boost::get<int>(op_desc.GetAttr("groups")); auto* layer =
const std::vector<int> dilations = TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_input,
boost::get<std::vector<int>>(op_desc.GetAttr("dilations")); ksize, weight.get(), bias.get());
const std::vector<int> strides = return layer;
boost::get<std::vector<int>>(op_desc.GetAttr("strides")); },
const std::vector<int> paddings = [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) {
boost::get<std::vector<int>>(op_desc.GetAttr("paddings")); PADDLE_ENFORCE(
dilations.d[0] == 1 && dilations.d[1] == 1,
nvinfer1::DimsHW nv_ksize(filter_h, filter_w); "Dilations must be (1, 1) for tensorRT, but given (%d, %d)",
nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]); dilations.d[0], dilations.d[1]);
nvinfer1::DimsHW nv_strides(strides[0], strides[1]); },
nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); "conv2d_transpose");
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
weight_tensor->memory_size() / sizeof(float)};
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
auto* layer = TRT_ENGINE_ADD_LAYER(
engine_, Convolution, *const_cast<nvinfer1::ITensor*>(X), n_output,
nv_ksize, weight.get(), bias.get());
PADDLE_ENFORCE(layer != nullptr);
layer->setStride(nv_strides);
layer->setPadding(nv_paddings);
layer->setDilation(nv_dilations);
layer->setNbGroups(groups);
auto output_name = op_desc.Output("Output").front();
layer->setName(("conv2d (Output: " + output_name + ")").c_str());
engine_->weight_map[op_desc.Input("Filter").front()] =
std::move(weight_tensor);
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode ||
to_skip_merging_optimize(engine_, {filter_h, filter_w}, strides,
paddings, op_desc.Input("Input").front())) {
engine_->DeclareOutput(output_name);
}
} }
}; };
...@@ -112,3 +159,4 @@ class Conv2dOpConverter : public OpConverter { ...@@ -112,3 +159,4 @@ class Conv2dOpConverter : public OpConverter {
} // namespace paddle } // namespace paddle
REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter); REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
REGISTER_TRT_OP_CONVERTER(conv2d_transpose, Deconv2dOpConverter);
...@@ -34,7 +34,8 @@ class ElementwiseWeightOpConverter : public OpConverter { ...@@ -34,7 +34,8 @@ class ElementwiseWeightOpConverter : public OpConverter {
auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* X = engine_->GetITensor(op_desc.Input("X").front());
nvinfer1::Dims dims_x = X->getDimensions(); nvinfer1::Dims dims_x = X->getDimensions();
PADDLE_ENFORCE(dims_x.nbDims >= 3); PADDLE_ENFORCE(dims_x.nbDims >= 3, "x dims experts 3, but %d is given.",
dims_x.nbDims);
auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
PADDLE_ENFORCE_NOT_NULL(Y_v); PADDLE_ENFORCE_NOT_NULL(Y_v);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* PRelu converter from fluid to tensorRT.
*/
class PReluOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert fluid prelu op to tensorrt prelu layer";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
int input_num = op_desc.Input("X").size();
PADDLE_ENFORCE(input_num == 1);
auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
// Get output
size_t output_num = op_desc.Output("Out").size();
PADDLE_ENFORCE(output_num == 1);
// Get attrs
std::string mode = boost::get<std::string>(op_desc.GetAttr("mode"));
//
auto* alpha_var = scope.FindVar(op_desc.Input("Alpha")[0]);
PADDLE_ENFORCE_NOT_NULL(alpha_var);
auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();
platform::CUDAPlace place;
std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
new framework::LoDTensor());
alpha_tensor_device->Resize(alpha_tensor->dims());
TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
// Transform alpha to TensorRTEngine::Weight
TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
static_cast<void*>(alpha_data),
alpha_tensor_device->numel());
PReluPlugin* plugin = new PReluPlugin(alpha_rt, mode);
nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin);
// keep alpha tensor to avoid release it's memory
engine_->weight_map[op_desc.Input("Alpha")[0]] =
std::move(alpha_tensor_device);
std::string layer_name = "prelu (Output: ";
auto output_name = op_desc.Output("Out")[0];
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
layer_name += output_name;
if (test_mode) {
engine_->DeclareOutput(output_name);
}
layer->setName((layer_name + ")").c_str());
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(prelu, PReluOpConverter);
...@@ -26,7 +26,7 @@ class SplitOpConverter : public OpConverter { ...@@ -26,7 +26,7 @@ class SplitOpConverter : public OpConverter {
public: public:
void operator()(const framework::proto::OpDesc& op, void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override { const framework::Scope& scope, bool test_mode) override {
VLOG(40) << "convert a fluid split op to tensorrt split layer"; VLOG(4) << "convert a fluid split op to tensorrt split layer";
framework::OpDesc op_desc(op, nullptr); framework::OpDesc op_desc(op, nullptr);
// Declare inputs // Declare inputs
......
...@@ -16,6 +16,9 @@ limitations under the License. */ ...@@ -16,6 +16,9 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
USE_OP(conv2d);
USE_OP(conv2d_transpose);
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
...@@ -51,7 +54,37 @@ TEST(conv2d_op, test) { ...@@ -51,7 +54,37 @@ TEST(conv2d_op, test) {
validator.Execute(3); validator.Execute(3);
} }
TEST(conv2d_transpose_op, test) {
std::unordered_set<std::string> parameters({"deconv2d-Y"});
framework::Scope scope;
TRTConvertValidation validator(5, parameters, scope, 1 << 15);
validator.DeclInputVar("deconv2d-X", nvinfer1::Dims3(3, 5, 5));
validator.DeclParamVar("deconv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
validator.DeclOutputVar("deconv2d-Out", nvinfer1::Dims3(2, 5, 5));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("conv2d_transpose");
desc.SetInput("Input", {"deconv2d-X"});
desc.SetInput("Filter", {"deconv2d-Y"});
desc.SetOutput("Output", {"deconv2d-Out"});
const std::vector<int> strides({1, 1});
const std::vector<int> paddings({1, 1});
const std::vector<int> dilations({1, 1});
const int groups = 1;
desc.SetAttr("strides", strides);
desc.SetAttr("paddings", paddings);
desc.SetAttr("dilations", dilations);
desc.SetAttr("groups", groups);
validator.SetOp(*desc.Proto());
validator.Execute(3);
}
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
USE_OP(conv2d);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(prelu_op, test_channel_wise) {
std::unordered_set<std::string> parameters({"prelu_alpha"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(3, 1, 1));
validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("prelu");
desc.SetInput("X", {"prelu_input"});
desc.SetInput("Alpha", {"prelu_alpha"});
desc.SetOutput("Out", {"prelu_out"});
desc.SetAttr("mode", std::string("channel"));
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
TEST(prelu_op, test_element_wise) {
std::unordered_set<std::string> parameters({"prelu_alpha"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclParamVar("prelu_alpha", nvinfer1::Dims4(10, 3, 2, 2));
validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("prelu");
desc.SetInput("X", {"prelu_input"});
desc.SetInput("Alpha", {"prelu_alpha"});
desc.SetOutput("Out", {"prelu_out"});
desc.SetAttr("mode", std::string("element"));
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
TEST(prelu_op, test_scalar) {
std::unordered_set<std::string> parameters({"prelu_alpha"});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2));
validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(1, 1, 1));
validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("prelu");
desc.SetInput("X", {"prelu_input"});
desc.SetInput("Alpha", {"prelu_alpha"});
desc.SetOutput("Out", {"prelu_out"});
desc.SetAttr("mode", std::string("all"));
validator.SetOp(*desc.Proto());
validator.Execute(1);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// USE_OP(prelu);
USE_CPU_ONLY_OP(prelu);
...@@ -200,7 +200,8 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, ...@@ -200,7 +200,8 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
Buffer &TensorRTEngine::buffer(const std::string &name) { Buffer &TensorRTEngine::buffer(const std::string &name) {
PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first."); PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
auto it = buffer_sizes_.find(name); auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
name);
auto slot_offset = infer_engine_->getBindingIndex(name.c_str()); auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
return buffers_[slot_offset]; return buffers_[slot_offset];
} }
......
...@@ -40,6 +40,7 @@ class TensorRTEngine : public EngineBase { ...@@ -40,6 +40,7 @@ class TensorRTEngine : public EngineBase {
// Weight is model parameter. // Weight is model parameter.
class Weight { class Weight {
public: public:
Weight() = default;
Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) { Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
w_.type = dtype; w_.type = dtype;
w_.values = value; w_.values = value;
......
nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu
avg_pool_op_plugin.cu DEPS enforce pooling) prelu_op_plugin.cu avg_pool_op_plugin.cu DEPS enforce pooling)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <cassert>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
static const int CUDA_NUM_THREADS = 1024;
static const int CUDA_MAX_NUM_BLOCKS = 65535;
inline static int GET_NUM_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
__global__ void PReluChannelWiseKernel(const float *input, const float *alpha,
float *output, int channel,
size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float *out = output + offset;
float scale = alpha[blockIdx.x % channel];
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
__global__ void PReluElementWiseKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
const float *scale = alpha + offset;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale[i] * x;
}
}
__global__ void PReluScalarKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float scale = *alpha;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
static inline void PReluChannelWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, dims.d[0], spatial_size);
}
static inline void PReluElementWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
static inline void PReluScalar(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size, const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims,
int nbInputs) {
assert(nbInputs == 1);
assert(index < this->getNbOutputs());
nvinfer1::Dims const &input_dims = inputDims[0];
nvinfer1::Dims output_dims = input_dims;
return output_dims;
}
int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) {
// input dims is CHW.
const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]);
const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
float *output = reinterpret_cast<float **>(outputs)[0];
if (mode_ == "channel") {
PReluChannelWise(stream, input, alpha, output, batchSize, input_dims);
} else if (mode_ == "element") {
PReluElementWise(stream, input, alpha, output, batchSize, input_dims);
} else {
PReluScalar(stream, input, alpha, output, batchSize, input_dims);
}
return cudaGetLastError() != cudaSuccess;
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class PReluPlugin : public PluginTensorRT {
TensorRTEngine::Weight alpha_;
std::string mode_;
protected:
size_t getSerializationSize() override {
// return getBaseSerializationSize(alpha_) + SerializedSize(mode_);
return 0;
}
// TRT will call this func when we need to serialize the configuration of
// tensorrt.
// It should not be called by users.
void serialize(void *buffer) override {
// serializeBase(buffer);
// SerializeValue(&buffer, alpha_);
// SerializeValue(&buffer, mode_);
}
public:
PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode)
: alpha_(alpha), mode_(mode) {}
// It was used for tensorrt deserialization.
// It should not be called by users.
PReluPlugin(void const *serialData, size_t serialLength) {
// deserializeBase(serialData, serialLength);
// DeserializeValue(&serialData, &serialLength, &alpha_);
// DeserializeValue(&serialData, &serialLength, &mode_);
}
PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); }
const char *getPluginType() const override { return "prelu"; }
int getNbOutputs() const override { return 1; }
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
int nbInputDims) override;
int enqueue(int batchSize, const void *const *inputs, void **outputs,
void *workspace, cudaStream_t stream) override;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
...@@ -45,11 +45,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 ...@@ -45,11 +45,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
# DAM # DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
inference_analysis_test(test_analyzer_dam SRCS analyzer_dam_tester.cc inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS
--infer_model=${DAM_INSTALL_DIR}/model
--infer_data=${DAM_INSTALL_DIR}/data.txt
--use_analysis=0)
# chinese_ner # chinese_ner
set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
...@@ -82,6 +78,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te ...@@ -82,6 +78,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te
inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
"${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
# mobilenet with depthwise_conv op
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
# anakin # anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
# anakin rnn1 # anakin rnn1
...@@ -108,8 +108,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -108,8 +108,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
endif() endif()
inference_analysis_test(test_trt_models SRCS trt_models_tester.cc inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor
ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL) ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
endif() endif()
...@@ -69,7 +69,7 @@ struct DataRecord { ...@@ -69,7 +69,7 @@ struct DataRecord {
num_lines++; num_lines++;
std::vector<std::string> data; std::vector<std::string> data;
split(line, ',', &data); split(line, ',', &data);
CHECK_EQ(data.size(), 2 * MAX_TURN_NUM + 3); CHECK_EQ(data.size(), (size_t)(2 * MAX_TURN_NUM + 3));
// load turn data // load turn data
std::vector<int64_t> turns_tmp[MAX_TURN_NUM]; std::vector<int64_t> turns_tmp[MAX_TURN_NUM];
for (int i = 0; i < MAX_TURN_NUM; ++i) { for (int i = 0; i < MAX_TURN_NUM; ++i) {
...@@ -178,7 +178,8 @@ TEST(Analyzer_dam, profile) { ...@@ -178,7 +178,8 @@ TEST(Analyzer_dam, profile) {
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
PADDLE_ENFORCE_GT(outputs.size(), 0); PADDLE_ENFORCE_GT(outputs.size(), 0);
...@@ -196,15 +197,13 @@ TEST(Analyzer_dam, fuse_statis) { ...@@ -196,15 +197,13 @@ TEST(Analyzer_dam, fuse_statis) {
contrib::AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
if (FLAGS_use_analysis) { int num_ops;
int num_ops; auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg); auto fuse_statis = GetFuseStatis(
auto fuse_statis = GetFuseStatis( static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 317);
EXPECT_EQ(fuse_statis.at("fc_fuse"), 317); EXPECT_EQ(num_ops, 2020);
EXPECT_EQ(num_ops, 2020);
}
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
...@@ -215,9 +214,8 @@ TEST(Analyzer_dam, compare) { ...@@ -215,9 +214,8 @@ TEST(Analyzer_dam, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
if (FLAGS_use_analysis) { CompareNativeAndAnalysis(
CompareNativeAndAnalysis(cfg, input_slots_all); reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
} }
} // namespace inference } // namespace inference
......
...@@ -133,7 +133,8 @@ TEST(Analyzer_LAC, profile) { ...@@ -133,7 +133,8 @@ TEST(Analyzer_LAC, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result // the first inference result
...@@ -175,7 +176,8 @@ TEST(Analyzer_LAC, compare) { ...@@ -175,7 +176,8 @@ TEST(Analyzer_LAC, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
} // namespace analysis } // namespace analysis
......
...@@ -121,7 +121,8 @@ TEST(Analyzer_Chinese_ner, profile) { ...@@ -121,7 +121,8 @@ TEST(Analyzer_Chinese_ner, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result // the first inference result
...@@ -160,7 +161,8 @@ TEST(Analyzer_Chinese_ner, compare) { ...@@ -160,7 +161,8 @@ TEST(Analyzer_Chinese_ner, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
} // namespace inference } // namespace inference
......
...@@ -45,7 +45,8 @@ void profile(bool use_mkldnn = false) { ...@@ -45,7 +45,8 @@ void profile(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
} }
TEST(Analyzer_resnet50, profile) { profile(); } TEST(Analyzer_resnet50, profile) { profile(); }
...@@ -74,7 +75,8 @@ void compare(bool use_mkldnn = false) { ...@@ -74,7 +75,8 @@ void compare(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
TEST(Analyzer_resnet50, compare) { compare(); } TEST(Analyzer_resnet50, compare) { compare(); }
......
...@@ -233,8 +233,8 @@ TEST(Analyzer_rnn1, profile) { ...@@ -233,8 +233,8 @@ TEST(Analyzer_rnn1, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
LOG(INFO) << "to test prediction"; TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); input_slots_all, &outputs, FLAGS_num_threads);
} }
// Check the fuse status // Check the fuse status
...@@ -261,7 +261,8 @@ TEST(Analyzer_rnn1, compare) { ...@@ -261,7 +261,8 @@ TEST(Analyzer_rnn1, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
// Test Multi-Thread. // Test Multi-Thread.
...@@ -272,7 +273,8 @@ TEST(Analyzer_rnn1, multi_thread) { ...@@ -272,7 +273,8 @@ TEST(Analyzer_rnn1, multi_thread) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, 4 /* multi_thread */); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, 4 /* multi_thread */);
} }
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
......
...@@ -132,7 +132,8 @@ TEST(Analyzer_rnn2, profile) { ...@@ -132,7 +132,8 @@ TEST(Analyzer_rnn2, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result // the first inference result
...@@ -153,7 +154,8 @@ TEST(Analyzer_rnn2, compare) { ...@@ -153,7 +154,8 @@ TEST(Analyzer_rnn2, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
} // namespace inference } // namespace inference
......
...@@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) { ...@@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
// the first inference result // the first inference result
...@@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) { ...@@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
} // namespace inference } // namespace inference
......
...@@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) { ...@@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1) { if (FLAGS_num_threads == 1) {
// Get output // Get output
...@@ -101,7 +102,8 @@ TEST(Analyzer_Text_Classification, compare) { ...@@ -101,7 +102,8 @@ TEST(Analyzer_Text_Classification, compare) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) { TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
...@@ -112,7 +114,8 @@ TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) { ...@@ -112,7 +114,8 @@ TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
} // namespace inference } // namespace inference
......
...@@ -59,9 +59,6 @@ void SetConfig(AnalysisConfig *cfg) { ...@@ -59,9 +59,6 @@ void SetConfig(AnalysisConfig *cfg) {
cfg->specify_input_name = true; cfg->specify_input_name = true;
// TODO(TJ): fix fusion gru // TODO(TJ): fix fusion gru
cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
#ifdef PADDLE_WITH_MKLDNN
cfg->EnableMKLDNN();
#endif
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
...@@ -94,7 +91,8 @@ void profile(bool use_mkldnn = false) { ...@@ -94,7 +91,8 @@ void profile(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
const float ocr_result_data[] = { const float ocr_result_data[] = {
...@@ -136,7 +134,8 @@ void compare(bool use_mkldnn = false) { ...@@ -136,7 +134,8 @@ void compare(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all); CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
TEST(Analyzer_vis, compare) { compare(); } TEST(Analyzer_vis, compare) { compare(); }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <ostream>
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
namespace inference {
thread_local int num_spaces = 0;
static std::string GenSpaces(int num_spaces) {
std::ostringstream os;
for (int i = 0; i < num_spaces; ++i) {
os << " ";
}
return os.str();
}
std::ostream &operator<<(std::ostream &os,
const PaddlePredictor::Config &config) {
os << GenSpaces(num_spaces) << "PaddlePredictor::Config {\n";
num_spaces++;
os << GenSpaces(num_spaces) << "model_dir: " << config.model_dir << "\n";
num_spaces--;
os << GenSpaces(num_spaces) << "}\n";
return os;
}
std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
os << GenSpaces(num_spaces) << "NativeConfig {\n";
num_spaces++;
os << *reinterpret_cast<const PaddlePredictor::Config *>(&config);
os << GenSpaces(num_spaces) << "use_gpu: " << config.use_gpu << "\n";
os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
os << GenSpaces(num_spaces)
<< "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
os << GenSpaces(num_spaces)
<< "specify_input_name: " << config.specify_input_name << "\n";
num_spaces--;
os << GenSpaces(num_spaces) << "}\n";
return os;
}
std::ostream &operator<<(std::ostream &os,
const contrib::AnalysisConfig &config) {
os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
num_spaces++;
os << *reinterpret_cast<const NativeConfig *>(&config);
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
<< "\n";
os << GenSpaces(num_spaces)
<< "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n";
os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt()
<< "\n";
os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n";
num_spaces--;
os << GenSpaces(num_spaces) << "}\n";
return os;
}
} // namespace inference
} // namespace paddle
...@@ -19,13 +19,16 @@ ...@@ -19,13 +19,16 @@
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/tests/api/config_printer.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -38,10 +41,18 @@ DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); ...@@ -38,10 +41,18 @@ DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
DEFINE_bool(use_analysis, true, DEFINE_bool(use_analysis, true,
"Running the inference program in analysis mode."); "Running the inference program in analysis mode.");
DECLARE_bool(profile);
namespace paddle { namespace paddle {
namespace inference { namespace inference {
using contrib::AnalysisConfig; void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
if (use_analysis) {
LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config);
return;
}
LOG(INFO) << *config;
}
void CompareResult(const std::vector<PaddleTensor> &outputs, void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &ref_outputs) { const std::vector<PaddleTensor> &ref_outputs) {
...@@ -77,12 +88,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs, ...@@ -77,12 +88,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
} }
std::unique_ptr<PaddlePredictor> CreateTestPredictor( std::unique_ptr<PaddlePredictor> CreateTestPredictor(
const AnalysisConfig &config, bool use_analysis = true) { const PaddlePredictor::Config *config, bool use_analysis = true) {
if (use_analysis) { if (use_analysis) {
return CreatePaddlePredictor<contrib::AnalysisConfig>(config); return CreatePaddlePredictor<contrib::AnalysisConfig>(
} else { *(reinterpret_cast<const contrib::AnalysisConfig *>(config)));
return CreatePaddlePredictor<NativeConfig>(config);
} }
return CreatePaddlePredictor<NativeConfig>(
*(reinterpret_cast<const NativeConfig *>(config)));
} }
size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
...@@ -111,11 +123,23 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor, ...@@ -111,11 +123,23 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
} }
void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs, void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
const std::string &dirname) { const std::string &dirname, bool is_combined = true,
std::string model_filename = "model",
std::string params_filename = "params") {
// Set fake_image_data // Set fake_image_data
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
std::vector<std::vector<int64_t>> feed_target_shapes = std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(
GetFeedTargetShapes(dirname, true, "model", "params"); dirname, is_combined, model_filename, params_filename);
std::ostringstream os;
for (size_t i = 0; i < feed_target_shapes.size(); ++i) {
os << "feed target " << i << ": {" << feed_target_shapes[i][0];
for (size_t j = 1; j < feed_target_shapes[i].size(); ++j) {
os << ", " << feed_target_shapes[i][j];
}
os << "}\n";
}
LOG(INFO) << os.str();
int dim1 = feed_target_shapes[0][1]; int dim1 = feed_target_shapes[0][1];
int dim2 = feed_target_shapes[0][2]; int dim2 = feed_target_shapes[0][2];
int dim3 = feed_target_shapes[0][3]; int dim3 = feed_target_shapes[0][3];
...@@ -139,25 +163,43 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs, ...@@ -139,25 +163,43 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
} }
void TestOneThreadPrediction( void TestOneThreadPrediction(
const AnalysisConfig &config, const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, bool use_analysis = true) { std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
int batch_size = FLAGS_batch_size; int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat; int num_times = FLAGS_repeat;
auto predictor = CreateTestPredictor(config, use_analysis); auto predictor = CreateTestPredictor(config, use_analysis);
Timer timer;
timer.tic(); // warmup run
for (int i = 0; i < num_times; i++) { LOG(INFO) << "Warm up run...";
for (size_t j = 0; j < inputs.size(); j++) { {
predictor->Run(inputs[j], outputs); Timer warmup_timer;
warmup_timer.tic();
predictor->Run(inputs[0], outputs, batch_size);
PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1);
#if !defined(_WIN32)
if (FLAGS_profile) {
paddle::platform::ResetProfiler();
}
#endif
}
LOG(INFO) << "Run " << num_times << " times...";
{
Timer run_timer;
run_timer.tic();
for (int i = 0; i < num_times; i++) {
for (size_t j = 0; j < inputs.size(); j++) {
predictor->Run(inputs[j], outputs, batch_size);
}
} }
PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times,
inputs.size());
} }
PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
inputs.size());
} }
void TestMultiThreadPrediction( void TestMultiThreadPrediction(
const AnalysisConfig &config, const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads, std::vector<PaddleTensor> *outputs, int num_threads,
bool use_analysis = true) { bool use_analysis = true) {
...@@ -200,12 +242,11 @@ void TestMultiThreadPrediction( ...@@ -200,12 +242,11 @@ void TestMultiThreadPrediction(
} }
} }
void TestPrediction(const AnalysisConfig &config, void TestPrediction(const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
std::vector<PaddleTensor> *outputs, int num_threads, std::vector<PaddleTensor> *outputs, int num_threads,
bool use_analysis = FLAGS_use_analysis) { bool use_analysis = FLAGS_use_analysis) {
LOG(INFO) << "use_analysis: " << use_analysis PrintConfig(config, use_analysis);
<< ", use_mkldnn: " << config.use_mkldnn();
if (num_threads == 1) { if (num_threads == 1) {
TestOneThreadPrediction(config, inputs, outputs, use_analysis); TestOneThreadPrediction(config, inputs, outputs, use_analysis);
} else { } else {
...@@ -215,9 +256,9 @@ void TestPrediction(const AnalysisConfig &config, ...@@ -215,9 +256,9 @@ void TestPrediction(const AnalysisConfig &config,
} }
void CompareNativeAndAnalysis( void CompareNativeAndAnalysis(
const AnalysisConfig &config, const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs) { const std::vector<std::vector<PaddleTensor>> &inputs) {
LOG(INFO) << "use_mkldnn: " << config.use_mkldnn(); PrintConfig(config, true);
std::vector<PaddleTensor> native_outputs, analysis_outputs; std::vector<PaddleTensor> native_outputs, analysis_outputs;
TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &native_outputs, false);
TestOneThreadPrediction(config, inputs, &analysis_outputs, true); TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
// You may obtain a copy of the License at You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and See the License for the specific language governing permissions and
// limitations under the License. limitations under the License. */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle { namespace paddle {
using paddle::contrib::AnalysisConfig; namespace inference {
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_bool(use_tensorrt, true, "Test the performance of TensorRT engine.");
DEFINE_string(prog_filename, "", "Name of model file.");
NativeConfig GetConfigNative() { DEFINE_string(param_filename, "", "Name of parameters file.");
NativeConfig config;
config.model_dir = FLAGS_dirname; template <typename ConfigType>
// LOG(INFO) << "dirname " << config.model_dir; void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu,
config.fraction_of_gpu_memory = 0.15; bool use_tensorrt = false, int batch_size = -1) {
config.use_gpu = true; if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
config.device = 0; config->prog_file = model_dir + "/" + FLAGS_prog_filename;
return config; config->param_file = model_dir + "/" + FLAGS_param_filename;
} } else {
config->model_dir = model_dir;
void PrepareTRTConfig(AnalysisConfig *config) { }
config->model_dir = FLAGS_dirname + "/" + "mobilenet"; if (use_gpu) {
config->fraction_of_gpu_memory = 0.15; config->use_gpu = true;
config->EnableTensorRtEngine(1 << 10, 5); config->device = 0;
config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->fraction_of_gpu_memory = 0.15;
config->pass_builder()->DeletePass("fc_fuse_pass"); }
config->pass_builder()->TurnOnDebug();
} }
void PrepareInputs(std::vector<PaddleTensor> *tensors, int batch_size) { template <>
PADDLE_ENFORCE_EQ(tensors->size(), 1UL); void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config,
auto &tensor = tensors->front(); std::string model_dir, bool use_gpu,
int height = 224; bool use_tensorrt, int batch_size) {
int width = 224; if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
float *data = new float[batch_size * 3 * height * width]; config->prog_file = model_dir + "/" + FLAGS_prog_filename;
memset(data, 0, sizeof(float) * (batch_size * 3 * height * width)); config->param_file = model_dir + "/" + FLAGS_param_filename;
data[0] = 1.0f; } else {
config->model_dir = model_dir;
// Prepare inputs }
tensor.name = "input_0"; if (use_gpu) {
tensor.shape = std::vector<int>({batch_size, 3, height, width}); config->use_gpu = true;
tensor.data = PaddleBuf(static_cast<void *>(data), config->device = 0;
sizeof(float) * (batch_size * 3 * height * width)); config->fraction_of_gpu_memory = 0.15;
tensor.dtype = PaddleDType::FLOAT32; if (use_tensorrt) {
config->EnableTensorRtEngine(1 << 10, batch_size);
config->pass_builder()->DeletePass("conv_bn_fuse_pass");
config->pass_builder()->DeletePass("fc_fuse_pass");
config->pass_builder()->TurnOnDebug();
} else {
config->enable_ir_optim = true;
}
}
} }
void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) { void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
auto config0 = GetConfigNative(); std::vector<std::vector<PaddleTensor>> inputs_all;
config0.model_dir = model_dirname; if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
AnalysisConfig config1(true); FLAGS_param_filename);
PrepareTRTConfig(&config1); } else {
config1.model_dir = model_dirname; SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0);
auto predictor1 = CreatePaddlePredictor(config1);
// Prepare inputs
std::vector<PaddleTensor> paddle_tensor_feeds(1);
PrepareInputs(&paddle_tensor_feeds, batch_size);
// Prepare outputs
std::vector<PaddleTensor> outputs0;
std::vector<PaddleTensor> outputs1;
CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
const size_t num_elements = outputs0.front().data.length() / sizeof(float);
const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
EXPECT_EQ(num_elements, num_elements1);
auto *data0 = static_cast<float *>(outputs0.front().data.data());
auto *data1 = static_cast<float *>(outputs1.front().data.data());
ASSERT_GT(num_elements, 0UL);
for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
EXPECT_NEAR(data0[i], data1[i], 1e-3);
} }
}
TEST(trt_models_test, mobilenet) { std::vector<PaddleTensor> outputs;
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "mobilenet"); if (use_analysis || use_tensorrt) {
} contrib::AnalysisConfig config(true);
TEST(trt_models_test, resnet50) { SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt,
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnet50"); FLAGS_batch_size);
} TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
TEST(trt_models_test, resnext50) { inputs_all, &outputs, FLAGS_num_threads, true);
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + "resnext50"); } else {
NativeConfig config;
SetConfig<NativeConfig>(&config, model_dir, true, false);
TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
inputs_all, &outputs, FLAGS_num_threads, false);
}
} }
TEST(trt_models_test, raw_gpu) { void compare(std::string model_dir, bool use_tensorrt) {
std::string model_dir = FLAGS_dirname + "/" + "mobilenet"; std::vector<std::vector<PaddleTensor>> inputs_all;
auto config0 = GetConfigNative(); if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
config0.model_dir = model_dir; SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
int batch_size = 2; FLAGS_param_filename);
} else {
AnalysisConfig config1(true); SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
config1.fraction_of_gpu_memory = 0.1; }
config1.enable_ir_optim = true;
config1.model_dir = model_dir;
auto predictor0 = CreatePaddlePredictor<NativeConfig>(config0); std::vector<PaddleTensor> native_outputs;
auto predictor1 = CreatePaddlePredictor(config1); NativeConfig native_config;
SetConfig<NativeConfig>(&native_config, model_dir, true, false,
FLAGS_batch_size);
TestOneThreadPrediction(
reinterpret_cast<PaddlePredictor::Config*>(&native_config), inputs_all,
&native_outputs, false);
std::vector<PaddleTensor> analysis_outputs;
contrib::AnalysisConfig analysis_config(true);
SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
use_tensorrt, FLAGS_batch_size);
TestOneThreadPrediction(
reinterpret_cast<PaddlePredictor::Config*>(&analysis_config), inputs_all,
&analysis_outputs, true);
CompareResult(native_outputs, analysis_outputs);
}
// Prepare inputs TEST(TensorRT_mobilenet, compare) {
std::vector<PaddleTensor> paddle_tensor_feeds(1); std::string model_dir = FLAGS_infer_model + "/mobilenet";
PrepareInputs(&paddle_tensor_feeds, batch_size); compare(model_dir, /* use_tensorrt */ true);
}
// Prepare outputs TEST(TensorRT_resnet50, compare) {
std::vector<PaddleTensor> outputs0; std::string model_dir = FLAGS_infer_model + "/resnet50";
std::vector<PaddleTensor> outputs1; compare(model_dir, /* use_tensorrt */ true);
CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0)); }
CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
const size_t num_elements = outputs0.front().data.length() / sizeof(float); TEST(TensorRT_resnext50, compare) {
const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); std::string model_dir = FLAGS_infer_model + "/resnext50";
EXPECT_EQ(num_elements, num_elements1); compare(model_dir, /* use_tensorrt */ true);
}
auto *data0 = static_cast<float *>(outputs0.front().data.data()); TEST(TensorRT_resnext50, profile) {
auto *data1 = static_cast<float *>(outputs1.front().data.data()); std::string model_dir = FLAGS_infer_model + "/resnext50";
profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
}
ASSERT_GT(num_elements, 0UL); TEST(TensorRT_mobilenet, analysis) {
for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
EXPECT_NEAR(data0[i], data1[i], 1e-3); compare(model_dir, /* use_tensorrt */ false);
}
} }
} // namespace inference
} // namespace paddle } // namespace paddle
USE_PASS(tensorrt_subgraph_pass); USE_PASS(tensorrt_subgraph_pass);
...@@ -94,7 +94,8 @@ function(op_library TARGET) ...@@ -94,7 +94,8 @@ function(op_library TARGET)
# remove windows unsupported op, because windows has no nccl, no warpctc such ops. # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op"
"fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}") if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return() return()
endif() endif()
...@@ -299,7 +300,6 @@ if (NOT WIN32) ...@@ -299,7 +300,6 @@ if (NOT WIN32)
op_library(gru_op DEPS sequence2batch gru_compute) op_library(gru_op DEPS sequence2batch gru_compute)
endif(NOT WIN32) endif(NOT WIN32)
op_library(recurrent_op DEPS executor) op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor) op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor) op_library(parallel_do_op DEPS executor)
op_library(unsqueeze_op DEPS reshape_op) op_library(unsqueeze_op DEPS reshape_op)
...@@ -308,8 +308,10 @@ op_library(flatten_op DEPS reshape_op) ...@@ -308,8 +308,10 @@ op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding) op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op) op_library(unstack_op DEPS stack_op)
op_library(fake_quantize_op DEPS memory) op_library(fake_quantize_op DEPS memory)
if (NOT WIN32)
op_library(crf_decoding_op DEPS jit_kernel) op_library(crf_decoding_op DEPS jit_kernel)
op_library(fusion_lstm_op DEPS jit_kernel) op_library(fusion_lstm_op DEPS jit_kernel)
endif(NOT WIN32)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(conv_op DEPS vol2col depthwise_conv im2col)
op_library(layer_norm_op DEPS cub) op_library(layer_norm_op DEPS cub)
...@@ -325,8 +327,16 @@ op_library(save_op DEPS lod_tensor) ...@@ -325,8 +327,16 @@ op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor)
op_library(tensor_array_to_tensor_op DEPS concat_op)
op_library(concat_op DEPS concat_and_split) op_library(concat_op DEPS concat_and_split)
op_library(tensor_array_to_tensor_op DEPS concat_op)
set(DEPS_OPS ${DEPS_OPS} warpctc_op)
if (WITH_GPU)
if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
endif()
endif()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
......
...@@ -71,6 +71,10 @@ class MKLDNNActivationGradKernel ...@@ -71,6 +71,10 @@ class MKLDNNActivationGradKernel
diff_y->format() != memory::format::format_undef, diff_y->format() != memory::format::format_undef,
"Wrong layout/format set for Input OutGrad tensor"); "Wrong layout/format set for Input OutGrad tensor");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
Functor functor; Functor functor;
auto attrs = functor.GetAttrs(); auto attrs = functor.GetAttrs();
...@@ -115,11 +119,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx, ...@@ -115,11 +119,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
const std::string key_fwd = key_with_layout + "@eltwise_fwd"; const std::string key_fwd = key_with_layout + "@eltwise_fwd";
const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd"; const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd";
bool is_test = ctx.Attr<bool>("is_test");
// save input data and layout to be referred in backward path // save input data and layout to be referred in backward path
auto p_src_data = std::make_shared<const T *>(x_data); auto p_src_data = std::make_shared<const T *>(x_data);
dev_ctx.SetBlob(key_src_data, p_src_data);
auto p_src_layout = std::make_shared<memory::format>(src_format); auto p_src_layout = std::make_shared<memory::format>(src_format);
dev_ctx.SetBlob(key_src_layout, p_src_layout); if (!is_test) {
dev_ctx.SetBlob(key_src_data, p_src_data);
dev_ctx.SetBlob(key_src_layout, p_src_layout);
}
auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>( auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
dev_ctx.GetBlob(key_fwd)); dev_ctx.GetBlob(key_fwd));
...@@ -136,14 +144,17 @@ void eltwise_forward(const framework::ExecutionContext &ctx, ...@@ -136,14 +144,17 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
dev_ctx.SetBlob(key_src_mem, src_memory); dev_ctx.SetBlob(key_src_mem, src_memory);
// create primitive descriptor for activation forward and save it // create primitive descriptor for activation forward and save it
auto mkldnn_forward_prop_kind = is_test
? mkldnn::prop_kind::forward_inference
: mkldnn::prop_kind::forward_training;
auto forward_desc = mkldnn::eltwise_forward::desc( auto forward_desc = mkldnn::eltwise_forward::desc(
mkldnn::prop_kind::forward_training, algorithm, mkldnn_forward_prop_kind, algorithm,
src_memory->get_primitive_desc().desc(), alpha, beta); src_memory->get_primitive_desc().desc(), alpha, beta);
auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>( auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
forward_desc, mkldnn_engine); forward_desc, mkldnn_engine);
// save prim desc into global device context to be referred in backward path // save prim desc into global device context to be referred in backward path
dev_ctx.SetBlob(key_fwd_pd, forward_pd); if (!is_test) dev_ctx.SetBlob(key_fwd_pd, forward_pd);
// create mkldnn memory for output y // create mkldnn memory for output y
dst_memory = dst_memory =
......
...@@ -22,18 +22,23 @@ namespace operators { ...@@ -22,18 +22,23 @@ namespace operators {
using paddle::framework::Tensor; using paddle::framework::Tensor;
#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \
class OP_NAME##OpMaker \ class OP_NAME##OpMaker \
: public ::paddle::framework::OpProtoAndCheckerMaker { \ : public ::paddle::framework::OpProtoAndCheckerMaker { \
public: \ public: \
void Make() override { \ void Make() override { \
AddInput("X", "Input of " #OP_NAME " operator"); \ AddInput("X", "Input of " #OP_NAME " operator"); \
AddOutput("Out", "Output of " #OP_NAME " operator"); \ AddOutput("Out", "Output of " #OP_NAME " operator"); \
AddAttr<bool>("use_mkldnn", \ AddAttr<bool>("use_mkldnn", \
"(bool, default false) Only used in mkldnn kernel") \ "(bool, default false) Only used in mkldnn kernel") \
.SetDefault(false); \ .SetDefault(false); \
AddComment(#OP_COMMENT); \ AddAttr<bool>( \
} \ "is_test", \
"(bool, default false) Set to true for inference only, false " \
"for training. Some layers may run faster when this is true.") \
.SetDefault(false); \
AddComment(#OP_COMMENT); \
} \
} }
#define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE) \ #define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE) \
...@@ -269,7 +274,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -269,7 +274,7 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
:strong:`Softshrink Activation Operator` :strong:`Softshrink Activation Operator`
.. math:: .. math::
out = \begin{cases} out = \begin{cases}
x - \lambda, \text{if } x > \lambda \\ x - \lambda, \text{if } x > \lambda \\
x + \lambda, \text{if } x < -\lambda \\ x + \lambda, \text{if } x < -\lambda \\
0, \text{otherwise} 0, \text{otherwise}
...@@ -435,7 +440,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -435,7 +440,7 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
HardSigmoid Activation Operator. HardSigmoid Activation Operator.
Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391),
which is much faster than sigmoid. which is much faster than sigmoid.
$out = \max(0, \min(1, slope * x + shift))$ $out = \max(0, \min(1, slope * x + shift))$
......
...@@ -113,7 +113,10 @@ class BatchNormOp : public framework::OperatorWithKernel { ...@@ -113,7 +113,10 @@ class BatchNormOp : public framework::OperatorWithKernel {
class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<float>("momentum", "").SetDefault(0.9); AddAttr<float>("momentum", "").SetDefault(0.9);
AddAttr<float>("epsilon", "") AddAttr<float>("epsilon", "")
.SetDefault(1e-5) .SetDefault(1e-5)
......
...@@ -50,12 +50,18 @@ static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache"; ...@@ -50,12 +50,18 @@ static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
static_cast<size_t>(1024) * 1024 * 1024; static_cast<size_t>(1024) * 1024 * 1024;
static constexpr size_t kNUM_CUDNN_FWD_ALGS = #if CUDNN_VERSION_MIN(6, 0, 5)
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
#else
// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc.
static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
#endif
template <typename T> template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> { class CUDNNConvOpKernel : public framework::OpKernel<T> {
......
...@@ -383,20 +383,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -383,20 +383,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// create a conv primitive descriptor and save it for usage in backward // create a conv primitive descriptor and save it for usage in backward
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd; std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
: mkldnn::prop_kind::forward_training;
if (bias) { if (bias) {
bias_tz = paddle::framework::vectorize2int(bias->dims()); bias_tz = paddle::framework::vectorize2int(bias->dims());
auto bias_md = platform::MKLDNNMemDesc( auto bias_md = platform::MKLDNNMemDesc(
bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x); bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, conv_pd = ConvFwdPrimitiveDesc(
strides, paddings, mkldnn_engine, src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
fuse_relu, fuse_residual_conn); fuse_relu, fuse_residual_conn, fwd_prop_kind);
} else { } else {
conv_pd = conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, paddings, mkldnn_engine, fuse_relu,
mkldnn_engine, fuse_relu, fuse_residual_conn); fuse_residual_conn, fwd_prop_kind);
} }
// Save conv_pd/src_memory/weights_memory for backward pass // Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd); if (!is_test) dev_ctx.SetBlob(key_conv_pd, conv_pd);
ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
...@@ -510,14 +512,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -510,14 +512,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const memory::desc& dst, const std::vector<int>& strides, const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn) const { const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc( auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst,
dst, stride_dims, padding_dims, padding_dims, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, fuse_residual_conn); CreatePostOps(fuse_relu, fuse_residual_conn);
...@@ -535,14 +537,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -535,14 +537,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn) const { const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc( auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst,
bias, dst, stride_dims, padding_dims, padding_dims, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, fuse_residual_conn); CreatePostOps(fuse_relu, fuse_residual_conn);
...@@ -587,6 +589,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -587,6 +589,10 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
output_grad->format() != memory::format::format_undef, output_grad->format() != memory::format::format_undef,
"Wrong layout/format set for output_grad tensor"); "Wrong layout/format set for output_grad tensor");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
if (!input_grad && !filter_grad) return; if (!input_grad && !filter_grad) return;
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides"); std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
......
...@@ -109,7 +109,10 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -109,7 +109,10 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
} }
void Conv2DOpMaker::Make() { void Conv2DOpMaker::Make() {
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddInput( AddInput(
"Input", "Input",
"(Tensor) The input tensor of convolution operator. " "(Tensor) The input tensor of convolution operator. "
......
...@@ -15,6 +15,10 @@ limitations under the License. */ ...@@ -15,6 +15,10 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
using paddle::platform::float16;
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -49,7 +49,10 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -49,7 +49,10 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f, PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f,
"'dropout_prob' must be between 0.0 and 1.0."); "'dropout_prob' must be between 0.0 and 1.0.");
}); });
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddAttr<bool>("fix_seed", AddAttr<bool>("fix_seed",
"A flag indicating whether to use a fixed seed to generate " "A flag indicating whether to use a fixed seed to generate "
"random mask. NOTE: DO NOT set this flag to true in " "random mask. NOTE: DO NOT set this flag to true in "
......
...@@ -111,6 +111,17 @@ class RowwiseTransformIterator<T, platform::CPUDeviceContext> ...@@ -111,6 +111,17 @@ class RowwiseTransformIterator<T, platform::CPUDeviceContext>
return *this; return *this;
} }
RowwiseTransformIterator<T, platform::CPUDeviceContext> &operator+(int n) {
while (n-- > 0) {
++i_;
if (UNLIKELY(i_ == n_)) {
i_ = 0;
}
}
return *this;
}
bool operator==(const RowwiseTransformIterator<T, platform::CPUDeviceContext> bool operator==(const RowwiseTransformIterator<T, platform::CPUDeviceContext>
&rhs) const { &rhs) const {
return (ptr_ + i_) == &(*rhs); return (ptr_ + i_) == &(*rhs);
...@@ -149,6 +160,21 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext> ...@@ -149,6 +160,21 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext>
return *this; return *this;
} }
MidWiseTransformIterator<T, platform::CPUDeviceContext> &operator+(int n) {
while (n-- > 0) {
++j_;
if (UNLIKELY(j_ == post_)) {
++i_;
j_ = 0;
if (UNLIKELY(i_ == n_)) {
i_ = 0;
}
}
}
return *this;
}
bool operator==(const MidWiseTransformIterator<T, platform::CPUDeviceContext> bool operator==(const MidWiseTransformIterator<T, platform::CPUDeviceContext>
&rhs) const { &rhs) const {
return (ptr_ + i_) == &(*rhs); return (ptr_ + i_) == &(*rhs);
......
...@@ -47,6 +47,11 @@ class ExpandOp : public framework::OperatorWithKernel { ...@@ -47,6 +47,11 @@ class ExpandOp : public framework::OperatorWithKernel {
out_shape[i] = x_dims[i] * expand_times[i]; out_shape[i] = x_dims[i] * expand_times[i];
} }
// set the first dim to -1 in compile time
if (!ctx->IsRuntime()) {
out_shape[0] = x_dims[0];
}
ctx->SetOutputDim("Out", framework::make_ddim(out_shape)); ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
if (out_shape[0] == x_dims[0]) { if (out_shape[0] == x_dims[0]) {
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
...@@ -109,7 +114,16 @@ class ExpandGradOp : public framework::OperatorWithKernel { ...@@ -109,7 +114,16 @@ class ExpandGradOp : public framework::OperatorWithKernel {
ctx->Attrs().Get<std::vector<int>>("expand_times"); ctx->Attrs().Get<std::vector<int>>("expand_times");
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
for (size_t i = 0; i < expand_times.size(); ++i) { size_t start_pos = 0u;
if (!ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(
x_dims[0], out_dims[0],
"The first dimension size of Input(Out@GRAD) should be "
"equal to the crroresponding dimension size of Input(X)");
start_pos = 1u;
}
for (size_t i = start_pos; i < expand_times.size(); ++i) {
PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i], PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i],
"Each dimension size of Input(Out@GRAD) should be " "Each dimension size of Input(Out@GRAD) should be "
"equal to multiplication of crroresponding dimension " "equal to multiplication of crroresponding dimension "
......
...@@ -138,7 +138,7 @@ class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -138,7 +138,7 @@ class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
FakeQuantize operator FakeQuantize operator
$$scale = max(abs(X))$$ $$scale = max(abs(X))$$
$$range = 2^{bit_length - 1} - 1$$ $$range = 2^{bit_length - 1} - 1$$
$$Out = round(X/scale * range)$$ $$Out = round(X/scale * range)$$
...@@ -199,11 +199,14 @@ class FakeQuantizeRangeAbsMaxOpMaker ...@@ -199,11 +199,14 @@ class FakeQuantizeRangeAbsMaxOpMaker
PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
"'bit_length' should be between 1 and 16."); "'bit_length' should be between 1 and 16.");
}); });
AddAttr<bool>("is_test", "").SetDefault(false); AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
FakeQuantize operator is used in static quantization. FakeQuantize operator is used in static quantization.
$$scale = max(max(abs(x)), history_abs_max)$$ $$scale = max(max(abs(x)), history_abs_max)$$
$$range = 2^{bit_length - 1} - 1$$ $$range = 2^{bit_length - 1} - 1$$
$$Out = round(X/scale * range)$$ $$Out = round(X/scale * range)$$
......
...@@ -27,11 +27,9 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -27,11 +27,9 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
"Out(Output) of Fully Connected should not be null."); "Out(Output) of Fully Connected should not be null.");
PADDLE_ENFORCE(ctx->HasInput("W"), PADDLE_ENFORCE(ctx->HasInput("W"),
"W(Input) of Fully Connected should not be null."); "W(Input) of Fully Connected should not be null.");
// NCHW
auto in_dims = ctx->GetInputDim("Input"); auto in_dims = ctx->GetInputDim("Input");
// IO, I=C*H*W
auto w_dims = ctx->GetInputDim("W"); auto w_dims = ctx->GetInputDim("W");
std::vector<int64_t> output_shape({in_dims[0], w_dims[1]});
if (ctx->HasInput("Bias")) { if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias"); auto bias_dims = ctx->GetInputDim("Bias");
...@@ -44,14 +42,32 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -44,14 +42,32 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
"The shape of Bias must be [1, dim]."); "The shape of Bias must be [1, dim].");
} }
} }
PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
"Fully Connected input should be 2-D or 4-D tensor."); if (ctx->Attrs().Get<bool>("use_mkldnn")) {
PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
"Fully Connected input should be 2-D or 4-D tensor.");
}
PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, PADDLE_ENFORCE_EQ(w_dims.size(), 2UL,
"Fully Connected input should be 2-D tensor."); "Fully Connected input should be 2-D tensor.");
PADDLE_ENFORCE_EQ(framework::product(in_dims) / in_dims[0], w_dims[0], int in_num_col_dims = ctx->Attrs().Get<int>("in_num_col_dims");
"Fully Connected input and weigth size do not match."); PADDLE_ENFORCE_GT(
in_dims.size(), in_num_col_dims,
"The input tensor Input's rank of FCOp should be larger than "
"in_num_col_dims.");
auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
PADDLE_ENFORCE_EQ(
in_mat_dims[1], w_dims[0],
"Fully Connected input and weigth size do not match. %s, %s");
std::vector<int64_t> output_dims;
output_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
for (int i = 0; i < in_num_col_dims; ++i) {
output_dims.push_back(in_dims[i]);
}
output_dims.push_back(w_dims[1]);
ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
ctx->ShareLoD("Input", "Out"); ctx->ShareLoD("Input", "Out");
} }
...@@ -101,12 +117,15 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType( ...@@ -101,12 +117,15 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType(
} }
void FCOpMaker::Make() { void FCOpMaker::Make() {
AddInput("Input", AddInput("Input", "(Tensor), The input tensor of fully connected operator.");
"(Tensor), The input tensor of fully connected operator with format "
"(NCHW). ");
AddInput("W", "(Tensor), The weight fc op with shape (I, O)."); AddInput("W", "(Tensor), The weight fc op with shape (I, O).");
AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O") AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O")
.AsDispensable(); .AsDispensable();
AddAttr<int>("in_num_col_dims",
"(int, default 1), The fc op can take tensors with more than "
"two dimensions as its inputs.")
.SetDefault(1)
.EqualGreaterThan(1);
AddOutput("Out", "(Tensor) The output tensor of fully connected operator. "); AddOutput("Out", "(Tensor) The output tensor of fully connected operator. ");
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
...@@ -131,13 +150,15 @@ class FCOpKernel : public framework::OpKernel<T> { ...@@ -131,13 +150,15 @@ class FCOpKernel : public framework::OpKernel<T> {
auto output = ctx.Output<Tensor>("Out"); auto output = ctx.Output<Tensor>("Out");
auto in_dims = input->dims(); auto in_dims = input->dims();
auto w_dims = w->dims(); auto w_dims = w->dims();
auto out_dims = output->dims();
int M = framework::product(out_dims) / out_dims[out_dims.size() - 1];
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
const T* w_data = w->data<T>(); const T* w_data = w->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace()); T* output_data = output->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx); auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
math::FCCompute<platform::CPUDeviceContext, T>( math::FCCompute<platform::CPUDeviceContext, T>(
blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data, blas, M, w_dims[1], w_dims[0], input_data, w_data, output_data,
bias ? bias->data<T>() : NULL); bias ? bias->data<T>() : NULL);
// TODO(TJ): fuse act // TODO(TJ): fuse act
......
...@@ -63,7 +63,8 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx, ...@@ -63,7 +63,8 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
Tensor ones; Tensor ones;
ones.mutable_data<T>({n, h, w}, ctx.GetPlace()); ones.mutable_data<T>({n, h, w}, ctx.GetPlace());
auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0); auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0);
Tensor half_xmax, half_ymax; Tensor half_xmax;
Tensor half_ymax;
half_xmax.mutable_data<T>({n, h, w}, ctx.GetPlace()); half_xmax.mutable_data<T>({n, h, w}, ctx.GetPlace());
auto half_xmax_t = auto half_xmax_t =
EigenTensor<T, 3>::From(half_xmax).setConstant(0.5 * x_max); EigenTensor<T, 3>::From(half_xmax).setConstant(0.5 * x_max);
......
...@@ -38,7 +38,7 @@ class HashOp : public framework::OperatorWithKernel { ...@@ -38,7 +38,7 @@ class HashOp : public framework::OperatorWithKernel {
std::vector<int64_t> out_dims; std::vector<int64_t> out_dims;
out_dims.reserve(dims.size() + 1); out_dims.reserve(dims.size() + 1);
// copy all dims except the last one // copy all dims except the last one
for (size_t i = 0u; i != dims.size() - 1; ++i) { for (int i = 0u; i != dims.size() - 1; ++i) {
out_dims.emplace_back(dims[i]); out_dims.emplace_back(dims[i]);
} }
int num_hash = ctx->Attrs().Get<int>("num_hash"); int num_hash = ctx->Attrs().Get<int>("num_hash");
......
...@@ -46,7 +46,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> { ...@@ -46,7 +46,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> {
int pre_pad = (n - 1) / 2; int pre_pad = (n - 1) / 2;
// compute batches one by one // compute batches one by one
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
blas.VSQR(fea_size, idata + i * fea_size, sdata + pre_pad * img_size); blas.VSQUARE(fea_size, idata + i * fea_size, sdata + pre_pad * img_size);
// init the first channel of mid // init the first channel of mid
for (int c = 0; c < n; ++c) { for (int c = 0; c < n; ++c) {
blas.AXPY(img_size, alpha, sdata + c * img_size, mdata + i * fea_size); blas.AXPY(img_size, alpha, sdata + c * img_size, mdata + i * fea_size);
...@@ -229,8 +229,8 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -229,8 +229,8 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
"the input will be transformed automatically. ") "the input will be transformed automatically. ")
.SetDefault("AnyLayout"); .SetDefault("AnyLayout");
AddAttr<bool>("is_test", AddAttr<bool>("is_test",
"Turns on memory optimization that optimizes away " "(bool, default false) Set to true for inference only, false "
"unnecessary memory allocations. Used by MKLDNN.") "for training. Some layers may run faster when this is true.")
.SetDefault(false); .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
......
...@@ -75,12 +75,13 @@ if(WITH_GPU) ...@@ -75,12 +75,13 @@ if(WITH_GPU)
endif() endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
if (NOT WIN32)
set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
if(WITH_XBYAK) if(WITH_XBYAK)
list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
list(APPEND JIT_KERNEL_DEPS xbyak) list(APPEND JIT_KERNEL_DEPS xbyak)
endif() endif()
cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
endif (NOT WIN32)
...@@ -153,7 +153,7 @@ class Blas { ...@@ -153,7 +153,7 @@ class Blas {
void VEXP(int n, const T* x, T* y) const; void VEXP(int n, const T* x, T* y) const;
template <typename T> template <typename T>
void VSQR(int n, const T* x, T* y) const; void VSQUARE(int n, const T* x, T* y) const;
template <typename T> template <typename T>
void VPOW(int n, const T* x, T alpha, T* y) const; void VPOW(int n, const T* x, T alpha, T* y) const;
...@@ -245,8 +245,8 @@ class BlasT : private Blas<DeviceContext> { ...@@ -245,8 +245,8 @@ class BlasT : private Blas<DeviceContext> {
} }
template <typename... ARGS> template <typename... ARGS>
void VSQR(ARGS... args) const { void VSQUARE(ARGS... args) const {
Base()->template VSQR<T>(args...); Base()->template VSQUARE<T>(args...);
} }
template <typename... ARGS> template <typename... ARGS>
......
...@@ -105,7 +105,7 @@ struct CBlas<float> { ...@@ -105,7 +105,7 @@ struct CBlas<float> {
} }
template <typename... ARGS> template <typename... ARGS>
static void VSQR(ARGS... args) { static void VSQUARE(ARGS... args) {
platform::dynload::vsSqr(args...); platform::dynload::vsSqr(args...);
} }
...@@ -195,7 +195,7 @@ struct CBlas<double> { ...@@ -195,7 +195,7 @@ struct CBlas<double> {
} }
template <typename... ARGS> template <typename... ARGS>
static void VSQR(ARGS... args) { static void VSQUARE(ARGS... args) {
platform::dynload::vdSqr(args...); platform::dynload::vdSqr(args...);
} }
...@@ -262,7 +262,9 @@ struct CBlas<platform::float16> { ...@@ -262,7 +262,9 @@ struct CBlas<platform::float16> {
} }
static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
static void VSQR(...) { PADDLE_THROW("float16 VSQR not supported on CPU"); } static void VSQUARE(...) {
PADDLE_THROW("float16 VSQUARE not supported on CPU");
}
static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
...@@ -423,12 +425,12 @@ void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const { ...@@ -423,12 +425,12 @@ void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const {
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CPUDeviceContext>::VSQR(int n, const T *x, T *y) const { void Blas<platform::CPUDeviceContext>::VSQUARE(int n, const T *x, T *y) const {
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
CBlas<T>::VSQR(n, x, y); CBlas<T>::VSQUARE(n, x, y);
#else #else
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = std::sqrt(x[i]); y[i] = x[i] * x[i];
} }
#endif #endif
} }
......
...@@ -118,6 +118,39 @@ void VXXJitCode::generate() { ...@@ -118,6 +118,39 @@ void VXXJitCode::generate() {
ret(); ret();
} }
bool ReluJitCode::init(int d) { return MayIUse(avx); }
void ReluJitCode::generate() {
int offset = 0;
vxorps(ymm_zero, ymm_zero, ymm_zero);
for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) {
vmovups(ymm_src, ptr[param1 + offset]);
vmaxps(ymm_dst, ymm_zero, ymm_src);
vmovups(ptr[param2 + offset], ymm_dst);
offset += sizeof(float) * AVX_FLOAT_BLOCK;
}
int rest = num_ % AVX_FLOAT_BLOCK;
if (rest >= 4) {
vmovups(xmm_src, ptr[param1 + offset]);
vmaxps(xmm_dst, xmm_zero, xmm_src);
vmovups(ptr[param2 + offset], xmm_dst);
offset += sizeof(float) * 4;
rest -= 4;
}
if (rest >= 2) {
vmovups(xmm_src, ptr[param1 + offset]);
vmaxps(xmm_dst, xmm_zero, xmm_src);
vmovq(ptr[param2 + offset], xmm_dst);
offset += sizeof(float) * 2;
rest -= 2;
}
if (rest > 0) {
vmovups(xmm_src, ptr[param1 + offset]);
vmaxps(xmm_dst, xmm_zero, xmm_src);
vmovss(ptr[param2 + offset], xmm_dst);
}
ret();
}
} // namespace gen } // namespace gen
} // namespace jitkernel } // namespace jitkernel
} // namespace math } // namespace math
......
...@@ -85,6 +85,29 @@ class VXXJitCode : public JitCode { ...@@ -85,6 +85,29 @@ class VXXJitCode : public JitCode {
ymm_t ymm_zero = ymm_t(3); ymm_t ymm_zero = ymm_t(3);
}; };
class ReluJitCode : public JitCode {
public:
DECLARE_JIT_CODE(ReluJitCode);
explicit ReluJitCode(int d, size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), num_(d) {}
static bool init(int d);
void generate() override;
private:
int num_;
reg64_t param1{abi_param1};
reg64_t param2{abi_param2};
xmm_t xmm_zero = xmm_t(0);
xmm_t xmm_src = xmm_t(1);
xmm_t xmm_dst = xmm_t(1);
ymm_t ymm_zero = ymm_t(0);
ymm_t ymm_src = ymm_t(1);
ymm_t ymm_dst = ymm_t(1);
};
} // namespace gen } // namespace gen
} // namespace jitkernel } // namespace jitkernel
} // namespace math } // namespace math
......
...@@ -97,37 +97,38 @@ class VAddBiasKernel : public Kernel { ...@@ -97,37 +97,38 @@ class VAddBiasKernel : public Kernel {
template <typename T> template <typename T>
class VActKernel : public Kernel { class VActKernel : public Kernel {
public: public:
virtual void Compute(const T *x, T *y) const = 0; virtual void ComputeDeprecated(const T *x, T *y) const = 0;
}; };
template <typename T> template <typename T>
class VReluKernel : public VActKernel<T> { class VReluKernel : public VActKernel<T> {
public: public:
virtual void Compute(const T *x, T *y) const = 0; virtual void ComputeDeprecated(const T *x, T *y) const = 0;
void (*Compute)(const T *, T *, int);
}; };
template <typename T> template <typename T>
class VIdentityKernel : public VActKernel<T> { class VIdentityKernel : public VActKernel<T> {
public: public:
virtual void Compute(const T *x, T *y) const = 0; virtual void ComputeDeprecated(const T *x, T *y) const = 0;
}; };
template <typename T> template <typename T>
class VExpKernel : public VActKernel<T> { class VExpKernel : public VActKernel<T> {
public: public:
virtual void Compute(const T *x, T *y) const = 0; virtual void ComputeDeprecated(const T *x, T *y) const = 0;
}; };
template <typename T> template <typename T>
class VSigmoidKernel : public VActKernel<T> { class VSigmoidKernel : public VActKernel<T> {
public: public:
virtual void Compute(const T *x, T *y) const = 0; virtual void ComputeDeprecated(const T *x, T *y) const = 0;
}; };
template <typename T> template <typename T>
class VTanhKernel : public VActKernel<T> { class VTanhKernel : public VActKernel<T> {
public: public:
virtual void Compute(const T *x, T *y) const = 0; virtual void ComputeDeprecated(const T *x, T *y) const = 0;
}; };
template <typename T> template <typename T>
......
...@@ -71,6 +71,13 @@ void VAddBiasRefer(const T* a, const T* x, T* y, int n) { ...@@ -71,6 +71,13 @@ void VAddBiasRefer(const T* a, const T* x, T* y, int n) {
} }
} }
template <typename T>
void VReluRefer(const T* x, T* y, int n) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
template <typename T> template <typename T>
void VMulMKL(const T* x, const T* y, T* z, int n); void VMulMKL(const T* x, const T* y, T* z, int n);
...@@ -344,124 +351,60 @@ bool VAddBiasKernelImpl<float>::useJIT(int d) { ...@@ -344,124 +351,60 @@ bool VAddBiasKernelImpl<float>::useJIT(int d) {
} }
#endif #endif
#undef DECLARE_STATIC_FUNC
REGISTER_JITKERNEL(vmul, VMulKernel);
REGISTER_JITKERNEL(vadd, VAddKernel);
REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
REGISTER_JITKERNEL(vscal, VScalKernel);
REGISTER_JITKERNEL(vaddbias, VAddBiasKernel);
/* VRelu JitKernel */ /* VRelu JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block> template <typename T>
class VReluKernelImpl : public VReluKernel<T> { class VReluKernelImpl : public VReluKernel<T> {
public: public:
explicit VReluKernelImpl(int d) : VReluKernel<T>() { this->num_ = d; } DECLARE_STATIC_FUNC;
void Compute(const T* x, T* y) const override { explicit VReluKernelImpl(int d) : VReluKernel<T>() {
for (int i = 0; i < this->num_; ++i) { this->num_ = d; // TODO(TJ): remove me when ComputeDeprecated done
y[i] = x[i] > 0 ? x[i] : 0; #ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) {
size_t sz = 96 /*init*/ +
d / AVX_FLOAT_BLOCK * 4 /* instructions*/ *
8 /*everage byte for each instruction*/;
jitcode_.reset(new gen::ReluJitCode(d, sz > 4096 ? sz : 4096));
this->Compute = jitcode_->getCode<void (*)(const T*, T*, int)>();
return;
} }
} #endif
};
#define INTRI8_FLOAT(isa) \
template <> \
void VReluKernelImpl<float, isa, kEQ8>::Compute(const float* x, float* y) \
const { \
__m256 tmp = _mm256_loadu_ps(x); \
tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VReluKernelImpl<float, isa, kEQ16>::Compute(const float* x, float* y) \
const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \ this->Compute = VReluRefer<T>;
template <> \
VReluKernelImpl<float, isa, kGT8LT16>::VReluKernelImpl(int d) \
: VReluKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - AVX_FLOAT_BLOCK; \
} \
template <> \
void VReluKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + this->rest_); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + this->rest_, tmp1); \
} }
void ComputeDeprecated(const T* x, T* y) const override {
#define INTRI_GT16_FLOAT(isa) \ VReluRefer(x, y, this->num_);
template <> \
VReluKernelImpl<float, isa, kGT16>::VReluKernelImpl(int d) \
: VReluKernel<float>() { \
this->num_ = d; \
this->end_ = d - d % AVX_FLOAT_BLOCK; \
this->rest_ = d - AVX_FLOAT_BLOCK; \
} \
template <> \
void VReluKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \
const { \
__m256 zeros = _mm256_setzero_ps(); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + i, tmp); \
} \
__m256 tmp = _mm256_loadu_ps(x + this->rest_); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + this->rest_, tmp); \
} }
#ifdef PADDLE_WITH_XBYAK
#ifdef __AVX__ private:
INTRI8_FLOAT(jit::avx); std::unique_ptr<gen::ReluJitCode> jitcode_{nullptr};
INTRI16_FLOAT(jit::avx);
INTRI_GT8LT16_FLOAT(jit::avx);
INTRI_GT16_FLOAT(jit::avx);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
INTRI16_FLOAT(jit::avx2);
INTRI_GT8LT16_FLOAT(jit::avx2);
INTRI_GT16_FLOAT(jit::avx2);
#endif #endif
#ifdef __AVX512F__ };
// TODO(TJ): refine avx512
INTRI8_FLOAT(jit::avx512f); #ifdef PADDLE_WITH_XBYAK
INTRI16_FLOAT(jit::avx512f); template <>
INTRI_GT8LT16_FLOAT(jit::avx512f); bool VReluKernelImpl<float>::useJIT(int d) {
INTRI_GT16_FLOAT(jit::avx512f); return gen::ReluJitCode::init(d);
}
#endif #endif
#undef INTRI8_FLOAT #undef DECLARE_STATIC_FUNC
#undef INTRI16_FLOAT
#undef INTRI_GT8LT16_FLOAT REGISTER_JITKERNEL(vmul, VMulKernel);
#undef INTRI_GT16_FLOAT REGISTER_JITKERNEL(vadd, VAddKernel);
REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
REGISTER_JITKERNEL(vscal, VScalKernel);
REGISTER_JITKERNEL(vaddbias, VAddBiasKernel);
REGISTER_JITKERNEL(vrelu, VReluKernel);
/* An empty JitKernel */ /* An empty JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block> template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VIdentityKernelImpl : public VIdentityKernel<T> { class VIdentityKernelImpl : public VIdentityKernel<T> {
public: public:
explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() { this->num_ = d; } explicit VIdentityKernelImpl(int d) : VIdentityKernel<T>() { this->num_ = d; }
void Compute(const T* x, T* y) const override {} void ComputeDeprecated(const T* x, T* y) const override {}
}; };
REGISTER_JITKERNEL_DEPRECATED(vrelu, VReluKernel);
REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel); REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel);
} // namespace jitkernel } // namespace jitkernel
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册