diff --git a/AUTHORS.md b/AUTHORS.md index 41b7193677a0208ba2fa82b72862292572dcb6ef..4060f75613ac4dadf353ff53a73fd0647a8052be 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -43,6 +43,7 @@ | qingqing01 | Qing-Qing Dang | | reyoung | Yang Yu | | Superjom | Chun-Wei Yan | +| tensor-tang | Jian Tang | | tianbingsz | Tian-Bing Xu | | tpatejko | Tomasz Patejko | | typhoonzero | Yi Wu | diff --git a/CMakeLists.txt b/CMakeLists.txt index ed704585d8a6bf3befd9a549aa5a62a33fea3da9..996a79fbbc3005680205e9fc0442b6bc6199bebb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,11 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") if(WIN32) set(CMAKE_STATIC_LIBRARY_PREFIX lib) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") endif(WIN32) if(NOT CMAKE_CROSSCOMPILING) @@ -41,6 +46,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) +option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -65,6 +71,8 @@ option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." 
OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF) +option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF) +option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization." OFF) @@ -103,6 +111,8 @@ if(ANDROID OR IOS) "Disable RDMA when cross-compiling for Android and iOS" FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL when cross-compiling for Android and iOS" FORCE) + set(WITH_NGRAPH OFF CACHE STRING + "Disable nGraph when cross-compiling for Android and iOS" FORCE) set(WITH_GOLANG OFF CACHE STRING "Disable golang when cross-compiling for Android and iOS" FORCE) @@ -171,6 +181,7 @@ include(external/protobuf) # download, build, install protobuf include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas include(external/mkldnn) # download, build, install mkldnn +include(external/ngraph) # download, build, install nGraph include(external/swig) # download, build, install swig include(external/boost) # download boost include(external/any) # download libn::any diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index f507bb41a1103c093e9569176ee868cfaac6bf7b..964d5fd45b350db2e5948574f53a427e53484ff4 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -157,6 +157,9 @@ list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) if(NOT WITH_DSO) # TODO(panyx0718): CUPTI only allows DSO? 
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) + if(WIN32) + set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) + endif(WIN32) endif(NOT WITH_DSO) # setting nvcc arch flags @@ -196,10 +199,12 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) endif() else(NOT WIN32) -if(CMAKE_BUILD_TYPE STREQUAL "Release") +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS "-g -G") +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") else() - message(FATAL "Windows only support Release build now. Please set visual studio build type to Release, x64 build.") + message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") endif() endif(NOT WIN32) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index cd51533926de7bb132ab7bfab1686d664a331410..09bec347dbd569203103eccc7dbc0521c291bc0a 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -2,7 +2,12 @@ if(NOT WITH_GPU) return() endif() -set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") +if(WIN32) + set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) +else(WIN32) + set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") +endif(WIN32) + find_path(CUDNN_INCLUDE_DIR cudnn.h PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index 84354c446e2f54fa13b90fa37221eed90968b251..06fc6061bc98eec8c4c71860333f7d3456952aeb 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -58,19 +58,21 @@ ExternalProject_Add( -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER} + -DBUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN} + 
-DBUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} ) message(STATUS "Anakin for inference is enabled") message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") - +add_dependencies(extern_anakin protobuf mklml) add_library(anakin_shared SHARED IMPORTED GLOBAL) set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB}) -add_dependencies(anakin_shared extern_anakin protobuf mklml) +add_dependencies(anakin_shared extern_anakin) add_library(anakin_saber SHARED IMPORTED GLOBAL) set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB}) -add_dependencies(anakin_saber extern_anakin protobuf mklml) +add_dependencies(anakin_saber extern_anakin) list(APPEND external_project_dependencies anakin_shared anakin_saber) diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index ada61de8eb15ae10288ac54f588e9adf84acee37..5a78a1d1b7dea0d95ae3fa2c9f39679899dd1bcb 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -28,34 +28,28 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL)) set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) endif() -IF (WIN32) - MESSAGE(WARNING, "In windows, boost can not be downloaded automaticlly, please build it manually and put it at " ${THIRD_PARTY_PATH}install/boost) -else() - MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") -ENDIF(WIN32) + +MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") -set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." 
FORCE) -set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) +set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE) +set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) include_directories(${BOOST_INCLUDE_DIR}) -if (NOT WIN32) ExternalProject_Add( ${BOOST_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz - && tar zxf ${BOOST_TAR}.tar.gz + URL ${BOOST_URL} DOWNLOAD_NO_PROGRESS 1 PREFIX ${BOOST_SOURCES_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" UPDATE_COMMAND "" -) -endif(NOT WIN32) + ) if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index cf58cc39762351f8b37d073bcd218d249285bf52..4e98e4bf889bc13938931be7f6cb204c83250a5c 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -35,7 +35,12 @@ ExternalProject_Add( CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DBUILD_STATIC_LIBS=ON -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_TESTING=OFF @@ -48,8 +53,8 @@ ExternalProject_Add( IF(WIN32) IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib") add_custom_command(TARGET extern_gflags POST_BUILD - COMMAND cmake -E rename ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib - ) + COMMAND cmake -E copy ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib + ) ENDIF() ENDIF(WIN32) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) diff 
--git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 25ef2970ac52f12f961c9c6d3a589fec4c80983f..8cd0455c16bf84909b735102e7fb1089744c4245 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -46,7 +46,11 @@ ExternalProject_Add( CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE=ON @@ -63,7 +67,7 @@ ExternalProject_Add( IF(WIN32) IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib") add_custom_command(TARGET extern_glog POST_BUILD - COMMAND cmake -E rename ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib + COMMAND cmake -E copy ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib ) ENDIF() ENDIF(WIN32) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 9fea9ca05bce921b105a0f092c321b0c3a55c63c..785148d4f9f44032e2ce5bf93f0dc80fc865808b 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -37,7 +37,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers. 
-INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h IF(${CBLAS_PROVIDER} STREQUAL "MKLML") SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake new file mode 100644 index 0000000000000000000000000000000000000000..2e335579f32df4f146c8d88e05e684a9a8105e20 --- /dev/null +++ b/cmake/external/ngraph.cmake @@ -0,0 +1,92 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library(ngraph INTERFACE) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with nGraph in Paddle yet." + "Force WITH_NGRAPH=OFF") + SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph in Windows and MacOS" FORCE) +ENDIF() + +IF(${WITH_NGRAPH} AND NOT ${WITH_MKLDNN}) + MESSAGE(WARNING + "nGraph needs mkl-dnn to be enabled." 
+ "Force WITH_NGRAPH=OFF") + SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph if mkl-dnn is disabled" FORCE) +ENDIF() + +IF(NOT ${WITH_NGRAPH}) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(NGRAPH_PROJECT "extern_ngraph") +SET(NGRAPH_VERSION "0.9") +SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0") +SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) +SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) +SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) +SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) +SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) +SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) +SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") + +ExternalProject_Add( + ${NGRAPH_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT} + GIT_REPOSITORY ${NGRAPH_GIT_REPO} + GIT_TAG ${NGRAPH_GIT_TAG} + PREFIX ${NGRAPH_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR} + CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} + CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib +) + +if(UNIX AND NOT APPLE) + include(GNUInstallDirs) + SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) +else() + SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib) +endif() +MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}") + +SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) +SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) +SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) + +# Workaround for nGraph expecting mklml to be in mkldnn install directory. 
+ExternalProject_Add_Step( + ${NGRAPH_PROJECT} + PrepareMKL + COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so + COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so + DEPENDEES download + DEPENDERS configure +) + +add_dependencies(ngraph ${NGRAPH_PROJECT}) +target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH) +target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR}) +target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB}) +LIST(APPEND external_project_dependencies ngraph) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 755dbd610c40c2d9b85d3017b6f000a869b0f39a..aeb976b840e999a20e8cab11939cbb1f49a27850 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -17,12 +17,8 @@ IF(USE_EIGEN_FOR_BLAS) ENDIF(USE_EIGEN_FOR_BLAS) INCLUDE(cblas) -# IF(WIN32 AND NOT ${CBLAS_FOUND}) - - IF(NOT ${CBLAS_FOUND}) - INCLUDE(ExternalProject) SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas) @@ -34,6 +30,7 @@ IF(NOT ${CBLAS_FOUND}) CACHE FILEPATH "openblas library." 
FORCE) ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS) + IF (WIN32) SET(CBLAS_FOUND true) MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 550b0dada8e90c1e2b33705fd53c065672113b45..e1e619e572b05e83fbe751af2e5391aafc494416 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -30,66 +30,61 @@ UNSET_VAR(PROTOBUF_LITE_LIBRARY) UNSET_VAR(PROTOBUF_LIBRARY) UNSET_VAR(PROTOBUF_INCLUDE_DIR) UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) +function(protobuf_generate_python SRCS) + # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() -if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. - function(protobuf_generate_python SRCS) - # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake - if(NOT ARGN) - message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") - return() - endif() - - if(PROTOBUF_GENERATE_CPP_APPEND_PATH) - # Create an include path for each file specified - foreach(FIL ${ARGN}) - get_filename_component(ABS_FIL ${FIL} ABSOLUTE) - get_filename_component(ABS_PATH ${ABS_FIL} PATH) - list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${ABS_PATH}) - endif() - endforeach() - else() - set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) - endif() - - if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) - set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") - endif() - - if(DEFINED Protobuf_IMPORT_DIRS) - foreach(DIR ${Protobuf_IMPORT_DIRS}) - get_filename_component(ABS_PATH ${DIR} ABSOLUTE) - list(FIND 
_protobuf_include_path ${ABS_PATH} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${ABS_PATH}) - endif() - endforeach() - endif() - - set(${SRCS}) + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified foreach(FIL ${ARGN}) get_filename_component(ABS_FIL ${FIL} ABSOLUTE) - get_filename_component(FIL_WE ${FIL} NAME_WE) - if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) - get_filename_component(FIL_DIR ${FIL} DIRECTORY) - if(FIL_DIR) - set(FIL_WE "${FIL_DIR}/${FIL_WE}") - endif() + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() - list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" - COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} - DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} - COMMENT "Running Python protocol buffer compiler on ${FIL}" - VERBATIM ) + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() endforeach() + endif() - set(${SRCS} ${${SRCS}} PARENT_SCOPE) - endfunction() -endif() + set(${SRCS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} 
DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE} + COMMENT "Running Python protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set(${SRCS} ${${SRCS}} PARENT_SCOPE) +endfunction() # Print and set the protobuf library information, # finish this cmake process and exit from this file. @@ -126,6 +121,7 @@ macro(PROMPT_PROTOBUF_LIB) # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. # make `protobuf_generate_cpp` happy. SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) + FOREACH(dep ${protobuf_DEPS}) ADD_DEPENDENCIES(protobuf ${dep}) ADD_DEPENDENCIES(protobuf_lite ${dep}) @@ -144,7 +140,6 @@ endmacro() set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") IF (WIN32) SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf) - MESSAGE(WARNING, "In windows, protobuf only support msvc build, please build it manually and put it at " ${PROTOBUF_ROOT}) ENDIF(WIN32) if (NOT "${PROTOBUF_ROOT}" STREQUAL "") @@ -192,13 +187,20 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" "-Dprotobuf_WITH_ZLIB=ON" "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" ${EXTERNAL_OPTIONAL_ARGS}) SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") ENDIF() + IF(WIN32) + SET(OPTIONAL_ARGS 
${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") + ENDIF() SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index f17b8d46dc2d8ded81ced7de5827d5e7fd5109f0..a3599dd798c07f57ed82e3f25b6bb9fc4f8bdc3a 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -21,6 +21,48 @@ INCLUDE(python_module) FIND_PACKAGE(PythonInterp ${PY_VERSION}) FIND_PACKAGE(PythonLibs ${PY_VERSION}) +if(WIN32) + execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" +"from distutils import sysconfig as s;import sys;import struct; +print(sys.prefix); +print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); +" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE _PYTHON_VALUES + ERROR_VARIABLE _PYTHON_ERROR_VALUE) + + if(NOT _PYTHON_SUCCESS MATCHES 0) + set(PYTHONLIBS_FOUND FALSE) + return() + endif() + + # Convert the process output into a list + string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES}) + string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) + list(GET _PYTHON_VALUES 0 PYTHON_PREFIX) + list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX) + + # Make sure all directory separators are '/' + string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) + + set(PYTHON_LIBRARY + "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + + # when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the + # original python installation. They may be found relative to PYTHON_INCLUDE_DIR. + if(NOT EXISTS "${PYTHON_LIBRARY}") + get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY) + set(PYTHON_LIBRARY + "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + endif() + + # raise an error if the python libs are still not found. 
+ if(NOT EXISTS "${PYTHON_LIBRARY}") + message(FATAL_ERROR "Python libraries not found") + endif() + SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}") +endif(WIN32) + # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. ADD_LIBRARY(python SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index c227e09719bd5f0e825f81fb96f78105aa10c79b..4c2d64f627401071098e72bfb930fb5d62fa042d 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -14,23 +14,52 @@ ELSE() ENDIF(APPLE) ENDIF() -ExternalProject_Add( - extern_xxhash - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" - GIT_TAG "v0.6.5" - PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - PATCH_COMMAND - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install - TEST_COMMAND "" -) +if(WIN32) + ExternalProject_Add( + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" + GIT_TAG "v0.6.5" + PREFIX ${XXHASH_SOURCE_DIR} + DOWNLOAD_NAME "xxhash" + UPDATE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/src/extern_xxhash/cmake_unofficial + -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DBUILD_XXHSUM=OFF + -DCMAKE_GENERATOR_PLATFORM=x64 + -DBUILD_SHARED_LIBS=OFF + ${OPTIONAL_CACHE_ARGS} + TEST_COMMAND "" + ) +else() + ExternalProject_Add( + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" + GIT_TAG "v0.6.5" + PREFIX ${XXHASH_SOURCE_DIR} + DOWNLOAD_NAME "xxhash" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND + BUILD_COMMAND 
${BUILD_CMD} + INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install + TEST_COMMAND "" + ) +endif() -set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") +if (WIN32) + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") +else() + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") +endif () INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) add_library(xxhash STATIC IMPORTED GLOBAL) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 62227c67849dbb476339a176e0c98e295cbf529c..e21f89c7c585053631391852522d47cd7ffa7638 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -266,7 +266,11 @@ function(cc_library TARGET_NAME) if("${cc_library_DEPS};" MATCHES "python;") list(REMOVE_ITEM cc_library_DEPS python) add_dependencies(${TARGET_NAME} python) - target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") + if(WIN32) + target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) + else() + target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") + endif(WIN32) endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) @@ -288,6 +292,45 @@ function(cc_library TARGET_NAME) endif(cc_library_SRCS) endfunction(cc_library) +# The link operation under windows may exceeds the maximum characters limit, simply break the link command +# into multiple link opeartion can fix that, say +# original: +# lib /out:target.lib a.lib b.lib c.lib d.lib +# after: +# 1. lib /out:dummy_lib_1.lib a.lib b.lib +# 2. lib /out:dummy_lib_2.lib c.lib d.lib +# 1. 
lib /out:target.lib dummy_lib_1.lib dummy_lib_2.lib +function(sep_library TARGET_NAME) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(dummy_index 1) + set(dummy_offset 1) + # the dummy target would be consisted of limit size libraries + set(dummy_limit 50) + list(LENGTH sep_library_DEPS sep_all_len) + foreach(v ${sep_library_DEPS}) + list(APPEND dummy_list ${v}) + list(LENGTH dummy_list listlen ) + if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len})) + message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}") + cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} STATIC DEPS ${dummy_list}) + foreach(i ${dummy_list}) + list(REMOVE_AT dummy_list 0) + endforeach() + list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_lib_${dummy_index}) + MATH(EXPR dummy_index "${dummy_index}+1") + endif() + MATH(EXPR dummy_offset "${dummy_offset}+1") + endforeach() + if(${sep_library_SHARED}) + cc_library(${TARGET_NAME} SHARED SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list}) + else(${sep_library_SHARED}) + cc_library(${TARGET_NAME} STATIC SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list}) + endif(${sep_library_SHARED}) +endfunction(sep_library) + function(cc_binary TARGET_NAME) set(options "") set(oneValueArgs "") diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index efdb093a7b28e19f3b2a774dd54f2e7f042e9ca7..729bdcb3dc5324df0a5272402ef203012be0072a 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -22,175 +22,196 @@ function(copy TARGET) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) - if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) + if (NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) message(FATAL_ERROR "${TARGET} source numbers are not equal to 
destination numbers") - endif() + endif () math(EXPR len "${copy_lib_SRCS_len} - 1") add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS}) - foreach(index RANGE ${len}) + foreach (index RANGE ${len}) list(GET copy_lib_SRCS ${index} src) list(GET copy_lib_DSTS ${index} dst) - add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND mkdir -p "${dst}" - COMMAND cp -r "${src}" "${dst}" - COMMENT "copying ${src} -> ${dst}") - endforeach() + if (WIN32) + # windows cmd shell will not expand wildcard automatically. + # below expand the files,libs and copy them by rules. + file(GLOB header_files ${src} "*.h") + file(GLOB static_lib_files ${src} "*.lib") + file(GLOB dll_lib_files ${src} "*.dll") + set(src_files ${header_files} ${static_lib_files} ${dll_lib_files}) + + if (NOT "${src_files}" STREQUAL "") + list(REMOVE_DUPLICATES src_files) + endif () + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + foreach (src_file ${src_files}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" + COMMENT "copying ${src_file} -> ${dst}") + endforeach () + else (WIN32) # not windows + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND mkdir -p "${dst}" + COMMAND cp -r "${src}" "${dst}" + COMMENT "copying ${src} -> ${dst}") + endif (WIN32) # not windows + endforeach () endfunction() # third party set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3") copy(eigen3_lib - SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen - DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported - DEPS eigen3 -) + SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen + DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported + DEPS eigen3 + ) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags") copy(gflags_lib - SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} - DSTS 
${dst_dir} ${dst_dir}/lib - DEPS gflags -) + SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS gflags + ) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog") copy(glog_lib - SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS glog -) + SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS glog + ) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/") copy(boost_lib - SRCS ${BOOST_INCLUDE_DIR}/boost - DSTS ${dst_dir} - DEPS boost -) + SRCS ${BOOST_INCLUDE_DIR}/boost + DSTS ${dst_dir} + DEPS boost + ) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash") copy(xxhash_lib - SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS xxhash -) + SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS xxhash + ) -if(NOT PROTOBUF_FOUND) +if (NOT PROTOBUF_FOUND) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") copy(protobuf_lib - SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS extern_protobuf - ) -endif() + SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS extern_protobuf + ) +endif () -if(NOT CBLAS_FOUND) +if (NOT CBLAS_FOUND) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") copy(openblas_lib - SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include - DSTS ${dst_dir} ${dst_dir} - DEPS extern_openblas - ) + SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include + DSTS ${dst_dir} ${dst_dir} + DEPS extern_openblas + ) elseif (WITH_MKLML) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml") copy(mklml_lib - SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} - DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} - DEPS mklml - ) -endif() - -if(WITH_MKLDNN) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") - copy(mkldnn_lib - SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} - 
DSTS ${dst_dir} ${dst_dir}/lib - DEPS mkldnn - ) -endif() + SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} + DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} + DEPS mklml + ) +endif () + +if (WITH_MKLDNN) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") + copy(mkldnn_lib + SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS mkldnn + ) +endif () if (NOT WIN32) -if(NOT MOBILE_INFERENCE AND NOT RPI) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") - copy(snappy_lib - SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappy) - - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") - copy(snappystream_lib - SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappystream) - - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") - copy(zlib_lib - SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS zlib) -endif() -endif(NOT WIN32) + if (NOT MOBILE_INFERENCE AND NOT RPI) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") + copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappy) + + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") + copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappystream) + + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") + copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS zlib) + endif () +endif (NOT WIN32) # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") set(module "framework") if (NOT WIN32) -set(framework_lib_deps framework_py_proto) -endif(NOT WIN32) + set(framework_lib_deps framework_py_proto) +endif (NOT WIN32) copy(framework_lib DEPS 
${framework_lib_deps} - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h - ${src_dir}/${module}/ir/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir -) + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h + ${src_dir}/${module}/ir/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir + ) set(module "memory") copy(memory_lib - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail -) + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail + ) set(inference_deps paddle_fluid_shared paddle_fluid) set(module "inference/api") if (WITH_ANAKIN AND WITH_MKL) copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api - SRCS - ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api - ${ANAKIN_INSTALL_DIR} # anakin release - DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin) - list(APPEND inference_deps anakin_inference_lib) -endif() + SRCS + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api + ${ANAKIN_INSTALL_DIR} # anakin release + DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin) + list(APPEND inference_deps anakin_inference_lib) +endif () set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* - ${src_dir}/${module}/api/paddle_inference_api.h + ${src_dir}/${module}/api/paddle_*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h DSTS ${dst_dir}/${module} ${dst_dir}/${module} 
${dst_dir}/${module} ${dst_dir}/${module} -) + ) set(module "platform") copy(platform_lib DEPS profiler_py_proto - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details -) + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details + ) set(module "string") copy(string_lib - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat -) + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat + ) set(module "pybind") copy(pybind_lib - SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h - DSTS ${dst_dir}/${module} -) + SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h + DSTS ${dst_dir}/${module} + ) # CMakeCache Info copy(cmake_cache - SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt - DSTS ${FLUID_INSTALL_DIR}) + SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt + DSTS ${FLUID_INSTALL_DIR}) # This command generates a complete fluid library for both train and inference add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) @@ -198,14 +219,14 @@ add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) # Following commands generate a inference-only fluid library # third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR} copy(third_party DEPS fluid_lib_dist - SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt - DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR} -) + SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt + DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR} + ) -# only need libpaddle_fluid.so/a and paddle_inference_api.h for 
inference-only library +# only need libpaddle_fluid.so/a and paddle_*.h for inference-only library copy(inference_api_lib DEPS fluid_lib_dist SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.* - ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_inference_api.h + ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ) @@ -213,20 +234,20 @@ add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib) # paddle fluid version function(version version_file) - execute_process( - COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} - OUTPUT_VARIABLE PADDLE_GIT_COMMIT) - file(WRITE ${version_file} - "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" - "WITH_MKL: ${WITH_MKL}\n" - "WITH_MKLDNN: ${WITH_MKLDNN}\n" - "WITH_GPU: ${WITH_GPU}\n") - if(WITH_GPU) - file(APPEND ${version_file} - "CUDA version: ${CUDA_VERSION}\n" - "CUDNN version: v${CUDNN_MAJOR_VERSION}\n") - endif() + execute_process( + COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_GIT_COMMIT) + file(WRITE ${version_file} + "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" + "WITH_MKL: ${WITH_MKL}\n" + "WITH_MKLDNN: ${WITH_MKLDNN}\n" + "WITH_GPU: ${WITH_GPU}\n") + if (WITH_GPU) + file(APPEND ${version_file} + "CUDA version: ${CUDA_VERSION}\n" + "CUDNN version: v${CUDNN_MAJOR_VERSION}\n") + endif () endfunction() version(${FLUID_INSTALL_DIR}/version.txt) version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt) diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index fa0e834a1dfd6e60f0ec07945be9a4d84017316f..3dc7171551bfb7aff8d1e75083c98b00378d247f 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -34,4 +34,5 @@ if(TENSORRT_FOUND) "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. 
") include_directories(${TENSORRT_INCLUDE_DIR}) list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY}) + add_definitions(-DPADDLE_WITH_TENSORRT) endif() diff --git a/doc/v2/dev/contribute_to_paddle_en.md b/doc/v2/dev/contribute_to_paddle_en.md index c97564d93a7f0a753a23cd97d2467d595bd154ff..72723396444c0a6cc0516f6f2379b2d868ba59f7 120000 --- a/doc/v2/dev/contribute_to_paddle_en.md +++ b/doc/v2/dev/contribute_to_paddle_en.md @@ -1 +1 @@ -../../../CONTRIBUTING.md \ No newline at end of file +../../../CONTRIBUTING.md diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index a155c7052be4f07b568af7954f270ea13bd70164..3378d210cdf6a625e11b1dd5fe348aa04cdb9361 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -103,7 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) -paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False)) +paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False)) paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) 
paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) @@ -184,6 +184,7 @@ paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'] paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -273,6 +274,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, 
False)) +paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None)) paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)) paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 7d48f0057140cf021a21ea7e304b7e38cc8b9ec2..abadda3adb00e1f41e90e07aa5e961134e69ae3d 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -4,11 +4,12 @@ add_subdirectory(framework) add_subdirectory(operators) add_subdirectory(string) -if (NOT WIN32) add_subdirectory(pybind) +if (NOT WIN32) add_subdirectory(recordio) endif(NOT WIN32) # NOTE: please add subdirectory inference at last. 
add_subdirectory(inference) + add_subdirectory(train) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 844291140602a7a0aac9d9d40256deaf9d8a4c60..cb9057672cc2c29af21b662edc189004bb0a4866 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -136,20 +136,32 @@ cc_library(version SRCS version.cc) cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) +cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto) +if(NOT WIN32) +cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog + shape_inference data_transform lod_tensor profiler) +endif(NOT WIN32) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) -if (NOT WIN32) py_proto_compile(framework_py_proto SRCS framework.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) -add_custom_command(TARGET framework_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ - COMMENT "Copy generated python proto into directory paddle/fluid/proto." - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +if (NOT WIN32) + add_custom_command(TARGET framework_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto + COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ + COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +else(NOT WIN32) + string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/") + add_custom_command(TARGET framework_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto + COMMAND copy /Y *.py ${proto_dstpath} + COMMENT "Copy generated python proto into directory paddle/fluid/proto." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) @@ -163,10 +175,14 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + if(NOT WIN32) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator) + else(NOT WIN32) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + endif(NOT WIN32) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() - + if (NOT WIN32) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 48f94a1f05614d4b797562ac67cdb9828fd0456e..37202f869508c283e1b464942cadc0ebe3eef39c 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -79,9 +79,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { 
BuildStrategy strategy_; }; -std::shared_ptr BuildStrategy::CreatePassesFromStrategy() - const { +std::shared_ptr BuildStrategy::CreatePassesFromStrategy( + bool finalize_strategy) const { + if (is_finalized_) { + return pass_builder_; + } pass_builder_.reset(new ParallelExecutorPassBuilder(*this)); + if (finalize_strategy) { + is_finalized_ = true; + } return pass_builder_; } @@ -95,10 +101,8 @@ std::unique_ptr BuildStrategy::Apply( #else const bool use_cuda) const { #endif - // Create a default one if not initialized by user. - if (!pass_builder_) { - CreatePassesFromStrategy(); - } + // Create a default one if not finalized by user. + CreatePassesFromStrategy(false); std::unique_ptr graph(new ir::Graph(main_program)); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 6c7b54db8f610aa34cd51dcbc13063290cae3ac0..fc2641dbd48274b43db0b1f156e3e1128f96772e 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -75,12 +75,20 @@ struct BuildStrategy { bool remove_unnecessary_lock_{false}; + // NOTE: + // Before you add new options, think if it's a general strategy that works + // with other strategy. If not, the strategy should be created through + // CreatePassesFromStrategy and the pass can be managed separately. + // User normally doesn't need to call this API. // The PassBuilder allows for more customized insert, remove of passes // from python side. // A new PassBuilder is created based on configs defined above and // passes are owned by the PassBuilder. - std::shared_ptr CreatePassesFromStrategy() const; + std::shared_ptr CreatePassesFromStrategy( + bool finalize_strategy) const; + + bool IsFinalized() const { return is_finalized_; } // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. 
@@ -97,6 +105,7 @@ struct BuildStrategy { #endif private: + mutable bool is_finalized_ = false; mutable std::shared_ptr pass_builder_; }; diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index ca11c0083961f9b3d04e33113a0d685d508918f9..949510e03705a4a0900f1c7b8758a8f7308aa44b 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -30,8 +30,8 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( local_scopes_(local_scopes), places_(places), graph_(std::move(graph)), - pool_(strategy.num_threads_ + - 1), // add one more thread for generate op_deps + pool_(strategy.num_threads_), + prepare_pool_(1), // add one more thread for generate op_deps fetch_ctxs_(places) { for (auto &op : ir::FilterByNodeWrapper(*graph_)) { int dep = static_cast(op->NotReadyInputSize()); @@ -160,7 +160,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( }); } void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { - atomic_op_deps_ = pool_.enqueue([&] { + atomic_op_deps_ = prepare_pool_.enqueue([&] { auto *op_deps = new std::unordered_map>; for (auto &pair : op_deps_) { (*op_deps)[pair.first] = pair.second; diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index 8b8382447105c8caa36963214684d6ee9fa15200..949616f02d5168e6abab932d608e4b20ee64304a 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -46,6 +46,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { std::vector bootstrap_ops_; ::ThreadPool pool_; + ::ThreadPool prepare_pool_; platform::DeviceContextPool fetch_ctxs_; std::atomic remaining_; diff --git a/paddle/fluid/framework/executor.cc 
b/paddle/fluid/framework/executor.cc index d97f03660c70ed19ee27fd9f8096a4328b33a892..7ce08b728d9436c3b6e678faf328ddf1c45b7080 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/ngraph_operator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/operators/detail/macros.h" @@ -25,6 +26,7 @@ limitations under the License. */ DECLARE_bool(benchmark); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); +DEFINE_bool(use_ngraph, false, "Use NGRAPH to run"); namespace paddle { namespace framework { @@ -81,6 +83,24 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, } } +static void EnableFusedOp(ExecutorPrepareContext* ctx) { +#ifdef PADDLE_WITH_NGRAPH + VLOG(3) << "use_ngraph=True"; + auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_); + for (auto& interval : intervals) { + auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_, + interval.at(0), interval.at(1)); + *interval[0] = std::unique_ptr(fused_op); + } + for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { + ctx->ops_.erase(it->at(0) + 1, it->at(1)); + } +#else + LOG(WARNING) + << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; +#endif +} + Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -338,6 +358,7 @@ std::unique_ptr Executor::Prepare( for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } + if (FLAGS_use_ngraph) EnableFusedOp(ctx.get()); return ctx; } @@ -359,6 +380,7 @@ std::vector> Executor::Prepare( void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope, 
bool create_vars, bool keep_kids) { + PADDLE_ENFORCE_NOT_NULL(scope); Scope* local_scope = scope; if (create_vars) { if (create_local_scope) { @@ -473,6 +495,5 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) { << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; #endif } - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index b403252c972d26da6deeca54ce88a9547ffe7afa..818b3334ea4171fd7a9cbaa896ee1672e8ecca51 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -29,7 +29,7 @@ template class GarbageCollector { public: GarbageCollector(const platform::Place &place, size_t max_memory_size) - : max_memory_size_(std::max(max_memory_size, static_cast(1))) { + : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { garbages_.reset(new std::deque()); dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 4cf973253cc4f1f22d2fc578a1ac3a8c95e479c9..504f7e6d6c13d6c40d72a53e52fec920457f2dae 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -5,6 +5,7 @@ file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") # Usage: pass_library(target inference) will append to paddle_inference_pass.h +unset(INFER_IR_PASSES CACHE) # clear the global variable function(pass_library TARGET DEST) set(options "") set(oneValueArgs "") @@ -15,10 +16,11 @@ function(pass_library TARGET DEST) if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference") message(STATUS "add pass ${TARGET} ${DEST}") file(APPEND ${pass_file} "USE_PASS(${TARGET});\n") - set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE) + set(INFER_IR_PASSES ${INFER_IR_PASSES} ${TARGET} CACHE INTERNAL "") endif() endfunction() + cc_library(node SRCS node.cc 
DEPS proto_desc) cc_library(graph SRCS graph.cc DEPS node pretty_log) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 6b284b1c1a4a37803229f4d55b100ca1da3a741d..c436dd414d01ab61d143427fe7ecd34a82f11f8d 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -91,10 +91,10 @@ void FindWhileOp(Graph* graph) { #undef OP_SET_IN #undef OP_SET_OUT - auto* X = graph->RetriveNode(34); - auto* LSTMOUT = graph->RetriveNode(81); - auto* cell_init = graph->RetriveNode(6); - auto* hidden_init = graph->RetriveNode(8); + auto* X = graph->RetrieveNode(34); + auto* LSTMOUT = graph->RetrieveNode(81); + auto* cell_init = graph->RetrieveNode(6); + auto* hidden_init = graph->RetrieveNode(8); auto* lstm_op = graph->CreateOpNode(&op_desc); PrepareParameters(graph, param); @@ -211,12 +211,12 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, VLOG(30) << "LSTMWeight resized to " << out->dims(); float* out_data = out->mutable_data(platform::CPUPlace()); - std::array tensors( - {{W_forget_w0.data(), W_input_w0.data(), - W_output_w0.data(), W_cell_w0.data()}}); - std::array tensors1( - {{W_forget_w1.data(), W_input_w1.data(), - W_output_w1.data(), W_cell_w1.data()}}); + std::array tensors{ + W_forget_w0.data(), W_input_w0.data(), + W_output_w0.data(), W_cell_w0.data()}; + std::array tensors1{ + W_forget_w1.data(), W_input_w1.data(), + W_output_w1.data(), W_cell_w1.data()}; for (int row = 0; row < D; row++) { for (int col = 0; col < 4; col++) { @@ -238,9 +238,9 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, const LoDTensor& B_output, const LoDTensor& B_cell, LoDTensor* out) { - std::array tensors( - {{B_forget.data(), B_input.data(), B_output.data(), - B_cell.data()}}); + std::array tensors{ + 
B_forget.data(), B_input.data(), B_output.data(), + B_cell.data()}; PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); int D = B_forget.dims()[0]; diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 3348abb19b3339b2b3e8b50485133b15a1973a32..7b6ce0da07309a0ed2a5c8bcd5f59d84105261d7 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -57,6 +57,7 @@ std::unique_ptr FCFusePass::ApplyImpl( desc.SetInput("W", std::vector({fc_Y_in})); desc.SetInput("Bias", std::vector({fc_bias_in})); desc.SetOutput("Out", std::vector({fc_out_out})); + desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims")); desc.SetType("fc"); auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out}); diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 2db7d95cae1c8c59691fd642e2462e92ed58814f..4e1e4e27f9ba932b56ecc25e816a2aee9d42362e 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -29,6 +29,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, if (type == "mul") { op->SetInput("X", {inputs[0]}); op->SetInput("Y", {inputs[1]}); + op->SetAttr("x_num_col_dims", {1}); } else if (type == "elementwise_add") { op->SetInput("X", inputs); } diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index a2a8baa5e45d1791120e32c62dd0dbc533668290..ae0e42ff5e89466013382ab97650e6afeeff3d2d 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -84,8 +84,6 @@ void CheckProgram(const ProgramDesc &program) { Graph::Graph(const ProgramDesc &program) : program_(program) { CheckProgram(program_); - // Make the nodes id start from 0. 
- Node::ResetId(); auto var_nodes = InitFromProgram(program_); ResolveHazard(var_nodes); } diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 6384d89d2f2af4ab1d733af5eb1561cab2d09728..0c856f8e610077c69416ccfb8a763d4b8ae881b8 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -116,13 +116,17 @@ class Graph { // Create a normal variable with non-null VarDesc. ir::Node *CreateVarNode(VarDesc *var_desc) { PADDLE_ENFORCE(var_desc); - return AddNode(new ir::Node(var_desc)); + auto *x = AddNode(new ir::Node(var_desc)); + x->SetId(num_node_created_++); + return x; } // Create a normal runnable operator with OpDesc. ir::Node *CreateOpNode(OpDesc *op_desc) { PADDLE_ENFORCE(op_desc); - return AddNode(new ir::Node(op_desc)); + auto *x = AddNode(new ir::Node(op_desc)); + x->SetId(num_node_created_++); + return x; } // Create a control dependency var that connects 2 operations. The @@ -132,13 +136,17 @@ class Graph { // TODO(panyx0718): control var name should be really unique. const std::string name = string::Sprintf( "%s@%llu", ir::Node::kControlDepVarName, node_set_.size()); - return AddNode(new ir::Node(name, ir::Node::Type::kVariable)); + auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable)); + x->SetId(num_node_created_++); + return x; } // A more free style way of creating a graph node. Mostly use for test // or "copy" from another node. Avoid using it if possible. ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) { - return AddNode(new ir::Node(name, type)); + auto *x = AddNode(new ir::Node(name, type)); + x->SetId(num_node_created_++); + return x; } // Clear all node information of the graph and return the ownership of the @@ -160,7 +168,7 @@ class Graph { } // NOTE low performance, but simple and secure. 
- Node *RetriveNode(int id) { + Node *RetrieveNode(int id) { for (auto &node : nodes_) { if (node.second->id() == id) { return node.second.get(); @@ -169,6 +177,7 @@ class Graph { return nullptr; } + const ProgramDesc &program() const { return program_; } std::map> InitFromProgram( const ProgramDesc &program); @@ -190,6 +199,7 @@ class Graph { std::map> attr_dels_; std::map> nodes_; std::unordered_set node_set_; + size_t num_node_created_{0}; // help to generate a unique node id. }; bool IsControlDepVar(const ir::Node &var); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 0a3c8a6cb5c4187063db8ec2decd1f8d55aa39ea..b534a5509279ef7bfc5fc92ec726224e6c5ed16f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -167,10 +167,12 @@ struct HitGroup { bool Match(Node *node, PDNode *pat) { if (nodes_.count(node)) { - if (!roles.count(pat)) return false; - return roles[pat] == node; + if (roles.count(pat) && roles[pat] == node) return true; + return false; + } else { + if (roles.count(pat) && roles[pat] != node) return false; + return true; } - return !roles.count(pat) || roles.at(pat) == node; } void Register(Node *node, PDNode *pat) { @@ -198,7 +200,6 @@ GraphPatternDetector::DetectPatterns() { std::vector result; std::vector init_groups; std::array, 2> bi_records; - // PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed"); auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get() : pattern_.edges().front().first; if (!pdnodes2nodes_.count(first_pnode)) return result; @@ -228,11 +229,12 @@ GraphPatternDetector::DetectPatterns() { VLOG(80) << "check " << source->id() << " -- " << target->id(); // TODO(Superjomn) add some prune strategies. 
for (const auto &group : pre_groups) { - HitGroup new_group = group; - if (IsNodesLink(source, target) && - new_group.Match(source, edge.first)) { - new_group.Register(source, edge.first); - if (new_group.Match(target, edge.second)) { + if (IsNodesLink(source, target)) { + HitGroup new_group = group; + bool flag = new_group.Match(source, edge.first) && + new_group.Match(target, edge.second); + if (flag) { + new_group.Register(source, edge.first); new_group.Register(target, edge.second); cur_groups.push_back(new_group); // TODO(Superjomn) need to unique @@ -261,14 +263,16 @@ GraphPatternDetector::DetectPatterns() { return result; } -bool GraphItemCMP(const std::pair &a, +struct GraphItemLessThan { + bool operator()(const std::pair &a, const std::pair &b) { - if (a.first != b.first) { - return a.first < b.first; - } else { - return a.second < b.second; + if (a.first != b.first) { + return a.first < b.first; + } else { + return a.second < b.second; + } } -} +}; // TODO(Superjomn) enhance the function as it marks unique unique as duplicates // see https://github.com/PaddlePaddle/Paddle/issues/13550 @@ -282,7 +286,7 @@ void GraphPatternDetector::UniquePatterns( for (auto &g : *subgraphs) { // Sort the items in the sub-graph, and transform to a string key. 
std::vector> sorted_keys(g.begin(), g.end()); - std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP); + std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan()); std::stringstream ss; for (auto &item : sorted_keys) { ss << item.first << ":" << item.second; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 9e462ac671ee931fc17a31f32a76049a0990341f..1c5155df7867f95fb403d51bf633084a6c400f12 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -310,8 +310,8 @@ void GraphSafeRemoveNodes(Graph* graph, const std::unordered_set& nodes); // Some pre-defined patterns those can be reused in multiple passes. -// The related Fluid Layer or Op should be one pattern here for better reusage -// accross different fusion. +// The related Fluid Layer or Op should be one pattern here for better re-usage +// across different fusion. namespace patterns { struct KeyCounter { diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc index 414d8f79b15de091c62af5fe099ffae144156e4e..36f36933265c69fcd450894a3e32bbb3e547b62c 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -35,10 +35,11 @@ std::unique_ptr GraphToProgramPass::ApplyImpl( new proto::ProgramDesc(*program.Proto())); auto block = program_pb->mutable_blocks(kRootBlockIndex); + block->set_idx(kRootBlockIndex); block->clear_vars(); std::unordered_set visited_vars; for (ir::Node* n : graph->Nodes()) { - if (n->NodeType() == ir::Node::Type::kVariable) { + if (n->IsVar()) { if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) { visited_vars.insert(n->Var()->Name()); block->add_vars()->MergeFrom(*n->Var()->Proto()); diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index 
084a4ba2def87eaa8badb3ca2c39865c6e5cb981..2ee12cc410393d1e1aa5fc9e5374d858eca1b901 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -66,6 +66,76 @@ NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) { } Node *NodesDFSIterator::operator->() { return stack_.top(); } +inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { + return node.inputs.size() == n; +} + +NodesTSIterator::NodesTSIterator(const std::vector &source) { + PADDLE_ENFORCE(!source.empty(), + "Start points of topological sorting should not be empty!"); + // CHECK all the inputs' in-degree is 0 + for (auto *node : source) { + PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); + } + + std::unordered_set visited; + std::unordered_set to_visit{source.begin(), source.end()}; + + std::vector inlink_visited; + while (!to_visit.empty()) { + std::vector queue(to_visit.begin(), to_visit.end()); + for (auto *p : queue) { + inlink_visited.clear(); + + std::copy_if(p->inputs.begin(), p->inputs.end(), + std::back_inserter(inlink_visited), + [&](Node *x) -> bool { return visited.count(x) != 0; }); + + if (inlink_visited.size() == p->inputs.size()) { + sorted_.push_back(p); + for (auto *_ : p->outputs) { + if (!visited.count(_)) { + to_visit.insert(_); + } + } + + to_visit.erase(p); + visited.insert(p); + } + } + } +} + +NodesTSIterator::NodesTSIterator(const NodesTSIterator &other) + : sorted_(other.sorted_), cursor_(other.cursor_) {} + +Node &NodesTSIterator::operator*() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return *sorted_[cursor_]; +} + +NodesTSIterator &NodesTSIterator::operator++() { + if (++cursor_ >= sorted_.size()) { + sorted_.clear(); + cursor_ = 0; + } + return *this; +} +NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) { + cursor_ = other.cursor_; + sorted_ = other.sorted_; + return *this; +} + +bool NodesTSIterator::operator==(const NodesTSIterator &other) { + return sorted_ == 
other.sorted_ && cursor_ == other.cursor_; +} + +Node *NodesTSIterator::operator->() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return sorted_[cursor_]; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h index f42bab20ed97e372d2da0c4a492a4458ab94e0a0..f6772f9a37567c83c49bd44d551481edda1a74ae 100644 --- a/paddle/fluid/framework/ir/graph_traits.h +++ b/paddle/fluid/framework/ir/graph_traits.h @@ -62,6 +62,32 @@ struct NodesDFSIterator std::unordered_set visited_; }; +// Topological sorting iterator on nodes. +struct NodesTSIterator + : public std::iterator { + NodesTSIterator() = default; + NodesTSIterator(const std::vector &source); + NodesTSIterator(NodesTSIterator &&other) + : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { + other.cursor_ = 0; + } + NodesTSIterator(const NodesTSIterator &other); + + Node &operator*(); + NodesTSIterator &operator++(); + // TODO(Superjomn) current implementation just compare the first + // element, need to compare the graph and all the elements in the queue and + // set. + NodesTSIterator &operator=(const NodesTSIterator &other); + bool operator==(const NodesTSIterator &other); + bool operator!=(const NodesTSIterator &other) { return !(*this == other); } + Node *operator->(); + + private: + std::vector sorted_; + size_t cursor_{0}; +}; + /* * GraphTraits contains some graph traversal algorithms. * @@ -76,6 +102,14 @@ struct GraphTraits { NodesDFSIterator()); } + static iterator_range TS(const Graph &g) { + auto start_points = ExtractStartPoints(g); + PADDLE_ENFORCE(!start_points.empty()); + NodesTSIterator x(start_points); + return iterator_range(NodesTSIterator(start_points), + NodesTSIterator()); + } + private: // The nodes those have no input will be treated as start points. 
static std::vector ExtractStartPoints(const Graph &g) { diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 9277abe8c1b79c5f76f4610d0554bf337f329518..50d9113088903aa7681d6c6af5cc65f846d32787 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -17,8 +17,12 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { +// msvc15 don't support constexpr in correct way. +#if !defined(_WIN32) constexpr char Node::kControlDepVarName[]; -int Node::count_ = 0; +#else +const char Node::kControlDepVarName[] = "__control_var"; +#endif std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type) { diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index eedb375cf46165ebb09af56e5ab052a0327f1d0c..d2a393b3f19e9aab79098757dae663d030b0fa2b 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -55,7 +55,11 @@ class Node { } enum class Type { kOperation, kVariable }; +#if !defined(_WIN32) // msvc not support constexpr correctly. static constexpr char kControlDepVarName[] = "__control_var"; +#else + static const char kControlDepVarName[]; +#endif Type NodeType() const { return type_; } @@ -115,37 +119,30 @@ class Node { int id_; private: + // ID can only set by a Graph. 
+ void SetId(int id) { id_ = id; } + friend class Graph; friend std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type); explicit Node(const std::string& name, Type type) - : name_(name), - var_desc_(nullptr), - op_desc_(nullptr), - type_(type), - id_(count_++) {} + : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} explicit Node(VarDesc* var_desc) : name_(var_desc->Name()), var_desc_(new VarDesc(*var_desc)), op_desc_(nullptr), - type_(Type::kVariable), - id_(count_++) {} + type_(Type::kVariable) {} explicit Node(OpDesc* op_desc) : name_(op_desc->Type()), var_desc_(nullptr), op_desc_(new OpDesc(*op_desc, op_desc->Block())), - type_(Type::kOperation), - id_(count_++) {} + type_(Type::kOperation) {} Node() = delete; - static int count_; - // Please don't use this API or make this public. - static void ResetId() { count_ = 0; } - boost::any wrapper_; std::function wrapper_deleter_; std::type_index wrapper_type_ = std::type_index(typeid(void)); diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 8ac8d7677e1fe339d7802ea262e61a02a678aab5..615b539695de8c3f9a256d17d4d49e61902da394 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -93,6 +93,7 @@ class Pass { protected: virtual std::unique_ptr ApplyImpl(std::unique_ptr graph) const { LOG(FATAL) << "Calling virtual Pass not implemented."; + return graph; } private: @@ -196,26 +197,26 @@ struct PassRegistrar : public Registrar { msg) // Register a new pass that can be applied on the IR. 
-#define REGISTER_PASS(pass_type, pass_class) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __reg_pass__##pass_type, \ - "REGISTER_PASS must be called in global namespace"); \ - static ::paddle::framework::ir::PassRegistrar \ - __pass_registrar_##pass_type##__(#pass_type); \ - int TouchPassRegistrar_##pass_type() { \ - __pass_registrar_##pass_type##__.Touch(); \ - return 0; \ - } \ - static ::paddle::framework::ir::PassRegistrar \ - &__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \ +#define REGISTER_PASS(pass_type, pass_class) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __reg_pass__##pass_type, \ + "REGISTER_PASS must be called in global namespace"); \ + static ::paddle::framework::ir::PassRegistrar \ + __pass_registrar_##pass_type##__(#pass_type); \ + int TouchPassRegistrar_##pass_type() { \ + __pass_registrar_##pass_type##__.Touch(); \ + return 0; \ + } \ + static ::paddle::framework::ir::PassRegistrar \ + &__pass_tmp_registrar_##pass_type##__ UNUSED = \ __pass_registrar_##pass_type##__ -#define USE_PASS(pass_type) \ - STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ - __use_pass_itself_##pass_type, \ - "USE_PASS must be called in global namespace"); \ - extern int TouchPassRegistrar_##pass_type(); \ - static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \ +#define USE_PASS(pass_type) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __use_pass_itself_##pass_type, \ + "USE_PASS must be called in global namespace"); \ + extern int TouchPassRegistrar_##pass_type(); \ + static int use_pass_itself_##pass_type##_ UNUSED = \ TouchPassRegistrar_##pass_type() } // namespace ir diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 8e660f97f051b194a0305dc82371fbe64da7e061..c384456b648d4497bf4bd003b183b773186e0f15 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -57,60 +57,58 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { 
} } -void NaiveExecutor::Prepare(Scope *parent_scope, - const ProgramDesc &program_desc, int block_id, - bool with_feed_fetch_ops) { - if (!parent_scope) { +void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, + int block_id, bool with_feed_fetch_ops) { + if (!scope) { scope_ = new framework::Scope; } else { - scope_ = &parent_scope->NewScope(); + scope_ = scope; } - CreateVariables(program_desc, scope_, block_id); + + VLOG(3) << "NaiveExecutor init with scope " << scope; CreateOps(program_desc, block_id, with_feed_fetch_ops); } void NaiveExecutor::Run() { for (auto &op : ops_) { - VLOG(40) << "run " << op->Type(); + VLOG(3) << std::this_thread::get_id() << " run " << op->Type() + << " on scope " << scope_; op->Run(*scope_, place_); } } -void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope, - int block_id) { - PADDLE_ENFORCE(scope); +void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id, + bool persistable, Scope *scope) { + PADDLE_ENFORCE_NOT_NULL(scope); + auto &global_block = desc.Block(block_id); - const Scope *ancestor_scope = scope; - while (ancestor_scope->parent()) { - ancestor_scope = ancestor_scope->parent(); + const auto *anc = scope; + PADDLE_ENFORCE(anc->parent() != anc); + while (anc->parent()) { + anc = anc->parent(); } - if (ancestor_scope != scope) { - for (auto &var : global_block.AllVars()) { - if (var->Name() == framework::kEmptyVarName) { - continue; - } - // Create persistable vars in ancestor scope. - if (var->Persistable()) { - auto *ptr = const_cast(ancestor_scope)->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(30) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; - } else { // Create temporary variables in local scope. 
- auto *ptr = scope->Var(var->Name()); + for (auto &var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (persistable == var->Persistable()) { + if (persistable) { + if (!anc->FindVar(var->Name())) { + auto *ptr = const_cast(anc)->Var(var->Name()); + VLOG(3) << scope << " Create persistable variable " << var->Name() + << ", which pointer is " << ptr; + InitializeVariable(ptr, var->GetType()); + } + } else { + auto *ptr = const_cast(scope)->Var(var->Name()); + VLOG(3) << scope << " Create variable " << var->Name() + << ", which pointer is " << ptr; InitializeVariable(ptr, var->GetType()); - VLOG(30) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; } } - } else { - for (auto &var : global_block.AllVars()) { - auto *ptr = scope->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(30) << "Create variable " << var->Name() << ", which pointer is " - << ptr; - } } } diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index ddfa6e1f4d8b73f594fc381ab505797491cdd378..5e673f68574c4ddaa4c9260367d09e9f62f6b751 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -35,8 +35,14 @@ class NaiveExecutor { // Create child scope. // Create variables. // @with_feed_fetch_ops: whether to work with the feed and fetch operators. - void Prepare(Scope* parent_scope, const ProgramDesc& program_desc, - int block_id, bool with_feed_fetch_ops); + void Prepare(Scope* scope, const ProgramDesc& program_desc, int block_id, + bool with_feed_fetch_ops); + + // Create variables before head. + // Create parameters if persistable is ture, or create the temporary variables + // instead. + void CreateVariables(const ProgramDesc& desc, int block_id, bool persistable, + Scope* scope); // Run all the operators. 
void Run(); @@ -49,8 +55,6 @@ class NaiveExecutor { void CleanFeedFetchOps(); protected: - void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id); - void CreateOps(const ProgramDesc& desc, int block_id, bool with_feed_fetch_ops); diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc index 6b9f79b9d398bc5a0ee6ba66587924daad0dbbc5..c917630666b082ab7148550707f9f1f720aa25d3 100644 --- a/paddle/fluid/framework/naive_executor_test.cc +++ b/paddle/fluid/framework/naive_executor_test.cc @@ -39,7 +39,7 @@ TEST(NaiveExecutor, Basic) { auto place = platform::CPUPlace(); NaiveExecutor exe(place); - exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/); + exe.Prepare(nullptr, program, 0, false); auto* a_tensor = exe.FindTensor("a"); auto* b_tensor = exe.FindTensor("b"); auto* c_tensor = exe.FindTensor("c"); diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc new file mode 100644 index 0000000000000000000000000000000000000000..8177436d0bd90c3bcf8f91d5c55b66be188b19f9 --- /dev/null +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_NGRAPH +#include +#include + +#include "paddle/fluid/framework/ngraph_bridge.h" + +#include "ngraph/ngraph.hpp" + +namespace paddle { +namespace framework { + +std::map&, + std::shared_ptr>>)>> + NgraphBridge::NG_NODE_MAP = {}; + +void NgraphBridge::build_graph(const std::shared_ptr& op) { + auto& op_type = op->Type(); + NG_NODE_MAP[op_type](op, ngb_node_map); +} + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h new file mode 100644 index 0000000000000000000000000000000000000000..55bf0d21f3471013b1fb780e852d813313345f03 --- /dev/null +++ b/paddle/fluid/framework/ngraph_bridge.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef PADDLE_WITH_NGRAPH + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +#include "ngraph/ngraph.hpp" + +namespace paddle { +namespace framework { + +class NgraphBridge { + public: + static std::map< + std::string, + std::function&, + std::shared_ptr>>)>> + NG_NODE_MAP; + + explicit NgraphBridge( + std::shared_ptr< + std::unordered_map>> + var_node_map) + : ngb_node_map(var_node_map) {} + + void build_graph(const std::shared_ptr& op); + + private: + std::shared_ptr< + std::unordered_map>> + ngb_node_map; +}; + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..d967b2780c21713a2f9a73a3402964103f44269e --- /dev/null +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -0,0 +1,220 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_NGRAPH +#include + +#include +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/ngraph_operator.h" +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/var_type.h" + +namespace paddle { +namespace framework { + +static std::map pd2ng_type_map = { + {proto::VarType::FP32, ngraph::element::f32}, + {proto::VarType::FP64, ngraph::element::f64}, + {proto::VarType::INT32, ngraph::element::i32}, + {proto::VarType::INT64, ngraph::element::i64}, + {proto::VarType::BOOL, ngraph::element::boolean}, +}; + +typedef enum { /* nGraph support state on ops */ + FULL_TRAIN, /* Support full ops for train */ + PARTIAL_TRAIN, /* Support partial ops for train */ + FULL_TEST, /* Support full list of ops for test */ + PARTIAL_TEST /* Support partial list of ops for test */ +} op_state; + +class NgraphOperator { + public: + explicit NgraphOperator(const Scope& scope, const platform::Place& place, + const std::vector>& ops, + const std::unordered_map< + std::string, ngraph::element::Type>& var_type_map, + const std::unordered_set& persist, + const std::unordered_set& fetches, + const std::unordered_set& post_op_inputs, + op_state ng_op_state) + : scope_(scope), + place_(place), + fused_ops_(ops), + var_type_map_(var_type_map), + persistables_(persist), + fetches_(fetches), + post_op_inputs_(post_op_inputs), + ng_op_state_(ng_op_state) {} + + void Run(const Scope& scope, const platform::Place& place) const; + + private: + static std::unordered_map> + func_cache; + const Scope& scope_; + const platform::Place& place_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + op_state ng_op_state_; +}; + +std::vector>::iterator>> +FusedOperator::FusedOpIntervals( + std::vector>* ops) { + std::vector>::iterator>> + intervals; + if 
(ops->empty()) { + return intervals; + } + size_t size = ops->size(); + size_t left = 0; + while (left < size && ops.at(left)->Type() != kFeedOpType) { + ++left; + } + if (left == size) { + return intervals; + } + while (left < size && ops->at(left)->Type() == kFeedOpType) { + ++left; + } + + size_t right = left; + while (right < size && ops->at(right)->Type() != kFetchOpType) { + ++right; + } + if (right == size) { + return intervals; + } + if (left >= right) return intervals; + + // (left, right - 1) represents indices between feed and fetch + size_t pivot = left; + while (pivot < right) { + auto op_type = ops->at(pivot)->Type(); + if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == + paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { + ++pivot; + } else { + size_t start = pivot, end = start; + while (pivot < right && + (paddle::framework::NgraphBridge::NG_NODE_MAP.find( + ops.at(pivot)->Type()) != + paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { + ++pivot; + ++end; + } + std::vector>::iterator> + interval = {ops->begin() + start, ops->begin() + end}; + intervals.push_back(interval); + } + } // end while + + return intervals; +} + +FusedOperator::FusedOperator( + const ProgramDesc& prog, size_t block_id, + std::vector>::iterator start, + std::vector>::iterator end, + const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) { + for (std::vector>::iterator it = start; + it != end; ++it) { + fused_ops_.push_back(std::move(*it)); + } + + for (std::vector>::iterator it = end; + (*it)->Type() != kFetchOpType; ++it) { + for (auto& var_name_item : (*it)->Inputs()) { + for (auto& var_name : var_name_item.second) { + post_op_inputs_.insert(var_name); + } + } + } + + if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { + is_complete = true; + } + + Process(); +} + +void 
FusedOperator::Process() { + auto& bdesc = pdesc_.Block(block_); + for (auto& var : bdesc.AllVars()) { + if (!(var->GetType() == proto::VarType::SELECTED_ROWS || + var->GetType() == proto::VarType::LOD_TENSOR || + var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) { + continue; + } + + auto var_name = var->Name(); + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var_name != "fetch" && var_name != "feed") { + auto pd_type = var->GetDataType(); + if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { + PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", + var_name); + } + var_type_map_[var_name] = pd2ng_type_map[pd_type]; + } + + if (var->Persistable()) { + persistables_.insert(var->Name()); + } + } + + for (auto* op : bdesc.AllOps()) { + if (op->Type() == kFetchOpType) { + std::string fetch_target_name = op->Input("X")[0]; + fetches_.insert(fetch_target_name); + } + } +} + +void FusedOperator::RunImpl(const Scope& scope, + const platform::Place& place) const { + op_state ng_op_state = PARTIAL_TEST; + auto& bdesc = pdesc_.Block(block_); + for (auto* op : bdesc.AllOps()) { + if (op->Type().find("_grad") != std::string::npos) { + ng_op_state = PARTIAL_TRAIN; + break; + } + } + + if (is_full) { + ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; + } + + NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_, + persistables_, fetches_, post_op_inputs_, + ng_op_state); + ngraph_op.Run(scope, place); +} + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h new file mode 100644 index 0000000000000000000000000000000000000000..0f655cef1dde624bcf4944b5c096279097e1c8ae --- /dev/null +++ b/paddle/fluid/framework/ngraph_operator.h @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_NGRAPH + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/ngraph_bridge.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/variant.h" + +#include "ngraph/ngraph.hpp" + +namespace paddle { +namespace framework { + +class FusedOperator : public OperatorBase { + public: + static std::vector< + std::vector>::iterator>> + FusedOpIntervals( + std::vector>* ops); + + explicit FusedOperator( + const ProgramDesc& prog, size_t block_id, + std::vector>::iterator start, + std::vector>::iterator end, + const std::string& type = "fused_op", const VariableNameMap& inputs = {}, + const VariableNameMap& outputs = {}, const AttributeMap& attrs = {}); + + void RunImpl(const Scope& scope, const platform::Place& place) const final; + + private: + const ProgramDesc pdesc_; + size_t block_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + bool is_full_ = false; + + 
void Process(); +}; +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 5624878d439873e5f6aee6ec9234e31d5c77ff97..6bd744edc22e6a90ce64e9d699e7f3c5c60d4908 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -150,14 +150,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #endif } - // The profile has a process-wide mutex, results in serious performance issue - // in concurrency scenerio. Here use an `if` to fix this issue. - // Please not remove the `if`, ask @Superjomn if there are any concern. +// The profile has a process-wide mutex, results in serious performance issue +// in concurrency scenerio. Here use an `if` to fix this issue. +// Please not remove the `if`, ask @Superjomn if there are any concern. +#ifndef _WIN32 if (platform::IsProfileEnabled()) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(Type(), pool.Get(place)); RunImpl(scope, place); - } else { + } else +#endif + { RunImpl(scope, place); } VLOG(30) << place << " " << DebugStringEx(&scope); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0c407f8c1d11a8a0f99551fc51d2ef2be5262c63..bbeef150254f8f7a1f382a5b81055a6a5589eee1 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include // for unique_ptr +#include #include +#include #include "glog/logging.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" @@ -36,6 +38,16 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clear tensors." 
"Disabled when this value is less than 0"); +// When in inference scenario, the scopes will not be written by two threads in +// a mean time, but a scope may be read by multiple threads concurrently, and +// the mutex will cause serious performance issue. +// So the mutex is disabled when `ON_INFER`. +#ifdef ON_INFER +#define SCOPE_LOCK_GUARD +#else +#define SCOPE_LOCK_GUARD std::lock_guard lock(mutex_); +#endif + namespace paddle { namespace framework { @@ -49,18 +61,18 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD return VarInternal(name); } Variable* Scope::Var(std::string* name) { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -69,34 +81,34 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD return FindScopeInternal(var); } void Scope::DropKids() { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD std::vector known_vars; known_vars.reserve(this->vars_.size()); for 
(auto& p : vars_) { @@ -106,9 +118,10 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); - PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); + PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", + this, scope); this->kids_.erase(it); // When making memory benchmark on Fluid, we have to delete scope sync. if (FLAGS_benchmark || FLAGS_eager_delete_scope) { @@ -119,7 +132,7 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -132,12 +145,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - std::lock_guard lock(mutex_); + SCOPE_LOCK_GUARD auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; @@ -189,5 +202,46 @@ Variable* Scope::FindVarLocally(const std::string& name) const { return nullptr; } +std::string GenScopeTreeDebugInfo(Scope* root) { + std::stringstream os; + + if (!root) return ""; + + // level traversal + std::queue queue; + queue.push(root); + + std::vector scopes; + + while (!queue.empty()) { + auto* end = queue.back(); + Scope* q = nullptr; + while (q != end) { + q = queue.front(); + queue.pop(); + os << q << " "; + scopes.push_back(q); + + for (auto* c : q->kids()) { + queue.push(c); + } + } + // end of a level + os << "\n------------------------------------------\n"; + } + + 
os << "\nDetails:\n\n"; + + for (Scope* q : scopes) { + os << "====\n"; + os << q << ":\n"; + for (auto& var : q->LocalVarNames()) { + os << " - " << var << "\n"; + } + } + + return os.str(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 9462620e829ec815e1553f6378a67463ea3b8aa3..1901ffbe57e0d85193c3a218f06eba06a0f287a5 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -78,11 +78,11 @@ class Scope { /// Drop all kids scopes belonged to this scope. void DropKids(); - std::list& kids() const { return kids_; } - /// Find if a scope exists in the kid scopes bool HasKid(const Scope* scope) const; + const std::list& kids() const { return kids_; } + // enumerate all the variables current contains. std::vector LocalVarNames() const; @@ -118,12 +118,17 @@ class Scope { // Scope in `kids_` are owned by this class. mutable std::list kids_; - Scope const* parent_{nullptr}; + const Scope* parent_{nullptr}; DISABLE_COPY_AND_ASSIGN(Scope); private: mutable std::mutex mutex_; }; + +// Generate some debug string about the inherience structure of scope, quite +// naive. 
+std::string GenScopeTreeDebugInfo(Scope*); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 3319c772ec789bc5b28307906adfb2a9417d9182..f4f2b769d5e47d8fba8d08476df4cd8e54133551 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -63,6 +63,26 @@ struct TensorCopyVisitor { int64_t size_; }; +struct TensorFillVisitor { + TensorFillVisitor(framework::Tensor* dst, int64_t dst_offset, int64_t size, + float value) + : dst_(dst), dst_offset_(dst_offset), size_(size) {} + + template + void apply() const { + // TODO(qiao): support other place + platform::CPUPlace cpu; + auto* tensor_data = dst_->mutable_data(cpu); + auto* start = tensor_data + dst_offset_; + auto* end = start + size_; + std::fill(start, end, static_cast(0.0)); + } + + framework::Tensor* dst_; + int64_t dst_offset_; + int64_t size_; +}; + void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, const platform::DeviceContext& dev_ctx) { { // the 1st field, uint32_t version @@ -120,7 +140,17 @@ bool SelectedRows::HasKey(int64_t key) const { : true; } -int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) { +int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown, + bool is_test) { + if (is_test) { + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + return -1; + } else { + return iter->second; + } + } + rwlock_->RDLock(); auto iter = id_to_index_.find(key); if (iter == id_to_index_.end()) { @@ -172,7 +202,7 @@ void SelectedRows::SyncIndex() { } void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, - bool auto_grown) { + bool auto_grown, bool is_test) { PADDLE_ENFORCE(value->IsInitialized(), "The value tensor should be initialized."); if (ids.numel() == 0) { @@ -183,11 +213,19 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, "output tensor 
should have the same shape with table " "except the dims[0]."); for (int i = 0; i < ids.numel(); ++i) { - int64_t index = AutoGrownIndex(ids.data()[i], auto_grown); - framework::VisitDataType( - framework::ToDataType(value_->type()), - TensorCopyVisitor(value, i * value_width, *value_.get(), - index * value_width, value_width)); + auto id = ids.data()[i]; + int64_t index = AutoGrownIndex(id, auto_grown, is_test); + if (index < 0) { + VLOG(5) << "id " << id << " not in the table, return 0"; + framework::VisitDataType( + framework::ToDataType(value_->type()), + TensorFillVisitor(value, i * value_width, value_width, 0.0)); + } else { + framework::VisitDataType( + framework::ToDataType(value_->type()), + TensorCopyVisitor(value, i * value_width, *value_.get(), + index * value_width, value_width)); + } } } } diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index daf5e95304fb84eaba26a30c45414d5021e7ffcb..55ca02038e083da4f8984f70fecf4ca2d878088e 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -105,7 +105,7 @@ class SelectedRows { * the value */ void Get(const framework::Tensor& ids, framework::Tensor* value, - bool auto_grown = false); + bool auto_grown = false, bool is_test = false); /* * @brief Get the index of the key from id_to_index_ map. If the key not @@ -118,7 +118,7 @@ class SelectedRows { * * @return index of the key. 
*/ - int64_t AutoGrownIndex(int64_t key, bool auto_grown); + int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false); void SyncIndex(); diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc index 9c427a4ae4c9660b107ca891a60db306cb09301f..3b0509e0344efedf08ab21cac0a075049617ca97 100644 --- a/paddle/fluid/framework/selected_rows_test.cc +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -84,10 +84,14 @@ TEST(SelectedRows, SparseTable) { data[i * embedding_width + j] = static_cast(i); } } - ASSERT_EQ(table.AutoGrownIndex(10, true), 0); - ASSERT_EQ(table.AutoGrownIndex(8, true), 1); - ASSERT_EQ(table.AutoGrownIndex(8, true), 1); - ASSERT_EQ(table.AutoGrownIndex(6, true), 2); + ASSERT_EQ(table.AutoGrownIndex(10, true, false), 0); + ASSERT_EQ(table.AutoGrownIndex(8, true, false), 1); + ASSERT_EQ(table.AutoGrownIndex(8, true, false), 1); + ASSERT_EQ(table.AutoGrownIndex(6, true, false), 2); + for (int64_t i = 11; i < 20; i++) { + ASSERT_EQ(table.AutoGrownIndex(i, true, true), -1); + ASSERT_TRUE(!table.HasKey(i)); + } ASSERT_TRUE(table.HasKey(10)); ASSERT_TRUE(table.HasKey(8)); ASSERT_TRUE(table.HasKey(6)); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index e5678cf607a8ff3763e79c1f321a81c021846fb1..fc656613010d18608c2780c96212f2d7bb674704 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -16,9 +16,21 @@ cc_library(paddle_fluid_api DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) +get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS) +if (WIN32) +list(APPEND fluid_third_partys gflags glog protobuf cblas) +endif(WIN32) # paddle_fluid_origin exclude inference api interface -cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +if(WIN32) + sep_library(paddle_fluid_origin 
DEPS ${fluid_modules} paddle_fluid_api) + if(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_fluid_origin ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) +else(WIN32) + cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +endif(WIN32) add_subdirectory(api) @@ -27,13 +39,17 @@ set(SHARED_INFERENCE_SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) -if (WITH_GPU AND TENSORRT_FOUND) - set(STATIC_INFERENCE_APIS ${STATIC_INFERENCE_APIS} paddle_inference_tensorrt_subgraph_engine) - set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/api/api_tensorrt_subgraph_engine.cc) -endif() -# Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) +if(WIN32) + sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array + analysis_config paddle_pass_builder) + if(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_fluid ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) +else(WIN32) + cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array + analysis_config paddle_pass_builder) +endif(WIN32) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. 
@@ -42,11 +58,20 @@ if(NOT APPLE) endif() # Create shared library -cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} - DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) +if(WIN32) + sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} + DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) + target_link_libraries(paddle_fluid_shared shlwapi) + if(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_fluid_origin ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) +else(WIN32) + cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} + DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) +endif() set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) -if(NOT APPLE) +if(NOT APPLE AND NOT WIN32) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map") set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 0354f9e6e9588af601210b8a71ae98c1f90d62f0..eb89fc5e1124e97b082d6299e3efc44591a8b01b 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,24 +1,25 @@ -cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) -set(analysis_deps - framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) +unset(analysis_deps CACHE) +set(analysis_deps # analysis_deps can be extended accross the project + framework_proto proto_desc graph pass paddle_fluid_api executor pretty_log + ir_pass_manager + CACHE INTERNAL "") -cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc +add_subdirectory(ir_passes) 
+add_subdirectory(passes) + +cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES}) + +cc_library(argument SRCS argument.cc DEPS scope proto_desc) +cc_library(analysis_pass SRCS analysis_pass.cc DEPS proto_desc) + +cc_library(analysis SRCS analyzer.cc helper.cc - # passes - analysis_pass.cc - fluid_to_data_flow_graph_pass.cc - data_flow_graph_to_fluid_pass.cc - dfg_graphviz_draw_pass.cc - tensorrt_subgraph_pass.cc - tensorrt_subgraph_node_mark_pass.cc - fluid_to_ir_pass.cc - model_store_pass.cc - DEPS ${analysis_deps}) + analysis_pass + DEPS ${analysis_deps} + ) -cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) -cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) function(inference_analysis_test TARGET) if(WITH_TESTING) @@ -34,13 +35,3 @@ function(inference_analysis_test TARGET) endfunction(inference_analysis_test) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api) -inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) -inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) -inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) -inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) -inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) -inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) -inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc) -inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) -inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) -inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/analysis_pass.h 
b/paddle/fluid/inference/analysis/analysis_pass.h index 13805ea4acf936b242bcd86b2faf89813753a9fe..299f235a74ae0ffb663be61079607d8ac1105a97 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -19,42 +19,36 @@ limitations under the License. */ #include #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/inference/analysis/argument.h" -#include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/inference/analysis/node.h" namespace paddle { namespace inference { namespace analysis { +/* + * AnalysisPass is a pass used to control the IR passes. + */ class AnalysisPass { public: AnalysisPass() = default; virtual ~AnalysisPass() = default; - // Mutable Pass. - virtual bool Initialize(Argument *argument) { return false; } - // Readonly Pass. - virtual bool Initialize(const Argument &argument) { return false; } - // Virtual method overriden by subclasses to do any necessary clean up after - // all passes have run. - virtual bool Finalize() { return false; } - - // Create a debugger Pass that draw the DFG by graphviz toolkit. - virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; } - - // Run on a single DataFlowGraph. - virtual void Run(DataFlowGraph *x) = 0; + // Run on a single Graph. + void Run(Argument* argument) { RunImpl(argument); } // Human-readable short representation. virtual std::string repr() const = 0; // Human-readable long description. virtual std::string description() const { return "No DOC"; } -}; -// GraphPass processes on any GraphType. -class DataFlowGraphPass : public AnalysisPass {}; + protected: + // User should implement these. 
+ virtual void RunImpl(Argument* argument) = 0; + + Argument* argument_{nullptr}; +}; } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index d55303a51e9fee3057455470b4b3139dc5f85e89..c8ed373ee7c32552608d501aa642677f940cd520 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -15,138 +15,23 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include #include - -#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" -#include "paddle/fluid/inference/analysis/model_store_pass.h" -#include "paddle/fluid/inference/analysis/pass_manager.h" -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" - -DEFINE_bool(IA_enable_tensorrt_subgraph_engine, false, - "Enable subgraph to TensorRT engine for acceleration"); - -DEFINE_bool(IA_enable_ir, false, "Turn on IR support"); - -DEFINE_string(IA_graphviz_log_root, "./", - "Graphviz debuger for data flow graphs."); - -DEFINE_string(IA_output_storage_path, "", "optimized model output path"); +#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h" +#include "paddle/fluid/inference/analysis/passes/passes.h" namespace paddle { namespace inference { namespace analysis { -class DfgPassManagerImpl final : public DfgPassManager { - public: - DfgPassManagerImpl() { - // TODO(Superjomn) set the key with pass reprs. 
- if (!FLAGS_IA_enable_ir) { - AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); - } else { - AddPass("fluid-to-ir-pass", new FluidToIrPass); - } - TryAddTensorRtPass(); - AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); - if (!FLAGS_IA_output_storage_path.empty()) { - AddPass("model-store-pass", new ModelStorePass); - } - } +Analyzer::Analyzer() {} - std::string repr() const override { return "dfg-pass-manager"; } - std::string description() const override { return "DFG pass manager."; } +void Analyzer::Run(Argument *argument) { RunIrAnalysis(argument); } - private: - void AddPass(const std::string& name, AnalysisPass* pass) { - VLOG(30) << "Adding pass " << name; - Register(name, pass); - AddGraphvizDebugerPass(pass); - } +void Analyzer::RunIrAnalysis(Argument *argument) { + std::vector passes({"ir_analysis_compose_pass"}); - void TryAddTensorRtPass() { - if (FLAGS_IA_enable_tensorrt_subgraph_engine) { - auto trt_teller = [&](const Node* node) { - std::unordered_set teller_set( - {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", - "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "dropout"}); - if (!node->IsFunction()) return false; - - const auto* func = static_cast(node); - if (teller_set.count(func->func_type())) { - return true; - } else { - return false; - } - }; - - AddPass("tensorrt-subgraph-marker", - new TensorRTSubgraphNodeMarkPass(trt_teller)); - AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); - } - } - - // Add the graphviz debuger pass if the parent pass has one. 
- void AddGraphvizDebugerPass(AnalysisPass* pass) { - auto* debuger_pass = pass->CreateGraphvizDebugerPass(); - if (debuger_pass) { - Register(debuger_pass->repr(), debuger_pass); - } + for (auto &pass : passes) { + PassRegistry::Global().Retreive(pass)->Run(argument); } -}; - -Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } - -void Analyzer::Run(Argument* argument) { - std::vector passes; - passes.push_back("graph_viz_pass"); // add graphviz for debug. -#ifdef PADDLE_WITH_MKLDNN - if (use_mkldnn_) { - VLOG(30) << "Adding MKL-DNN placement pass"; - passes.push_back("mkldnn_placement_pass"); - } -#endif - // infer_clean_graph_pass should be the first default pass - // after mkldnn_placement_pass. - passes.push_back("infer_clean_graph_pass"); - passes.push_back("graph_viz_pass"); // add graphviz for debug. - for (auto& pass : ir_passes_) { - // skip mkldnn pass when use_mkldnn_ = false; - bool skip_pass = (!use_mkldnn_) && pass.find("mkldnn") != std::string::npos; - if (!disabled_ir_passes_.count(pass) && !skip_pass) { - passes.push_back(pass); - passes.push_back("graph_viz_pass"); // add graphviz for debug. 
- } - } - argument->Set(kFluidToIrPassesAttr, new std::vector(passes)); - - for (auto& x : data_) { - PADDLE_ENFORCE(x->Initialize(argument)); - x->RunAll(); - PADDLE_ENFORCE(x->Finalize()); - } -} - -Analyzer& Analyzer::IncludeAllIrPasses() { - ir_passes_ = all_ir_passes_; - return *this; -} - -Analyzer& Analyzer::DisableIrPasses(const std::vector& passes) { - disabled_ir_passes_.insert(passes.begin(), passes.end()); - return *this; -} - -Analyzer& Analyzer::IncludeIrPasses(const std::vector& passes) { - ir_passes_ = passes; - return *this; -} - -Analyzer& Analyzer::SetUseMkldnn(bool use_mkldnn) { - use_mkldnn_ = use_mkldnn; - return *this; } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 3af1d572dfd81197797dd7e57d87ba12c2f3548e..b43e67f20f493cd8151871ca3a36eb6fdadcf9ff 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -40,56 +40,21 @@ limitations under the License. */ #include #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/flags.h" -#include "paddle/fluid/inference/analysis/pass_manager.h" namespace paddle { namespace inference { namespace analysis { -class Analyzer : public OrderedRegistry { +class Analyzer final { public: - // Register all the pass-managers. Analyzer(); void Run(Argument* argument); - Analyzer& DisableIrPasses(const std::vector& passes); - Analyzer& IncludeIrPasses(const std::vector& passes); - Analyzer& IncludeAllIrPasses(); - Analyzer& SetUseMkldnn(bool use_mkldnn); - DISABLE_COPY_AND_ASSIGN(Analyzer); - private: - // All avaiable IR passes. - // The bigger fuse comes first, so that the small operators prefer to be - // merged in a larger fuse op. The small fusion will not break the pattern of - // larger fusion. - const std::vector all_ir_passes_{{ - // Manual update the passes here. 
- "attention_lstm_fuse_pass", // - "seqconv_eltadd_relu_fuse_pass", // - "embedding_fc_lstm_fuse_pass", // - "fc_lstm_fuse_pass", // - "mul_lstm_fuse_pass", // - "fc_gru_fuse_pass", // - "mul_gru_fuse_pass", // - "seq_concat_fc_fuse_pass", // - "fc_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_eltwiseadd_bn_fuse_pass", // -#ifdef PADDLE_WITH_MKLDNN - "depthwise_conv_mkldnn_pass", // - "conv_bias_mkldnn_fuse_pass", // - "conv_relu_mkldnn_fuse_pass", // - "conv_elementwise_add_mkldnn_fuse_pass", // -#endif - }}; - - std::unordered_set disabled_ir_passes_; - // Ir passes to run - std::vector ir_passes_; - bool use_mkldnn_; + protected: + void RunIrAnalysis(Argument* argument); }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 5430e5c1ef1c70d27295ebc1a9bd427cd95f006a..48fc5dda2a5bfa24d679d4bf655e580dafc614b3 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -27,21 +27,21 @@ namespace analysis { using namespace framework; // NOLINT TEST(Analyzer, analysis_without_tensorrt) { - FLAGS_IA_enable_tensorrt_subgraph_engine = false; Argument argument; - argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); + argument.SetModelDir(FLAGS_inference_model_dir); + argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); + Analyzer analyser; analyser.Run(&argument); } TEST(Analyzer, analysis_with_tensorrt) { - FLAGS_IA_enable_tensorrt_subgraph_engine = true; Argument argument; - argument.Set("minimum_subgraph_size", new int(0)); - argument.Set("max_batch_size", new int(3)); - argument.Set("workspace_size", new int(1 << 20)); - argument.Set("precision_mode", new std::string("FP32")); - argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); + argument.SetTensorRtMaxBatchSize(3); + argument.SetTensorRtWorkspaceSize(1 << 20); + 
argument.SetModelDir(FLAGS_inference_model_dir); + argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); + Analyzer analyser; analyser.Run(&argument); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 9495e2435c79ff660c64322d2acd8e058e09e563..d7a2f3d1e3a3251263c8670aef5db538fa2c48ea 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -24,13 +24,16 @@ #pragma once #include +#include +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/variant.h" namespace paddle { namespace inference { namespace analysis { +using framework::ir::Graph; /* * The argument definition of both Pass and PassManagers. @@ -39,75 +42,99 @@ namespace analysis { */ struct Argument { Argument() = default; - explicit Argument(const std::string& fluid_model_dir) - : fluid_model_dir(new std::string(fluid_model_dir)) {} - // The directory of the trained model. - std::unique_ptr fluid_model_dir; - // The path of `__model__` and `param`, this is used when the file name of - // model and param is changed. - std::unique_ptr fluid_model_program_path; - std::unique_ptr fluid_model_param_path; - - // The graph that process by the Passes or PassManagers. - std::unique_ptr main_dfg; - - // The original program desc. - std::unique_ptr origin_program_desc; - - // The processed program desc. - std::unique_ptr transformed_program_desc; - - // The output storage path of ModelStorePass. - std::unique_ptr model_output_store_path; - - // Support for any other attributes. 
- template - void Set(const std::string& key, T* data) { - PADDLE_ENFORCE_NOT_NULL(data); - PADDLE_ENFORCE(!attrs_.count(key), "Duplicate set Argument's attr [%s]", - key); - attrs_[key] = data; - attr_deleters_[key] = [data, key]() { - VLOG(30) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; - VLOG(30) << "argument delete attr: " << key; - delete data; - }; - } - - bool Has(const std::string& name) const { return attrs_.count(name); } - - template - T* Release(const std::string& key) { - PADDLE_ENFORCE(attrs_.count(key)); - auto* res = boost::any_cast(attrs_.at(key)); - attrs_.erase(key); - attr_deleters_.erase(key); - return res; - } - - template - T& Get(const std::string& key) { - PADDLE_ENFORCE(Has(key)); - return *boost::any_cast(attrs_.at(key)); - } - - ~Argument() { - for (auto& item : attr_deleters_) { - item.second(); - } - } + explicit Argument(const std::string& model_dir) { SetModelDir(model_dir); } + + using unique_ptr_t = std::unique_ptr>; + using fusion_statis_t = std::unordered_map; + + bool Has(const std::string& key) const { return valid_fields_.count(key); } + +#define DECL_ARGUMENT_FIELD(field__, Field, type__) \ + public: \ + type__& field__() { \ + PADDLE_ENFORCE(Has(#field__)); \ + return field__##_; \ + } \ + void Set##Field(const type__& x) { \ + field__##_ = x; \ + valid_fields_.insert(#field__); \ + } \ + DECL_ARGUMENT_FIELD_VALID(field__); \ + type__* field__##_ptr() { return &field__##_; } \ + \ + private: \ + type__ field__##_; + +#define DECL_ARGUMENT_FIELD_VALID(field__) \ + bool field__##_valid() { return Has(#field__); } + +#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \ + public: \ + type__& field__() { \ + PADDLE_ENFORCE_NOT_NULL(field__##_); \ + PADDLE_ENFORCE(Has(#field__)); \ + return *static_cast(field__##_.get()); \ + } \ + void Set##Field(type__* x) { \ + field__##_ = \ + unique_ptr_t(x, [](void* x) { delete static_cast(x); }); \ + valid_fields_.insert(#field__); \ + } \ + void Set##Field##NotOwned(type__* x) { \ + 
valid_fields_.insert(#field__); \ + field__##_ = unique_ptr_t(x, [](void* x) {}); \ + } \ + DECL_ARGUMENT_FIELD_VALID(field__); \ + type__* field__##_ptr() { \ + PADDLE_ENFORCE(Has(#field__)); \ + return static_cast(field__##_.get()); \ + } \ + type__* Release##Field() { \ + PADDLE_ENFORCE(Has(#field__)); \ + valid_fields_.erase(#field__); \ + return static_cast(field__##_.release()); \ + } \ + \ + private: \ + unique_ptr_t field__##_; + + // Model path + DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string); + // Model specified with program and parameters files. + DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); + DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); + + // The overall graph to work on. + DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); + // The overall Scope to work on. + DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope); + + DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc); + + // The ir passes to perform in analysis phase. + DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses, + std::vector); + + DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); + DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); + DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller, + std::function); + DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); + DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); + + // The program transformed by IR analysis phase. 
+ DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, + framework::proto::ProgramDesc); + + DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t); private: - std::unordered_map attrs_; - std::unordered_map> attr_deleters_; + std::unordered_set valid_fields_; }; -#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -#define ANALYSIS_ARGUMENT_CHECK_FIELD(field__) \ - if (UNLIKELY(!(field__))) { \ - LOG(ERROR) << "field " << #field__ << " should be set."; \ - return false; \ - } +#define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \ + PADDLE_ENFORCE(argument__->Has(#fieldname__), \ + "the argument field [%s] should be set", #fieldname__); } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc deleted file mode 100644 index bdcb30f159ec17a8d3b23d020b2ea082e71ace1b..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ /dev/null @@ -1,496 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/analysis/data_flow_graph.h" -#include "paddle/fluid/inference/analysis/dot.h" -#include "paddle/fluid/inference/analysis/node.h" - -namespace paddle { -namespace inference { -namespace analysis { -using ir_node_t = framework::ir::Node; -using ir_graph_t = framework::ir::Graph; - -// It is a better idea that the inputs and outputs of this graph is set manually -// before, but there must be a Pass that helps to prune the unnecessary ops that -// do not contribute to the given targets, so in this pass, analysis and get the -// inputs and outputs is OK. -void DataFlowGraph::Build() { - inputs_.clear(); - outputs_.clear(); - std::unordered_set ins; - std::unordered_set outs; - for (auto &node : nodes.nodes()) { - for (auto *in : node->inlinks) { - ins.insert(in); - } - for (auto *out : node->outlinks) { - outs.insert(out); - } - } - - // The nodes that in ins but not in outs is the graph's inputs - // similarly, the nodes that in outs but not in ins is the graphs' outputs - for (auto *in : ins) { - if (!outs.count(in)) { - inputs_.push_back(in); - } - } - for (auto *out : outs) { - if (!ins.count(out)) { - outputs_.push_back(out); - } - } - - Clean(); -} - -void DataFlowGraph::Build(const framework::proto::ProgramDesc &prog) { - // insert vars - // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id - // will keep updating to its latest alias during the graph-building. 
- std::unordered_map var2id; - auto &main_block = prog.blocks(framework::kRootBlockIndex); - for (int i = 0; i < main_block.vars_size(); i++) { - const auto &var = main_block.vars(i); - auto *v = nodes.Create(Node::Type::kValue); - v->SetName(var.name()); - v->SetPbDesc(const_cast(static_cast(&var))); - v->SetPbMsg(var.SerializeAsString()); - var2id[var.name()] = v->id(); - } - - // The variables in a SSA can only write once, so if a variable is written - // multiple times(quite common in our ProgramDesc design), multiple alias - // Nodes of this variable will be created, and each will just write once. - - // An set that keep all the names of the variables(the original, not alias) - // that have been written(as outputs). Once an Op's output variable hit the - // set, it should create a new alias and update the global alias for this - // variable. And that make a Data Flow Graph a SSA. - std::unordered_set unique_written_vars; - for (int i = 0; i < main_block.ops_size(); i++) { - const auto &op = main_block.ops(i); - auto *o = nodes.Create(Node::Type::kFunction); - o->SetName(op.type()); - static_cast(o)->SetFuncType(op.type()); - // Link to the original protobuf message's memory, make it easier to - // generate from a data flow graph to fluid ProgramDesc. - o->SetPbDesc(const_cast(static_cast(&op))); - o->SetPbMsg(op.SerializeAsString()); - - // set inputs and outputs - for (int j = 0; j < op.inputs_size(); j++) { - auto &in_var = op.inputs(j); - for (int k = 0; k < in_var.arguments_size(); k++) { - auto *in = nodes.GetMutable(var2id.at(in_var.arguments(k))); - in->outlinks.push_back(o); - o->inlinks.push_back(in); - unique_written_vars.insert(in); - } - } - for (int j = 0; j < op.outputs_size(); j++) { - auto &out_var = op.outputs(j); - for (int k = 0; k < out_var.arguments_size(); k++) { - auto *out = nodes.GetMutable(var2id[out_var.arguments(k)]); - if (unique_written_vars.count(out)) { - // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). 
- auto *out_alias = nodes.Create(Node::Type::kValue); - out_alias->SetName(out->name()); - out_alias->SetPbDesc(out->pb_desc()); - out_alias->SetPbMsg(out->pb_msg()); - var2id[out_alias->name()] = - out_alias->id(); // update variable's alias Node - LOG(INFO) << "loop found in graph, create SSA alias node [" - << out_alias->repr() << "] for [" << out->repr() << "]"; - out = out_alias; - } - out->inlinks.push_back(o); - o->outlinks.push_back(out); - } - } - } - // Analysis and extract the inputs and outputs of this graph. - Build(); -} - -void DataFlowGraph::Build(const framework::ir::Graph &graph) { - // Create nodes - std::unordered_map ir_node_map; - for (auto *ir_node : graph.Nodes()) { - Node *x{nullptr}; - if (ir_node->IsOp()) { - PADDLE_ENFORCE(ir_node->Op()); - VLOG(40) << "get op " << ir_node << " " << ir_node->Name(); - x = nodes.Create(Node::Type::kFunction); - x->attr("ir_node").Pointer() = ir_node; - PADDLE_ENFORCE(ir_node->Op()->Proto()); - x->SetName(ir_node->Op()->Proto()->type()); - x->SetPbMsg(ir_node->Op()->Proto()->SerializeAsString()); - } else if (ir_node->IsVar()) { - // Not create a Node for IR ControlDepVar, considering Inference currently - // just used in single thread scenerio. - VLOG(40) << "get var " << ir_node->Name(); - x = nodes.Create(Node::Type::kValue); - x->attr("ir_node").Pointer() = ir_node; - x->SetName(ir_node->Name()); - // x->SetPbMsg(ir_node->Var()->Proto()->SerializeAsString()); - } else { - PADDLE_THROW("Failed to create an Node from IR, unknown type"); - } - ir_node_map.emplace(ir_node, x); - } - VLOG(40) << "finish creating Nodes"; - - VLOG(40) << "to create edge"; - // Create links - for (auto *ir_node : graph.Nodes()) { - auto it = ir_node_map.find(ir_node); - // Skip ControlDepVar. 
- if (it == ir_node_map.end()) continue; - auto *node = it->second; - for (auto *x : ir_node->inputs) { - if (!ir_node_map.count(x)) continue; - node->inlinks.push_back(ir_node_map.at(x)); - } - for (auto *x : ir_node->outputs) { - if (!ir_node_map.count(x)) continue; - node->outlinks.push_back(ir_node_map.at(x)); - } - } - - Build(); - PADDLE_ENFORCE(!inputs_.empty(), - "Can't deduce any inputs from the graph, Is the graph empty?"); - - ir_graph = &graph; - VLOG(30) << "finished build from IR"; -} - -void DataFlowGraph::Clean() { - for (auto &node : nodes.nodes()) { - std::unordered_set inlinks_set(node->inlinks.begin(), - node->inlinks.end()); - std::unordered_set outlinks_set(node->outlinks.begin(), - node->outlinks.end()); - if (inlinks_set.size() < node->inlinks.size()) { - node->inlinks.assign(inlinks_set.begin(), inlinks_set.end()); - } - if (outlinks_set.size() < node->outlinks.size()) { - node->outlinks.assign(outlinks_set.begin(), outlinks_set.end()); - } - } -} - -std::string DataFlowGraph::DotString() const { - Dot dot; - - // Add nodes - for (size_t i = 0; i < nodes.size(); i++) { - const Node &node = nodes.Get(i); - dot.AddNode(node.repr(), node.dot_attrs()); - } - - // Add edges - for (size_t i = 0; i < nodes.size(); i++) { - const Node &node = nodes.Get(i); - for (auto &in : node.inlinks) { - dot.AddEdge(in->repr(), node.repr(), {}); - } - } - return dot.Build(); -} - -std::string DataFlowGraph::HumanReadableInfo(bool show_values, - bool show_functions) const { - std::stringstream values, functions; - for (auto &n : nodes.nodes()) { - if (show_values && n->IsValue()) { - values << n->repr() << "\n"; - } - if (show_functions && n->IsFunction()) { - functions << n->repr() << "\n"; - } - } - return "Values:\n" + values.str() + "\n\n" + "Functions:\n" + functions.str(); -} - -// -// NodesBFSIterator -// - -GraphTraits::NodesBFSIterator::NodesBFSIterator( - const std::vector &source) - : queue_(source.begin(), source.end()) {} - 
-GraphTraits::NodesBFSIterator::NodesBFSIterator( - GraphTraits::NodesBFSIterator &&other) noexcept - : queue_(std::move(other.queue_)), - visited_(std::move(other.visited_)) {} - -GraphTraits::NodesBFSIterator::NodesBFSIterator( - const GraphTraits::NodesBFSIterator &other) - : queue_(other.queue_), visited_(other.visited_) {} - -Node &GraphTraits::NodesBFSIterator::operator*() { - PADDLE_ENFORCE(!queue_.empty()); - return *queue_.front(); -} - -Node *GraphTraits::NodesBFSIterator::operator->() { - PADDLE_ENFORCE(!queue_.empty()); - return queue_.front(); -} - -GraphTraits::NodesBFSIterator & -GraphTraits::NodesBFSIterator::operator=( - const GraphTraits::NodesBFSIterator &other) { - queue_ = other.queue_; - visited_ = other.visited_; - return *this; -} - -GraphTraits::NodesBFSIterator - &GraphTraits::NodesBFSIterator::operator++() { - PADDLE_ENFORCE(!queue_.empty()); - auto *cur = queue_.front(); - visited_.insert(cur); - queue_.pop_front(); - for (auto *output : cur->outlinks) { - if (!visited_.count(output)) { - queue_.push_back(output); - visited_.insert(output); - } - } - return *this; -} - -bool GraphTraits::NodesBFSIterator::operator==( - const GraphTraits::NodesBFSIterator &other) { - if (queue_.empty()) return other.queue_.empty(); - if ((!queue_.empty()) && (!other.queue_.empty())) { - return queue_.front() == other.queue_.front() && - visited_.size() == other.visited_.size(); - // equality of queue and - // visited. Just a light but week implementation. 
- } - return false; -} - -// -// NodesDFSIterator -// -GraphTraits::NodesDFSIterator::NodesDFSIterator( - const std::vector &source) { - for (auto *x : source) stack_.push(x); -} - -GraphTraits::NodesDFSIterator::NodesDFSIterator( - GraphTraits::NodesDFSIterator &&other) noexcept - : stack_(std::move(other.stack_)), - visited_(std::move(other.visited_)) {} - -GraphTraits::NodesDFSIterator::NodesDFSIterator( - const GraphTraits::NodesDFSIterator &other) - : stack_(other.stack_), visited_(other.visited_) {} - -Node &GraphTraits::NodesDFSIterator::operator*() { - PADDLE_ENFORCE(!stack_.empty()); - return *stack_.top(); -} - -GraphTraits::NodesDFSIterator - &GraphTraits::NodesDFSIterator::operator++() { - if (stack_.empty()) return *this; - visited_.insert(stack_.top()); - auto *cur = stack_.top(); - stack_.pop(); - for (auto *x : cur->outlinks) { - if (!visited_.count(x)) { - stack_.push(x); - visited_.insert(x); - } - } - return *this; -} -bool GraphTraits::NodesDFSIterator::operator==( - const GraphTraits::NodesDFSIterator &other) { - if (stack_.empty()) return other.stack_.empty(); - if ((!stack_.empty()) && (!other.stack_.empty())) { - return stack_.top() == other.stack_.top(); - } - return false; -} - -GraphTraits::NodesDFSIterator & -GraphTraits::NodesDFSIterator::operator=( - const GraphTraits::NodesDFSIterator &other) { - stack_ = other.stack_; - visited_ = other.visited_; - return *this; -} -Node *GraphTraits::NodesDFSIterator::operator->() { - return stack_.top(); -} - -inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { - return node.inlinks.size() == n; -} - -GraphTraits::NodesTSIterator::NodesTSIterator( - const std::vector &source) { - PADDLE_ENFORCE(!source.empty(), - "Start points of topological sorting should not be empty!"); - // CHECK all the inputs' in-degree is 0 - for (auto *node : source) { - PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); - } - - std::unordered_set visited; - std::unordered_set to_visit{source.begin(), 
source.end()}; - - std::vector inlink_visited; - while (!to_visit.empty()) { - std::vector queue(to_visit.begin(), to_visit.end()); - for (auto *p : queue) { - if (p->deleted()) { - visited.insert(p); - to_visit.erase(p); - continue; - } - inlink_visited.clear(); - - std::copy_if(p->inlinks.begin(), p->inlinks.end(), - std::back_inserter(inlink_visited), - [&](Node *x) { return visited.count(x); }); - - if (inlink_visited.size() == p->inlinks.size()) { - sorted_.push_back(p); - for (auto *_ : p->outlinks) { - if (!visited.count(_)) { - to_visit.insert(_); - } - } - - to_visit.erase(p); - visited.insert(p); - } - } - } -} - -GraphTraits::NodesTSIterator::NodesTSIterator( - const paddle::inference::analysis::GraphTraits< - DataFlowGraph>::NodesTSIterator &other) - : sorted_(other.sorted_), cursor_(other.cursor_) {} - -Node &GraphTraits::NodesTSIterator::operator*() { - PADDLE_ENFORCE_LT(cursor_, sorted_.size()); - return *sorted_[cursor_]; -} - -paddle::inference::analysis::GraphTraits::NodesTSIterator - &GraphTraits::NodesTSIterator::operator++() { - if (++cursor_ >= sorted_.size()) { - sorted_.clear(); - cursor_ = 0; - } - return *this; -} -paddle::inference::analysis::GraphTraits::NodesTSIterator & -GraphTraits::NodesTSIterator::operator=( - const paddle::inference::analysis::GraphTraits< - DataFlowGraph>::NodesTSIterator &other) { - cursor_ = other.cursor_; - sorted_ = other.sorted_; - return *this; -} - -bool GraphTraits::NodesTSIterator::operator==( - const paddle::inference::analysis::GraphTraits< - DataFlowGraph>::NodesTSIterator &other) { - return sorted_ == other.sorted_ && cursor_ == other.cursor_; -} - -Node *GraphTraits::NodesTSIterator::operator->() { - PADDLE_ENFORCE_LT(cursor_, sorted_.size()); - return sorted_[cursor_]; -} - -std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT - std::unordered_set nodes(graph.begin(), graph.end()); - std::unordered_set inputs; - std::unordered_set outputs; - // Input a Value, 
check whether its inlink is in the subgraph. - auto inlink_in_subgraph = [&](Node *n) { - for (auto *in : n->inlinks) { - if (nodes.count(in)) return true; - } - return false; - }; - - for (auto &node : graph) { - for (auto *in : node->inlinks) { - // The Value that is written by nodes inside a sub-graph shouldn't be the - // input of the sub-graph. - if (!nodes.count(in) && in->type() == Node::Type::kValue && - !inlink_in_subgraph(in)) { - inputs.insert(in); - } - } - for (auto *out : node->outlinks) { - if (!nodes.count(out) && out->type() == Node::Type::kValue) { - outputs.insert(out); - } - } - } - return std::make_pair(std::vector(inputs.begin(), inputs.end()), - std::vector(outputs.begin(), outputs.end())); -} - -// Filter the Intermediate results of the subgraph node. -void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { - std::vector op_nodes; - for (auto &node : GraphTraits(*graph).nodes_in_TS()) { - if (node.type() == Node::Type::kValue || node.deleted()) { - continue; - } - op_nodes.push_back(&node); - } - size_t op_num = op_nodes.size(); - for (size_t i = 0; i < op_num; i++) { - if (op_nodes[i]->type() == Node::Type::kFunction) continue; - std::unordered_set follow_up_input_names; - for (size_t j = i + 1; j < op_num; j++) { - for (auto *in : op_nodes[j]->inlinks) { - follow_up_input_names.insert(in->name()); - } - } - std::vector filtered_subgraph_outlinks; - for (auto *out : op_nodes[i]->outlinks) { - if (follow_up_input_names.count(out->name())) { - filtered_subgraph_outlinks.push_back(out); - } else { - out->SetDeleted(); - } - } - // The filtered_subgraph_outlinks may be empty. 
- op_nodes[i]->outlinks = filtered_subgraph_outlinks; - } -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h deleted file mode 100644 index 437e097acd24aad384df6712ce0de6106b3b5c65..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ /dev/null @@ -1,209 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * Data flow graph is an pass that build the basic graph. It contains a graph - * and the iterators that enable the iteration over the graph. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/inference/analysis/graph_traits.h" -#include "paddle/fluid/inference/analysis/node.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * DataFlowGraph - A container of Value and Function Nodes. - * - * This is the base graph for any other type of graphs, such as SSA or CFG. - */ -struct DataFlowGraph { - NodeMap nodes; - // inputs and outputs are deduced from the graph. - // Used to interact with IR. - const framework::ir::Graph *ir_graph{nullptr}; - - // Extract inputs and outputs of the graph. 
- void Build(); - - void Build(const framework::proto::ProgramDesc &prog); - - // Build a graph from ir::Graph. - void Build(const framework::ir::Graph &graph); - - // Get an attribute. - AnyAttr &Attr(const std::string &key) { return attrs_[key]; } - - // Output a DOT graph file for debug. - std::string DotString() const; - - std::string HumanReadableInfo(bool show_values = true, - bool show_functions = true) const; - - const std::vector &inputs() const { - PADDLE_ENFORCE(!inputs_.empty(), - "No inputs are deduced, need to Build() first."); - return inputs_; - } - const std::vector &outputs() const { - PADDLE_ENFORCE(!outputs_.empty(), - "No outputs are deduced, need to Build() first."); - return outputs_; - } - - private: - mutable std::vector inputs_; - mutable std::vector outputs_; - std::unordered_map attrs_; - - // Remove duplicate edges and so on. - void Clean(); -}; - -/* - * An graph trait help to traverse the graph using BFS. - * The BFS start from a graph's inputs, the graph should be fully-connected, so - * that the iterator can reach the end. - */ -template <> -struct GraphTraits { - // BFS iterator on nodes. - struct NodesBFSIterator - : public std::iterator { - NodesBFSIterator() = default; - explicit NodesBFSIterator(const std::vector &source); - NodesBFSIterator(NodesBFSIterator &&other) noexcept; - // NOTE Heavy to use. - NodesBFSIterator(const NodesBFSIterator &other); - - Node &operator*(); - NodesBFSIterator &operator++(); - Node *operator->(); - // TODO(Superjomn) current implementation just compare the first - // element, need to compare the graph and all the elements in the queue and - // set. - NodesBFSIterator &operator=(const NodesBFSIterator &other); - bool operator==(const NodesBFSIterator &other); - bool operator!=(const NodesBFSIterator &other) { return !(*this == other); } - - private: - std::deque queue_; - std::unordered_set visited_; - }; - - // DFS iterator on nodes. 
- struct NodesDFSIterator - : public std::iterator { - NodesDFSIterator() = default; - NodesDFSIterator(const std::vector &source); - NodesDFSIterator(NodesDFSIterator &&other) noexcept; - NodesDFSIterator(const NodesDFSIterator &other); - - Node &operator*(); - NodesDFSIterator &operator++(); - // TODO(Superjomn) current implementation just compare the first - // element, need to compare the graph and all the elements in the queue and - // set. - NodesDFSIterator &operator=(const NodesDFSIterator &other); - bool operator==(const NodesDFSIterator &other); - bool operator!=(const NodesDFSIterator &other) { return !(*this == other); } - Node *operator->(); - - private: - std::stack stack_; - std::unordered_set visited_; - }; - - // Topological sorting iterator on nodes. - struct NodesTSIterator - : public std::iterator { - NodesTSIterator() = default; - NodesTSIterator(const std::vector &source); - NodesTSIterator(NodesTSIterator &&other) - : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { - other.cursor_ = 0; - } - NodesTSIterator(const NodesTSIterator &other); - - Node &operator*(); - NodesTSIterator &operator++(); - // TODO(Superjomn) current implementation just compare the first - // element, need to compare the graph and all the elements in the queue and - // set. - NodesTSIterator &operator=(const NodesTSIterator &other); - bool operator==(const NodesTSIterator &other); - bool operator!=(const NodesTSIterator &other) { return !(*this == other); } - Node *operator->(); - - private: - std::vector sorted_; - size_t cursor_{0}; - }; - - explicit GraphTraits(const DataFlowGraph &graph) : graph_(graph) {} - - // default use BFS to visit the nodes. 
- iterator_range nodes() { - return iterator_range(nodes_bfs_begin(), nodes_bfs_end()); - } - iterator_range nodes_in_BFS() { - return iterator_range(nodes_bfs_begin(), nodes_bfs_end()); - } - iterator_range nodes_in_DFS() { - return iterator_range(nodes_dfs_begin(), nodes_dfs_end()); - } - iterator_range nodes_in_TS() { - return iterator_range(nodes_ts_begin(), nodes_ts_end()); - } - - private: - NodesBFSIterator nodes_bfs_begin() { - return NodesBFSIterator(graph_.inputs()); - } - NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); } - - NodesDFSIterator nodes_dfs_begin() { - return NodesDFSIterator(graph_.inputs()); - } - NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); } - - NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_.inputs()); } - NodesTSIterator nodes_ts_end() { return NodesTSIterator(); } - - private: - const DataFlowGraph &graph_; -}; - -// Extract the inputs and outputs of a graph. The inputs and outputs of a -// sub-graph is the inputs nodes and output nodes that doesn't inside the -// sub-graph. -std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph); // NOLINT - -void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph); -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc deleted file mode 100644 index 50ce20621fb289023ecccf7bb39d98169765d5ee..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/analysis/data_flow_graph.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -TEST(DataFlowGraph, BFS) { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - auto dfg = ProgramDescToDFG(desc); - dfg.Build(); - - for (auto* in : dfg.inputs()) { - LOG(INFO) << "inputs: " << in->name() << " " - << static_cast(in->type()); - } - for (auto* out : dfg.outputs()) { - LOG(INFO) << "outputs: " << out->name() << " " - << static_cast(out->type()); - } - - size_t count = 0; - for (auto& node : GraphTraits(dfg).nodes()) { - LOG(INFO) << "visiting " << node.name(); - ++count; - } - ASSERT_EQ(count, dfg.nodes.size()); -} - -TEST(DataFlowGraph, DFS) { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - DataFlowGraph dfg; - dfg.Build(desc); - size_t count = 0; - for (auto& node : GraphTraits(dfg).nodes_in_DFS()) { - LOG(INFO) << "visiting " << node.name(); - ++count; - } - ASSERT_EQ(count, dfg.nodes.size()); -} - -// Topological sorting. 
-/* - * Graph topology - * inputs: 0, 1, 2 - * 0 -> 4 - * 0 -> 5 - * 1 -> 6 - * 2 -> 7 - * 4 -> 5 - * 4 -> 7 - * 4 -> 3 - * 7 -> 3 - */ -TEST(DataFlowGraph, TS) { - DataFlowGraph graph; - - for (int i = 0; i < 8; i++) { - auto* node = graph.nodes.Create(Node::Type::kValue); - node->SetName("node-" + std::to_string(i)); - } - - auto add_link = [&](int i, int j) { - Node* source = graph.nodes.GetMutable(i); - Node* target = graph.nodes.GetMutable(j); - target->inlinks.push_back(source); - source->outlinks.push_back(target); - }; - - add_link(0, 4); - add_link(0, 5); - add_link(1, 6); - add_link(2, 7); - add_link(4, 5); - add_link(4, 7); - add_link(4, 3); - add_link(7, 3); - graph.Build(); - - auto its = GraphTraits(graph).nodes_in_TS(); - std::vector sorted_ids; - for (auto it = its.begin(); it != its.end(); ++it) { - LOG(INFO) << it->name(); - sorted_ids.push_back(it->id()); - } - - // Assert a occurs prior to b in the sorted_ids. - auto assert_positive_sequence_pair = [&](int a, int b) { - auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a); - auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b); - ASSERT_LT(a_offset, b_offset); - }; - - assert_positive_sequence_pair(2, 7); - assert_positive_sequence_pair(7, 3); - assert_positive_sequence_pair(4, 3); - assert_positive_sequence_pair(0, 4); - assert_positive_sequence_pair(0, 5); - assert_positive_sequence_pair(1, 6); - assert_positive_sequence_pair(4, 5); - assert_positive_sequence_pair(4, 7); -} - -TEST(DataFlowGraph, Build_ProgramDesc) { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - DataFlowGraph graph; - graph.Build(desc); - ASSERT_EQ(graph.nodes.size(), 38UL); -} - -void SetOp(framework::ProgramDesc* prog, const std::string& type, - const std::vector& inputs, - const std::vector& outputs) { - auto* op = prog->MutableBlock(0)->AppendOp(); - op->SetType(type); - op->SetInput("Xs", inputs); - op->SetOutput("Xs", outputs); - 
op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(), - static_cast(framework::OpRole::kForward)); -} - -TEST(DataFlowGraph, Build_IR_Graph) { - framework::ProgramDesc prog; - for (auto& v : std::vector({"a", "b", "c", "d", "e", "f"})) { - auto* var = prog.MutableBlock(0)->Var(v); - var->SetType(framework::proto::VarType::SELECTED_ROWS); - if (v == "c") { - var->SetPersistable(true); - } - } - - SetOp(&prog, "OP0", std::vector({"a"}), - std::vector({"b"})); - SetOp(&prog, "OP1", std::vector({"a"}), - std::vector({"c"})); - SetOp(&prog, "mul", std::vector({"b", "c"}), - std::vector({"d"})); - SetOp(&prog, "elementwise_add", std::vector({"d", "e"}), - std::vector({"f"})); - - DataFlowGraph graph; - - framework::ir::Graph ir_graph(prog); - - graph.Build(ir_graph); - - ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size()); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc deleted file mode 100644 index dbe138514b20a4be20e7cca800f8e12b230e7824..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ /dev/null @@ -1,285 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" -#include -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/proto_desc.h" -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" -#include "paddle/fluid/inference/io.h" - -namespace paddle { -namespace inference { - -namespace analysis { - -using framework::proto::ProgramDesc; - -std::vector ExtractParameters( - const std::vector> &nodes); - -bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { - ANALYSIS_ARGUMENT_CHECK_FIELD(argument) - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc) - // The transformed_program_desc should inherit all the VarDesc and BlockDesc - // from the original program desc. The operators of the main block(the first - // block) should rewritten by data flow graph. - argument->transformed_program_desc.reset( - new ProgramDesc(*argument->origin_program_desc)); - argument->transformed_program_desc->mutable_blocks(framework::kRootBlockIndex) - ->clear_ops(); - desc_ = argument->transformed_program_desc.get(); - argument_ = argument; - return true; -} - -bool DataFlowGraphToFluidPass::Finalize() { return true; } - -void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { - // FilterRedundantOutputOfSubGraph(graph); - for (auto &node : GraphTraits(*graph).nodes_in_TS()) { - if (node.deleted()) continue; - - switch (node.type()) { - case Node::Type::kFunction: { - AddFluidOp(&node); - } break; - case Node::Type::kFunctionBlock: { - AddEngineOp(&node); - } break; - default: - continue; - } - } - - if (argument_->Has(framework::ir::kParamScopeAttr)) { - LOG(WARNING) << "parameter changes in the scope takes effect"; - } - - PADDLE_ENFORCE(argument_->transformed_program_desc.get()); -} - -void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { - 
PADDLE_ENFORCE(node); - PADDLE_ENFORCE(node->IsFunction()); - PADDLE_ENFORCE(node->pb_desc() || !node->pb_msg().empty(), - "node has invalid protobuf repr."); - - // currently only the main block is analyzed. - PADDLE_ENFORCE(desc_); - auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - auto *op = main_block->add_ops(); - - if (node->pb_desc()) { - auto *ori_op = static_cast(node->pb_desc()); - *op = - *ori_op; // copy the attributes, by default, these will not be changed - // by analysis phrase. - // The inputs and outputs of the existing ops are not changed by tensorrt - // subgraph pass. - // NOTE It might be changed by other passes in the long run. - } else { - op->ParseFromString(node->pb_msg()); - } -} - -void CreateTrtEngineOp(Node *node, Argument *argument, - framework::proto::BlockDesc *block) { - PADDLE_ENFORCE(argument->main_dfg.get()); - const DataFlowGraph &graph = *(argument->main_dfg); - static int counter{0}; - PADDLE_ENFORCE(node->IsFunctionBlock()); - framework::OpDesc desc; - auto *func = static_cast(node); - - // collect inputs - std::unordered_set input_names; - std::unordered_set input_names_with_id; - for (auto *x : func->inlinks) { - input_names.insert(x->name()); - input_names_with_id.insert(x->name() + std::to_string(x->id())); - } - desc.SetInput( - "Xs", std::vector(input_names.begin(), input_names.end())); - - std::unordered_set output_names; - std::unordered_set output_names_with_id; - for (auto *x : func->outlinks) { - output_names.insert(x->name()); - output_names_with_id.insert(x->name() + std::to_string(x->id())); - } - - desc.SetOutput( - "Ys", std::vector(output_names.begin(), output_names.end())); - desc.SetType("tensorrt_engine"); - - std::unordered_map output_name_map; - - // The following procedure is used to rename all the intermediate - // variables and the output variables of the subgraph. - // Why we do this? 
- // During the transition from fluid OP to tensorrt OP, we map - // the input and output Tensor(fluid data structure) of fluid OP - // to the correspondin ITensor (trt data structure) through the - // Tensor name. When we set up ITensor for an variable, we must - // ensure that it has not been set before. - // If there is variable in the fluid graph, which is not only the - // input of a OP, but also the output of a Op, there will be problems. - // So we have to rename the variable in the subgraph to make sure - // it is either an OP's input or an OP's output. - - auto subgraph_nodes = func->subgraph; - for (int index = 0; index < block->ops_size(); index++) { - framework::proto::OpDesc *op = block->mutable_ops(index); - auto correspond_node = subgraph_nodes[index]; - PADDLE_ENFORCE_EQ(correspond_node->name(), op->type()); - - std::unordered_map var2id; - for (auto *in_var : correspond_node->inlinks) { - var2id[in_var->name()] = in_var->id(); - } - // rename for the input variables of op inside subgraph - for (int i = 0; i < op->inputs_size(); i++) { - framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i); - std::vector replaced_names; - for (int k = 0; k < in_var->arguments_size(); k++) { - std::string arg_value = in_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (input_names_with_id.count(arg_value_with_id)) { - replaced_names.push_back(arg_value); - } else { - replaced_names.push_back(arg_value_with_id); - } - } - in_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - in_var->add_arguments(replaced_names[k]); - } - } - var2id.clear(); - for (auto out_var : correspond_node->outlinks) { - var2id[out_var->name()] = out_var->id(); - } - - // rename for the output variables of op inside subgraph - for (int i = 0; i < op->outputs_size(); i++) { - framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); - std::vector replaced_names; - for (int k = 0; k < 
out_var->arguments_size(); k++) { - std::string arg_value = out_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (output_names_with_id.count(arg_value_with_id)) { - output_name_map[arg_value] = arg_value_with_id; - } - replaced_names.push_back(arg_value_with_id); - } - out_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - out_var->add_arguments(replaced_names[k]); - } - } - } - // When tensorrt engine runs at the end of the operation, - // output_mapping help us copy the data from the renamed ITensor - // to Tensor. - std::vector output_mapping; - for (auto name : output_names) { - PADDLE_ENFORCE(output_name_map.count(name) != 0); - output_mapping.push_back(output_name_map[name]); - } - - PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc"); - // Set attrs - - SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); - SetAttr(desc.Proto(), "max_batch_size", argument->Get("max_batch_size")); - SetAttr(desc.Proto(), "workspace_size", argument->Get("workspace_size")); - SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); - SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); - SetAttr(desc.Proto(), "output_name_mapping", output_mapping); - node->SetPbMsg(desc.Proto()->SerializeAsString()); -} - -std::vector ExtractParameters( - const std::vector> &nodes) { - std::vector parameters; - for (const auto &node : nodes) { - if (!node->IsValue()) continue; - PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first"); - framework::proto::VarDesc var; - var.ParseFromString(node->pb_msg()); - if (var.persistable()) { - parameters.push_back(var.name()); - } - } - return parameters; -} - -void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { - // TODO(Superjomn) Here need to expose some arguments for default setting. 
- PADDLE_ENFORCE(node->IsFunctionBlock()); - auto *block_node = static_cast(node); - framework::proto::BlockDesc proto; - framework::BlockDesc block_desc(nullptr, &proto); - block_desc.Proto()->set_parent_idx(-1); - block_desc.Proto()->set_idx(0); - VLOG(40) << "origin variable size: " - << argument_->origin_program_desc->blocks(0).vars().size(); - VLOG(40) << "transformed variable size: " - << block_desc.Proto()->vars().size(); - // copy ops. - - for (auto *node : block_node->subgraph) { - auto *op = block_desc.AppendOp(); - PADDLE_ENFORCE(!node->pb_msg().empty()); - op->Proto()->ParseFromString(node->pb_msg()); - } - - *block_desc.Proto()->mutable_vars() = - argument_->origin_program_desc->blocks(0).vars(); - PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); - CreateTrtEngineOp(node, argument_, block_desc.Proto()); - auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - auto *op = main_block->add_ops(); - PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); - op->ParseFromString(node->pb_msg()); -} - -namespace { -class DFG_DebuggerPass : public DFG_GraphvizDrawPass { - public: - using Config = DFG_GraphvizDrawPass::Config; - explicit DFG_DebuggerPass(const Config &config) - : DFG_GraphvizDrawPass(config) {} - - std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } - - bool Finalize() override { return true; } -}; -} // namespace - -AnalysisPass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { - return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( - FLAGS_IA_graphviz_log_root, - "data_flow_graph_to_fluid_graphviz_debugger")); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h deleted file mode 100644 index 891c7226e245fa3b92892785362c186185a61f62..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -/* - * This file implements the transformation from fluid ProgramDesc to data flow - * graph. - */ - -#pragma once - -#include -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/data_flow_graph.h" - -namespace paddle { -namespace inference { - -namespace analysis { -class DataFlowGraphToFluidPass final : public DataFlowGraphPass { - public: - DataFlowGraphToFluidPass() = default; - - bool Initialize(Argument *argument) override; - bool Finalize() override; - - void Run(DataFlowGraph *graph) override; - - std::string repr() const override { return "DFG to fluid"; } - std::string description() const override { - return "Transform a DFG to a Fluid ProgramDesc"; - } - - AnalysisPass *CreateGraphvizDebugerPass() const override; - - protected: - // Add a Fluid Op into the ProgramDesc. - void AddFluidOp(Node *node); - // Add a EngineOp into the ProgramDesc. 
- void AddEngineOp(Node *node); - - private: - framework::proto::ProgramDesc *desc_; - Argument *argument_; -}; -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc deleted file mode 100644 index 4ef381db295b986b91173a728b6d98640f6f4f51..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" - -#include -#include -#include -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/io.h" - -namespace paddle { -namespace inference { -namespace analysis { - -TEST(DataFlowGraph, Test) { - Argument argument(FLAGS_inference_model_dir); - - FluidToDataFlowGraphPass pass0; - DataFlowGraphToFluidPass pass1; - ASSERT_TRUE(pass0.Initialize(&argument)); - ASSERT_TRUE(pass1.Initialize(&argument)); - - pass0.Run(argument.main_dfg.get()); - pass1.Run(argument.main_dfg.get()); - - pass0.Finalize(); - pass1.Finalize(); - - LOG(INFO) << argument.main_dfg->nodes.size(); -} - -}; // namespace analysis -}; // namespace inference -}; // namespace paddle diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc deleted file mode 100644 index 8888529a57a29c4349095c2ff4c527346716e026..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" - -namespace paddle { -namespace inference { -namespace analysis { - -int DFG_GraphvizDrawPass::counter_{0}; - -void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) { - auto content = Draw(graph); - auto dot_path = GenDotPath(); - std::ofstream file(dot_path); - file.write(content.c_str(), content.size()); - file.close(); - - auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png"; - std::string message; - VLOG(30) << "draw to " << png_path; - ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message); -} - -std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { - Dot dot; - // Add nodes - for (size_t i = 0; i < graph->nodes.size(); i++) { - const Node &node = graph->nodes.Get(i); - if (config_.display_deleted_node || !node.deleted()) { - dot.AddNode(node.repr(), node.dot_attrs()); - } - } - // Add edges - for (size_t i = 0; i < graph->nodes.size(); i++) { - const Node &node = graph->nodes.Get(i); - if (!config_.display_deleted_node && node.deleted()) continue; - for (auto &out : node.outlinks) { - if (!config_.display_deleted_node && out->deleted()) continue; - dot.AddEdge(node.repr(), out->repr(), {}); - } - } - return dot.Build(); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h deleted file mode 100644 index e537bfc0e64d4ff46b3d61499a1a0298ed83533f..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file create an DFG_GraphvizDrawPass which helps to draw a data flow - * graph's structure using graphviz. - */ - -#pragma once - -#include -#include -#include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/dot.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * Output a dot file and write to some place. - */ -class DFG_GraphvizDrawPass : public DataFlowGraphPass { - public: - struct Config { - Config(const std::string &dir, const std::string &id, - bool display_deleted_node = false) - : dir(dir), id(id), display_deleted_node(display_deleted_node) {} - - // The directory to store the .dot or .png files. - const std::string dir; - // The identifier for this dot file. - const std::string id; - // Whether to display deleted nodes, default false. - const bool display_deleted_node; - }; - - explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {} - - bool Initialize(Argument *argument) override { return true; } - void Run(DataFlowGraph *graph) override; - bool Finalize() override { return true; } - - std::string repr() const override { return "DFG graphviz drawer"; } - std::string description() const override { - return "Debug a DFG by draw with graphviz"; - } - - protected: - // A counter to add a number prefix to the debugger image output so that they - // will sort in the triggered order. - static int counter_; - - // Path of the dot file to output. 
- std::string GenDotPath() const { - return config_.dir + "/" + std::to_string(counter_++) + "-graph_" + - config_.id + ".dot"; - } - - virtual std::string Draw(DataFlowGraph *graph); - - Config config_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc deleted file mode 100644 index 928be7917047382d9b86294f6039b26b0ebf6f49..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" - -#include -#include -#include -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) { - Argument argument(FLAGS_inference_model_dir); - FluidToDataFlowGraphPass pass0; - ASSERT_TRUE(pass0.Initialize(&argument)); - pass0.Run(argument.main_dfg.get()); - - // auto dfg = ProgramDescToDFG(*argument.origin_program_desc); - - DFG_GraphvizDrawPass::Config config("./", "test"); - DFG_GraphvizDrawPass pass(config); - pass.Initialize(&argument); - pass.Run(argument.main_dfg.get()); - - // test content - std::ifstream file("./0-graph_test.dot"); - ASSERT_TRUE(file.is_open()); - - std::string line; - int no{0}; - while (std::getline(file, line)) { - no++; - } - // DFG is sensitive to ProgramDesc, be careful to change the existing models. - ASSERT_EQ(no, 83); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dot_tester.cc b/paddle/fluid/inference/analysis/dot_tester.cc index 56ceb9bd5d6f41a601d66f6124fb7b4099c9337e..c785a312bf96c3586ea990fd9028cfd3b930d577 100644 --- a/paddle/fluid/inference/analysis/dot_tester.cc +++ b/paddle/fluid/inference/analysis/dot_tester.cc @@ -16,7 +16,6 @@ #include #include -#include "paddle/fluid/inference/analysis/data_flow_graph.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc deleted file mode 100644 index 2b7d632c839e735ca03c6e17b94307b40cc13374..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" - -namespace paddle { -namespace inference { -namespace analysis { - -bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { - ANALYSIS_ARGUMENT_CHECK_FIELD(argument); - if (argument->origin_program_desc) { - LOG(WARNING) << "argument's origin_program_desc is already set, might " - "duplicate called"; - } - if (!argument->fluid_model_program_path) { - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); - argument->fluid_model_program_path.reset( - new std::string(*argument->fluid_model_dir + "/__model__")); - } - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); - auto program = LoadProgramDesc(*argument->fluid_model_program_path); - argument->origin_program_desc.reset( - new framework::proto::ProgramDesc(program)); - - if (!argument->main_dfg) { - argument->main_dfg.reset(new DataFlowGraph); - } - desc_ = argument->origin_program_desc.get(); - return true; -} - -bool FluidToDataFlowGraphPass::Finalize() { return true; } - -void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { - PADDLE_ENFORCE(graph); - PADDLE_ENFORCE(desc_); - graph->Build(*desc_); -} - -namespace { -class DFG_DebuggerPass : public DFG_GraphvizDrawPass { - public: - using Config = 
DFG_GraphvizDrawPass::Config; - explicit DFG_DebuggerPass(const Config &config) - : DFG_GraphvizDrawPass(config) {} - std::string repr() const override { return "fluid-to-dfg-debuger-pass"; } - bool Finalize() override { return true; } -}; -} - -AnalysisPass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const { - return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( - FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger")); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h deleted file mode 100644 index b9e262020e9522e167b998d57e2be2ac19b48447..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -/* - * This file implements the transformation from data flow graph to fluid - * ProgramDesc. - */ - -#pragma once - -#include - -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/data_flow_graph.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * Transform a FluidDesc to a SSA. 
- */ -class FluidToDataFlowGraphPass final : public DataFlowGraphPass { - public: - FluidToDataFlowGraphPass() = default; - - bool Initialize(Argument *argument) override; - bool Finalize() override; - - void Run(DataFlowGraph *graph) override; - - std::string repr() const override { return "fluid-to-data-flow-graph"; } - std::string description() const override { - return "transform a fluid ProgramDesc to a data flow graph."; - } - - AnalysisPass *CreateGraphvizDebugerPass() const override; - - private: - framework::proto::ProgramDesc const *desc_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc deleted file mode 100644 index 267a0a84ebf75615e0b390f4a1b3bf3b51793fc7..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" - -#include -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -TEST(FluidToDataFlowGraphPass, Test) { - FluidToDataFlowGraphPass pass; - Argument argument(FLAGS_inference_model_dir); - pass.Initialize(&argument); - pass.Run(argument.main_dfg.get()); - // Analysis is sensitive to ProgramDesc, careful to change the original model. - ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL); - pass.Finalize(); - ASSERT_FALSE(argument.main_dfg->DotString().empty()); - EXPECT_FALSE(argument.main_dfg->inputs().empty()); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc deleted file mode 100644 index 9f52af670b8e26c31230bc003af4c9ddc5b67802..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/io.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace inference { -namespace analysis { - -void FluidToIrPass::EnableParamModify(const std::string &model_dir, - const std::string &prog_file, - const std::string ¶m_file) { - PADDLE_ENFORCE(argument_); - argument_->Set(framework::ir::kParamScopeAttr, new framework::Scope); - // Load parameters. - VLOG(30) << "Loading parameters from " << model_dir; - LoadParams(&argument_->Get(framework::ir::kParamScopeAttr), - model_dir, prog_file, param_file); -} - -bool FluidToIrPass::LoadParams(framework::Scope *scope, const std::string &dir, - const std::string &prog_file, - const std::string ¶m_file) { - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); - framework::Executor executor(place); - PADDLE_ENFORCE(argument_->origin_program_desc.get()); - framework::ProgramDesc program(*argument_->origin_program_desc); - if ((!prog_file.empty()) && (!param_file.empty())) { - LOG(INFO) << "load single model file from " << prog_file; - Load(&executor, scope, prog_file, param_file); - } else if (!dir.empty()) { - LOG(INFO) << "load from dir " << dir; - Load(&executor, scope, dir); - } else { - LOG(ERROR) << "failed to load parameters"; - return false; - } - return true; -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h deleted file mode 100644 index c2599e218a2306f9353b843b7ea3f18aeacb008e..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/flags.h" -#include "paddle/fluid/inference/analysis/ir_pass_manager.h" - -namespace paddle { -namespace inference { -namespace analysis { - -static const char kFluidToIrPassesAttr[] = "__fluid_to_ir_passes__"; - -class FluidToIrPass final : public DataFlowGraphPass { - public: - FluidToIrPass() = default; - - bool Initialize(Argument *argument) override { - ANALYSIS_ARGUMENT_CHECK_FIELD(argument); - PADDLE_ENFORCE(argument->Has(kFluidToIrPassesAttr), - "argument need the attr %s", kFluidToIrPassesAttr); - argument_ = argument; - if (argument->origin_program_desc) { - LOG(WARNING) << "argument's origin_program_desc is already set, might " - "duplicate called"; - } - // set fluid model program path - if (!argument->fluid_model_program_path) { - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); - argument->fluid_model_program_path.reset( - new std::string(*argument->fluid_model_dir + "/__model__")); - } - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); - // Load program. - auto program = LoadProgramDesc(*argument->fluid_model_program_path); - argument->origin_program_desc.reset( - new framework::proto::ProgramDesc(program)); - // Create main data flow graph. 
- if (!argument->main_dfg) { - argument->main_dfg.reset(new DataFlowGraph); - } - argument->Set("ir_program_desc", new ProgramDesc(program)); - - LOG(INFO) << "Loading parameters"; - // Load parameters to argument if needed. - if (argument->fluid_model_dir || (argument->fluid_model_program_path && - argument->fluid_model_param_path)) { -#define SAFE_GET(ATTR) std::string ATTR = argument->ATTR ? *argument->ATTR : ""; - SAFE_GET(fluid_model_dir); - SAFE_GET(fluid_model_program_path); - SAFE_GET(fluid_model_param_path); -#undef SAFE_GET - EnableParamModify(fluid_model_dir, fluid_model_program_path, - fluid_model_param_path); - } - - return true; - } - - bool Finalize() override { return true; } - - void Run(DataFlowGraph *graph) override { - // Call all the IR Passes - IRPassManager ir_passes(argument_->Get("ir_program_desc"), - nullptr); - // Pass the scope from analysis to IR if needed. - if (argument_->Has(framework::ir::kParamScopeAttr)) { - // Here the address is passed, attention that IR doesn't own the scope, so - // the real scope in analysis should live during the IR phase. - ir_passes.graph().Set( - framework::ir::kParamScopeAttr, - new framework::Scope *(&argument_->Get( - framework::ir::kParamScopeAttr))); - } - - if (FLAGS_IA_enable_ir) { - const auto &ir_passes_to_apply = - argument_->Get>(kFluidToIrPassesAttr); - ir_passes.Apply(ir_passes_to_apply); - } - - PADDLE_ENFORCE(argument_->main_dfg.get()); - argument_->main_dfg->Build(ir_passes.graph()); - // inherit the arguments from ir. - if (ir_passes.graph().Has(framework::ir::kFuseStatisAttr)) { - argument_->Set( - framework::ir::kFuseStatisAttr, - new std::unordered_map( - ir_passes.graph().Get>( - framework::ir::kFuseStatisAttr))); - } - } - - void EnableParamModify(const std::string &model_dir, - const std::string &prog_file, - const std::string ¶m_file); - - std::string repr() const override { return "fluid-to-ir-pass"; } - - private: - // Load parameters from a single file or from a directory. 
- bool LoadParams(framework::Scope *scope, const std::string &dir, - const std::string &prog_file, const std::string ¶m_file); - - private: - Argument *argument_{nullptr}; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/graph_traits.cc b/paddle/fluid/inference/analysis/graph_traits.cc deleted file mode 100644 index 2ea70a1d2060e03769d67060dc6f008207342b52..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/graph_traits.cc +++ /dev/null @@ -1,15 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/analysis/graph_traits.h" diff --git a/paddle/fluid/inference/analysis/graph_traits.h b/paddle/fluid/inference/analysis/graph_traits.h deleted file mode 100644 index aed2b1e8e27d94b430201d70ecf09d4acc33c8fa..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/graph_traits.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file defines the GraphTraits template class that should be specified - * by classes that want to be iteratable by generic graph iterators. - * - * This file also defines the marker class Inverse that is used to iterate over - * graphs in a graph defined, inverse ordering... - */ - -#pragma once - -#include "paddle/fluid/inference/analysis/helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * This class should be specialized by different graph types... - * That's why the base class is empty. - */ -template -struct GraphTraits { - // using NodesBFSIterator = xxx - - // NodesBFSIterator nodes_begin(); - // NodesBFSIterator nodes_end(); -}; - -/* - * Inverse - This class is used as a marker class to tell the graph iterator to - * iterate in a graph defined Inverse order. - */ -template -struct Inverse { - const GraphType &graph; - - explicit Inverse(const GraphType &graph) : graph(graph) {} -}; - -/* - * Provide a partial specialization of GraphTraits so that the inverse of an - * inverse turns into the original graph. - */ -template -struct GraphTraits>> : GraphTraits {}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 5151e2b69ac199dea136535ba445e890596f6227..269a0da9f9378601373e42d741f519843b111ec6 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace inference { @@ -101,20 +102,20 @@ class OrderedRegistry { public: T *Register(const std::string &name, T *x) { PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name); - dic_[name] = data_.size(); - data_.emplace_back(std::unique_ptr(x)); - return data_.back().get(); + dic_[name] = elements_.size(); + elements_.emplace_back(std::unique_ptr(x)); + return elements_.back().get(); } T *Lookup(const std::string &name) { auto it = dic_.find(name); if (it == dic_.end()) return nullptr; - return data_[it->second].get(); + return elements_[it->second].get(); } protected: std::unordered_map dic_; - std::vector> data_; + std::vector> elements_; }; template @@ -124,20 +125,6 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) { return *var->GetMutable(); } -static void ExecShellCommand(const std::string &cmd, std::string *message) { - char buffer[128]; - std::shared_ptr pipe(popen(cmd.c_str(), "r"), pclose); - if (!pipe) { - LOG(ERROR) << "error running command: " << cmd; - return; - } - while (!feof(pipe.get())) { - if (fgets(buffer, 128, pipe.get()) != nullptr) { - *message += buffer; - } - } -} - static framework::proto::ProgramDesc LoadProgramDesc( const std::string &model_path) { std::ifstream fin(model_path, std::ios::in | std::ios::binary); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index e76708baf4b39afb0febbcf3ff71281dfbfc8627..fce5e1cac92064a320179243380ea02b2c5d7838 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -18,6 +18,8 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" +#include 
"paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { @@ -27,21 +29,33 @@ using string::PrettyLogEndl; using string::PrettyLog; using string::Style; -IRPassManager::IRPassManager(const ProgramDesc &program, - framework::Scope *scope) - : program_(program) { - graph_.reset(new framework::ir::Graph(program)); - if (scope) - graph_->Set(framework::ir::kParamScopeAttr, new framework::Scope *(scope)); +IRPassManager::IRPassManager(Argument *argument) { + ARGUMENT_CHECK_FIELD(argument, main_program); + graph_ = std::unique_ptr(new Graph(argument->main_program())); + if (argument->Has("scope")) { + graph_->Set(framework::ir::kParamScopeAttr, + new framework::Scope *( + const_cast(&argument->scope()))); + } + + ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); + CreatePasses(argument, argument->ir_analysis_passes()); } -void IRPassManager::Apply(const std::vector &passes) { - // Apply all the passes +void IRPassManager::CreatePasses(Argument *argument, + const std::vector &passes) { std::string pre_pass; int pass_num = 0; for (const std::string &pass_name : passes) { - PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name); auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); + + // Set some pass attributes. + if (pass_name == "ir_analysis_pass") { + pass->Set("tensorrt_node_teller", + new SubgraphDetector::NodeInsideSubgraphTeller( + argument->tensorrt_node_teller())); + } + if (pass_name == "graph_viz_pass") { std::string dot_file_path = std::to_string(pass_num) + "_ir_" + (pre_pass.empty() ? 
"origin" : pre_pass) + @@ -49,11 +63,47 @@ void IRPassManager::Apply(const std::vector &passes) { pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); pass_num++; } - graph_ = pass->Apply(std::move(graph_)); + + if (pass_name == "tensorrt_subgraph_pass") { + PADDLE_ENFORCE(argument->tensorrt_node_teller_valid()); + pass->SetNotOwned("tensorrt_node_teller", + argument->tensorrt_node_teller_ptr()); + pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); + pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); + } + + // graph_ = pass->Apply(std::move(graph_)); pre_pass = pass_name; + + passes_.emplace_back(std::move(pass)); } } +std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { + if (passes_.empty()) { + return graph; + } + PADDLE_ENFORCE(graph.get()); + // Apply all the passes + for (const auto &pass : passes_) { + PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); + graph = pass->Apply(std::move(graph)); + } + return std::move(graph); +} + +framework::proto::ProgramDesc IRPassManager::AcquireProgram( + std::unique_ptr *graph, const ProgramDesc &program) const { + auto pass = + framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); + + ProgramDesc desc(program); + pass->SetNotOwned("program", &desc); + auto *the_graph = graph->release(); + *graph = pass->Apply(std::unique_ptr(the_graph)); + return *desc.Proto(); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index bb230283b7c2cc783d0b68ea0aa3cca1cabc75e6..983a582649706fa6eedb5aa459b5ac53b98f658b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -20,27 +20,38 @@ * for inference. 
*/ +#pragma once + +#include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/argument.h" namespace paddle { namespace inference { namespace analysis { using framework::ProgramDesc; +using framework::ir::Graph; class IRPassManager final { public: - IRPassManager(const ProgramDesc &program, framework::Scope *scope); + explicit IRPassManager(Argument *argument); + + std::unique_ptr Apply(std::unique_ptr graph); - void Apply(const std::vector &passes); + framework::proto::ProgramDesc AcquireProgram( + std::unique_ptr *graph, const ProgramDesc &program) const; framework::ir::Graph &graph() const { return *graph_; } private: - std::unique_ptr graph_; - ProgramDesc program_; + void CreatePasses(Argument *argument, const std::vector &passes); + + std::unique_ptr graph_; + std::vector> passes_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c71cff889ed7cdb95f79b9bc89a9ca5ab370271c --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -0,0 +1,7 @@ +cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) +cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector) +set(analysis_deps ${analysis_deps} + subgraph_detector tensorrt_subgraph_pass + CACHE INTERNAL "") + +set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc similarity index 53% rename from paddle/fluid/inference/analysis/subgraph_splitter.cc rename to paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc index 
3688ea15d959309d33901c360cb1055e2ac489a5..b6a5dfd087c95d0ccb0f5adfa4f754cfa5a44f14 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc @@ -12,46 +12,110 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/inference/analysis/subgraph_splitter.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/node.h" namespace paddle { namespace inference { namespace analysis { -const char *SubGraphSplitter::kMarkerAttrName = - "_sub_graph_splitter_inside_sub_graph"; +using framework::ir::Node; + +std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT + std::unordered_set nodes(graph.begin(), graph.end()); + std::unordered_set inputs; + std::unordered_set outputs; + // Input a Value, check whether its inlink is in the subgraph. + auto inlink_in_subgraph = [&](Node *n) { + for (auto *in : n->inputs) { + if (nodes.count(in)) return true; + } + return false; + }; + + for (auto &node : graph) { + for (auto *in : node->inputs) { + // The Value that is written by nodes inside a sub-graph shouldn't be the + // input of the sub-graph. + if (!nodes.count(in) && in->IsVar() && !inlink_in_subgraph(in)) { + inputs.insert(in); + } + } + for (auto *out : node->outputs) { + if (!nodes.count(out) && out->IsVar()) { + outputs.insert(out); + } + } + } + return std::make_pair(std::vector(inputs.begin(), inputs.end()), + std::vector(outputs.begin(), outputs.end())); +} + +// Filter the Intermediate results of the subgraph node. 
+void FilterRedundantOutputOfSubGraph(Graph *graph) { + std::vector op_nodes; + for (auto &node : TopologicalSort(*graph)) { + if (node.IsVar() || Agent(&node).deleted()) { + continue; + } + op_nodes.push_back(&node); + } + size_t op_num = op_nodes.size(); + for (size_t i = 0; i < op_num; i++) { + if (op_nodes[i]->IsOp()) continue; + std::unordered_set follow_up_input_names; + for (size_t j = i + 1; j < op_num; j++) { + for (auto *in : op_nodes[j]->inputs) { + follow_up_input_names.insert(in->Name()); + } + } + std::vector filtered_subgraph_outlinks; + for (auto *out : op_nodes[i]->outputs) { + if (follow_up_input_names.count(out->Name())) { + filtered_subgraph_outlinks.push_back(out); + } else { + Agent(out).set_deleted(true); + } + } + // The filtered_subgraph_outlinks may be empty. + op_nodes[i]->outputs = filtered_subgraph_outlinks; + } +} -std::vector> SubGraphSplitter::operator()() { +std::vector> SubgraphDetector::operator()() { MarkNodesInsideSubGraph(); return ExtractSubGraphs(); } // Mark the output variables inside a subgraph with the func. 
-inline void MarkOutLinksInSubGraph(const Function *func) { - for (auto *var : func->outlinks) { - var->attr(SubGraphSplitter::kMarkerAttrName).Bool() = true; +inline void MarkOutLinksInSubGraph(const Node *func) { + for (auto *var : func->outputs) { + Agent(var).set_marked(true); } } -void SubGraphSplitter::MarkNodesInsideSubGraph() { - for (auto &node : GraphTraits(*graph_).nodes()) { +void SubgraphDetector::MarkNodesInsideSubGraph() { + for (auto &node : framework::ir::GraphTraits::DFS(*graph_)) { if (node_inside_subgraph_teller_(&node)) { - node.attr(kMarkerAttrName).Bool() = true; - if (node.type() == Node::Type::kFunction) { + Agent(&node).set_marked(true); + if (node.IsOp()) { // If a function is inside the sub-graph, mark all the output variables // to be inside too, so that two marked functions will be inside a same // sub-graph, lets take a example: A_function->var->B_function, if // A_function is marked, var should also be marked, so that B_function // will be in the same sub-graph with A_function if B_function is // marked. - MarkOutLinksInSubGraph(static_cast(&node)); + MarkOutLinksInSubGraph(&node); } } } } -const char *kUnionFindParent = "_sub_graph_splitter_union_find_parent_"; - // Use the Union Find(UF) algorithm to find fully connected sub-graphs, if node // a's output is node b, that is a and b is in the same sub-graph. The UF // algorithm will group them to the same cluster. @@ -60,8 +124,8 @@ using node_map_t = std::unordered_map; int UnionFindGetAncestor(const node_map_t &node_map, size_t id) { int tmp = id; do { - tmp = node_map.at(tmp)->attr(kUnionFindParent).Int32(); - } while (node_map.at(tmp)->attr(kUnionFindParent).Int32() != tmp); + tmp = Agent(node_map.at(tmp)).union_find_parent(); + } while (Agent(node_map.at(tmp)).union_find_parent() != tmp); return tmp; } // Make this two node share the same ancestor. 
@@ -69,9 +133,9 @@ int UnionFindGetAncestor(const node_map_t &node_map, size_t id) { void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { int a_ancestor = UnionFindGetAncestor(node_map, a); int b_ancestor = UnionFindGetAncestor(node_map, b); - node_map.at(b_ancestor)->attr(kUnionFindParent).Int32() = a_ancestor; - node_map.at(a)->attr(kUnionFindParent).Int32() = a_ancestor; - node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor; + Agent(node_map.at(b_ancestor)).set_union_find_parent(a_ancestor); + Agent(node_map.at(a)).set_union_find_parent(a_ancestor); + Agent(node_map.at(b)).set_union_find_parent(a_ancestor); } // This is a simple representation of a graph. @@ -195,16 +259,21 @@ void FlexibleDFS(const std::vector &source, bool reverse, } } -std::vector> SubGraphSplitter::ExtractSubGraphs() { +std::vector> SubgraphDetector::ExtractSubGraphs() { // Run the Extract algorithm to find all subgraphs. std::vector marked_nodes; // We use brief_node_map to represent the original graph in order to avoid // changing the original graph. 
std::unordered_map brief_node_map; - for (auto &node : GraphTraits(*graph_).nodes_in_TS()) { + std::unordered_set valid_node_ids; + for (auto *node : graph_->Nodes()) { + valid_node_ids.insert(node->id()); + } + + for (auto &node : framework::ir::GraphTraits::TS(*graph_)) { brief_node_map[node.id()] = new BriefNode(&node); - if (node.attr(kMarkerAttrName).Bool()) { + if (Agent(&node).marked()) { marked_nodes.push_back(&node); } } @@ -213,26 +282,34 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { node_map_t node_map; // id to ptr for (auto *n : marked_nodes) { // n's parent == n.id means it is the ancestor - n->attr(kUnionFindParent).Int32() = n->id(); + Agent(n).set_union_find_parent(n->id()); node_map[n->id()] = n; } // create breif node map for (auto &itr : brief_node_map) { - for (Node *node : itr.second->node->inlinks) { - itr.second->inlinks.push_back(brief_node_map[node->id()]); + for (Node *node : itr.second->node->inputs) { + if (!valid_node_ids.count(node->id())) { + LOG(INFO) << "invalid node id " << node->id(); + continue; + } + itr.second->inlinks.push_back(brief_node_map.at(node->id())); } - for (Node *node : itr.second->node->outlinks) { - itr.second->outlinks.push_back(brief_node_map[node->id()]); + for (Node *node : itr.second->node->outputs) { + if (!valid_node_ids.count(node->id())) { + LOG(INFO) << "invalid node id " << node->id(); + continue; + } + itr.second->outlinks.push_back(brief_node_map.at(node->id())); } } for (auto &itr : brief_node_map) { BriefNode *brief_node = itr.second; - if (!brief_node->node->attr(kMarkerAttrName).Bool()) { - VLOG(40) << brief_node->node->id() << " node not a trt candicate."; + if (!Agent(brief_node->node).marked()) { + VLOG(4) << brief_node->node->id() << " node not a trt candidate."; continue; } @@ -254,7 +331,7 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { std::unordered_set contract_nodes; for (auto *out : brief_node->outlinks) { // must be an trt candidate - if 
(!out->node->attr(kMarkerAttrName).Bool()) continue; + if (!Agent(out->node).marked()) continue; // get all dst input nodes except src. std::vector source_nodes; for (auto *n : out->inlinks) { @@ -289,9 +366,8 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { std::unordered_map> clusters; for (auto *n : marked_nodes) { - if (n->type() == Node::Type::kFunction) { - clusters[UnionFindGetAncestor(node_map, - n->attr(kUnionFindParent).Int32())] + if (n->IsOp()) { + clusters[UnionFindGetAncestor(node_map, Agent(n).union_find_parent())] .push_back(n); } } @@ -304,28 +380,59 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { return result; } -void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); } +void SubGraphFuser::operator()() { ReplaceNodesWithSubGraphs(); } + +void RemoveIntermediateOutputInSubgraph(const std::vector &subgraph, + Graph *graph, + std::vector *outputs) { + std::unordered_set subgraph_set(subgraph.begin(), subgraph.end()); + std::unordered_set valid_output; + + for (auto *output : *outputs) { + int num_used = 0; + for (auto *node : output->outputs) { + if (!subgraph_set.count(node)) ++num_used; + if (num_used > 0) valid_output.insert(output); + } + } + + outputs->assign(valid_output.begin(), valid_output.end()); +} + +void DetachDeletedNodes(framework::ir::Graph *graph) { + std::unordered_set nodes; + for (auto *node : graph->Nodes()) { + if (Agent(node).deleted()) { + node->inputs.clear(); + node->outputs.clear(); + } + } +} -void SubGraphFuse::ReplaceNodesWithSubGraphs() { - auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)(); +void SubGraphFuser::ReplaceNodesWithSubGraphs() { + auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)(); for (auto &subgraph : subgraphs) { - if (subgraph.size() <= argument_->Get("minimum_subgraph_size")) - continue; + if (subgraph.size() <= (size_t)min_subgraph_size_) continue; + LOG(INFO) << "detect a subgraph size " << subgraph.size(); std::unordered_set 
subgraph_uniq(subgraph.begin(), subgraph.end()); // replace this sub-graph with the first node. Two steps: 1. Create a Block // Node that contains this subgraph 2. Mark the nodes inside the sub-graph // as deleted. 3. Replace the deleted node with the new Block Node. - auto *block_node = static_cast( - graph_->nodes.Create(Node::Type::kFunctionBlock)); + framework::OpDesc empty_desc; + empty_desc.SetType("tensorrt_engine"); + auto *block_node = graph_->CreateOpNode(&empty_desc); + Agent(block_node).set_subgraph({}); auto io = ExtractInputAndOutputOfSubGraph(subgraph); - block_node->inlinks = std::move(io.first); - block_node->outlinks = std::move(io.second); + block_node->inputs = std::move(io.first); + block_node->outputs = std::move(io.second); + + RemoveIntermediateOutputInSubgraph(subgraph, graph_, &block_node->outputs); for (auto *node : subgraph) { // TODO(Superjomn) need a unified mechanism to treat deleted node in each // pass. - node->SetDeleted(); - block_node->subgraph.push_back(node); + Agent(node).set_deleted(true); + Agent(block_node).subgraph()->push_back(node); } // Change all the sub-graph's inputs and outputs corresponding inlink and @@ -339,16 +446,92 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { std::unordered_set uniq(nodes.begin(), nodes.end()); nodes.assign(uniq.begin(), uniq.end()); }; - for (auto *i : block_node->inlinks) { - inlink_or_outlink_cleaner(i->outlinks); + for (auto *i : block_node->inputs) { + inlink_or_outlink_cleaner(i->outputs); } - for (auto *&o : block_node->outlinks) { - inlink_or_outlink_cleaner(o->inlinks); + for (auto *&o : block_node->outputs) { + inlink_or_outlink_cleaner(o->inputs); } } + // DetachDeletedNodes(graph_); FilterRedundantOutputOfSubGraph(graph_); } +inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { + return node.inputs.size() == n; +} + +NodesTSIterator::NodesTSIterator(const std::vector &source) { + PADDLE_ENFORCE(!source.empty(), + "Start points of topological sorting should not 
be empty!"); + // CHECK all the inputs' in-degree is 0 + for (auto *node : source) { + PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); + } + + std::unordered_set visited; + std::unordered_set to_visit{source.begin(), source.end()}; + + std::vector inlink_visited; + while (!to_visit.empty()) { + std::vector queue(to_visit.begin(), to_visit.end()); + for (auto *p : queue) { + if (Agent(p).deleted()) { + visited.insert(p); + to_visit.erase(p); + } + + inlink_visited.clear(); + + std::copy_if(p->inputs.begin(), p->inputs.end(), + std::back_inserter(inlink_visited), + [&](Node *x) -> bool { return visited.count(x) != 0; }); + + if (inlink_visited.size() == p->inputs.size()) { + sorted_.push_back(p); + for (auto *_ : p->outputs) { + if (!visited.count(_)) { + to_visit.insert(_); + } + } + + to_visit.erase(p); + visited.insert(p); + } + } + } +} + +NodesTSIterator::NodesTSIterator(const NodesTSIterator &other) + : sorted_(other.sorted_), cursor_(other.cursor_) {} + +Node &NodesTSIterator::operator*() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return *sorted_[cursor_]; +} + +NodesTSIterator &NodesTSIterator::operator++() { + if (++cursor_ >= sorted_.size()) { + sorted_.clear(); + cursor_ = 0; + } + return *this; +} +NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) { + cursor_ = other.cursor_; + sorted_ = other.sorted_; + return *this; +} + +bool NodesTSIterator::operator==(const NodesTSIterator &other) { + return sorted_ == other.sorted_ && cursor_ == other.cursor_; +} + +Node *NodesTSIterator::operator->() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return sorted_[cursor_]; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h new file mode 100644 index 0000000000000000000000000000000000000000..ea88edd042aa9d46f66af1aa92f2cb273696c118 --- /dev/null +++ 
b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h @@ -0,0 +1,182 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the the class to partition a graph. + */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Graph; + +const char kIsFunctionNode[] = "__is_function_node__"; +const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__"; +const char kSubgraphSplitterMarkerAttrName[] = + "_sub_graph_splitter_inside_sub_graph"; + +/* + * Detect the nodes in a sub-graph that meet some conditions. This class doesn't + * modify the graph. + */ +class SubgraphDetector { + public: + // Tell whether a node is inside a sub-graph. + using NodeInsideSubgraphTeller = + std::function; + + SubgraphDetector(Graph *graph, const NodeInsideSubgraphTeller &teller) + : graph_(graph), node_inside_subgraph_teller_(teller) {} + + std::vector> operator()(); + + protected: + // Mark the nodes inside the accepted sub-graph using + // node_inside_subgraph_teller. + void MarkNodesInsideSubGraph(); + + // Merge the marked nodes into sub-graphs and return the sub-graphs. 
+ std::vector> ExtractSubGraphs(); + + private: + Graph *graph_; + NodeInsideSubgraphTeller node_inside_subgraph_teller_; +}; + +/* + * SubGraphFuser - Replace some nodes with the sub-graph node they are inside. + * To some extent, the TensorRT engine is just a fusion op for a model. + */ +class SubGraphFuser { + public: + using NodeInsideSubgraphTeller = SubgraphDetector::NodeInsideSubgraphTeller; + + SubGraphFuser(Graph *graph, const NodeInsideSubgraphTeller &teller, + int min_subgraph_size) + : graph_(graph), + node_inside_subgraph_teller_(teller), + min_subgraph_size_{min_subgraph_size} {} + + // The main method which run all the logic. + void operator()(); + + protected: + // Remove the nodes inside sub-graphs and replace with the SubGraphNode. + void ReplaceNodesWithSubGraphs(); + + private: + Graph *graph_; + NodeInsideSubgraphTeller node_inside_subgraph_teller_; + int min_subgraph_size_; +}; + +struct NodeWrapper { + bool deleted{false}; + bool marked{false}; + int union_find_parent{-1}; + std::vector subgraph; +}; + +/* + * ir::Node agent for subgraph detector. + */ +struct Agent { + explicit Agent(framework::ir::Node *x) : x_(x) {} + + NodeWrapper &wrapper() { + if (!x_->IsWrappedBy()) { + x_->WrappedBy(new NodeWrapper); + } + return x_->template Wrapper(); + } + + bool deleted() { return wrapper().deleted; } + void set_deleted(bool x) { wrapper().deleted = x; } + + bool marked() { return wrapper().marked; } + void set_marked(bool x) { wrapper().marked = x; } + + void set_subgraph(const std::vector &x) { + wrapper().subgraph = x; + } + + int union_find_parent() { return wrapper().union_find_parent; } + void set_union_find_parent(int v) { wrapper().union_find_parent = v; } + + std::vector *subgraph() { return &wrapper().subgraph; } + std::vector &inputs() { return x_->inputs; } + std::vector &outputs() { return x_->outputs; } + + private: + framework::ir::Node *x_; +}; + +// Topological sorting iterator on nodes. 
+struct NodesTSIterator + : public std::iterator { + NodesTSIterator() = default; + explicit NodesTSIterator(const std::vector &source); + NodesTSIterator(NodesTSIterator &&other) + : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { + other.cursor_ = 0; + } + NodesTSIterator(const NodesTSIterator &other); + + framework::ir::Node &operator*(); + NodesTSIterator &operator++(); + // TODO(Superjomn) current implementation just compare the first + // element, need to compare the graph and all the elements in the queue and + // set. + NodesTSIterator &operator=(const NodesTSIterator &other); + bool operator==(const NodesTSIterator &other); + bool operator!=(const NodesTSIterator &other) { return !(*this == other); } + framework::ir::Node *operator->(); + + private: + std::vector sorted_; + size_t cursor_{0}; +}; + +// The nodes those have no input will be treated as start points. +static std::vector ExtractStartPoints(const Graph &g) { + std::vector result; + for (auto *node : g.Nodes()) { + if (node->inputs.empty()) { + result.push_back(node); + } + } + return result; +} + +static iterator_range TopologicalSort(const Graph &g) { + auto start_points = ExtractStartPoints(g); + PADDLE_ENFORCE(!start_points.empty()); + NodesTSIterator x(start_points); + return iterator_range(NodesTSIterator(start_points), + NodesTSIterator()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..21fd8d2df49698d7fa38d906f7921f092ca916a3 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -0,0 +1,220 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include +#include +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Node; + +std::vector ExtractParameters( + const std::unordered_set &nodes); + +std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( + + std::unique_ptr graph) const { + framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); + + auto teller = + Get("tensorrt_node_teller"); + + SubGraphFuser fuser(graph.get(), teller, 2 /*min subgraph size*/); + fuser(); + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !Agent(node).subgraph()->empty()) { + CreateTensorRTOp(node, graph.get()); + + std::unordered_set nodes2remove( + Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); + framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + + return graph; +} + +void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, + Graph *graph) const { + auto 
*op_desc = node->Op(); + static int counter{0}; + auto &subgraph = *Agent(node).subgraph(); + PADDLE_ENFORCE(!subgraph.empty()); + + // An fake block desc. + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + for (auto *node : subgraph) { + auto *op = block_desc.AppendOp(); + *op->Proto() = *node->Op()->Proto(); + } + + // collect inputs + std::unordered_set input_names; + std::unordered_set input_names_with_id; + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + op_desc->SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); + + std::unordered_set output_names; + std::unordered_set output_names_with_id; + for (auto *x : node->outputs) { + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + + op_desc->SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); + op_desc->SetType("tensorrt_engine"); + + std::unordered_map output_name_map; + + // The following procedure is used to rename all the intermediate + // variables and the output variables of the subgraph. + // Why we do this? + // During the transition from fluid OP to tensorrt OP, we map + // the input and output Tensor(fluid data structure) of fluid OP + // to the corresponding ITensor (trt data structure) through the + // Tensor name. When we set up ITensor for an variable, we must + // ensure that it has not been set before. + // If there is variable in the fluid graph, which is not only the + // input of a OP, but also the output of a Op, there will be problems. + // So we have to rename the variable in the subgraph to make sure + // it is either an OP's input or an OP's output. 
+ + auto &subgraph_nodes = *Agent(node).subgraph(); + for (size_t index = 0; index < block_desc.OpSize(); index++) { + framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); + auto correspond_node = subgraph_nodes[index]; + PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); + + std::unordered_map var2id; + for (auto *in_var : correspond_node->inputs) { + var2id[in_var->Name()] = in_var->id(); + } + // rename for the input variables of op inside subgraph + for (int i = 0; i < op->inputs_size(); i++) { + // one input + auto *in_var = op->mutable_inputs(i); + std::vector replaced_names; + for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments + std::string arg_value = in_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { + replaced_names.push_back(arg_value); + } else { + replaced_names.push_back(arg_value_with_id); + } + } + in_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + in_var->add_arguments(replaced_names[k]); + } + } + var2id.clear(); + for (auto out_var : correspond_node->outputs) { + var2id[out_var->Name()] = out_var->id(); + } + + // rename for the output variables of op inside subgraph + for (int i = 0; i < op->outputs_size(); i++) { + framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); + std::vector replaced_names; + for (int k = 0; k < out_var->arguments_size(); k++) { + std::string arg_value = out_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id.count(arg_value_with_id)) { + output_name_map[arg_value] = arg_value_with_id; + } + replaced_names.push_back(arg_value_with_id); + } + out_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + out_var->add_arguments(replaced_names[k]); + } + } + } + + // When tensorrt engine runs at the end of the operation, + // output_mapping 
help us copy the data from the renamed ITensor + // to Tensor. + std::vector output_mapping; + for (auto name : output_names) { + // LOG(INFO) << name << " " << output_name_map.size(); + PADDLE_ENFORCE(output_name_map.count(name) != 0); + output_mapping.push_back(output_name_map[name]); + } + + *block_desc.Proto()->mutable_vars() = + const_cast(&graph->program()) + ->Proto() + ->blocks(0) + .vars(); + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), + "the block has no var-desc"); + PADDLE_ENFORCE(!output_mapping.empty()); + // Set attrs + SetAttr(op_desc->Proto(), "subgraph", + block_desc.Proto()->SerializeAsString()); + SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); + SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); + SetAttr(op_desc->Proto(), "engine_uniq_key", + "trt-" + std::to_string(counter++)); + SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); + SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); +} + +std::vector ExtractParameters( + const std::unordered_set &nodes) { + std::vector parameters; + for (const auto &node : nodes) { + if (!node->IsVar()) continue; + if (node->Var()->Persistable()) { + parameters.push_back(node->Name()); + } + } + return parameters; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +REGISTER_PASS(tensorrt_subgraph_pass, + paddle::inference::analysis::TensorRtSubgraphPass) + .RequirePassAttr("tensorrt_node_teller") + .RequirePassAttr("max_batch_size") + .RequirePassAttr("workspace_size"); diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h similarity index 55% rename from paddle/fluid/inference/analysis/model_store_pass_tester.cc rename to paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index d6493fc25edf25003504542f1b01c4105754c8df..502353b95fc15e763900a0caf1649257508f0880 100644 --- 
a/paddle/fluid/inference/analysis/model_store_pass_tester.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -12,31 +12,24 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/model_store_pass.h" - -#include -#include -#include "paddle/fluid/inference/analysis/analyzer.h" +#pragma once +#include +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace inference { namespace analysis { -DEFINE_string(inference_model_dir, "", "Model path"); - -TEST(DFG_StorePass, test) { - Analyzer analyzer; - Argument argument(FLAGS_inference_model_dir); - argument.model_output_store_path.reset( - new std::string("./_dfg_store_pass_tmp")); - // disable storage in alalyzer - FLAGS_IA_output_storage_path = ""; - analyzer.Run(&argument); +class TensorRtSubgraphPass : public framework::ir::FusePassBase { + public: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; - ModelStorePass pass; - pass.Initialize(&argument); - pass.Run(argument.main_dfg.get()); -} + private: + void CreateTensorRTOp(framework::ir::Node *x, + framework::ir::Graph *graph) const; + void CleanIntermediateOutputs(framework::ir::Node *node); +}; } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc deleted file mode 100644 index 4f40a7a1adc324b824af8e3831901abcbffaeca6..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/model_store_pass.cc +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include - -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/analysis/argument.h" -#include "paddle/fluid/inference/analysis/model_store_pass.h" - -namespace paddle { -namespace inference { -namespace analysis { - -void ModelStorePass::Run(DataFlowGraph *x) { - if (!argument_->fluid_model_param_path) { - PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir); - argument_->fluid_model_param_path.reset( - new std::string(*argument_->fluid_model_dir + "param")); - } - PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path); - // Directly copy param file to destination. - std::stringstream ss; - // NOTE these commands only works on linux. 
- ss << "mkdir -p " << *argument_->model_output_store_path; - VLOG(30) << "run command: " << ss.str(); - PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); - ss.str(""); - - ss << "cp " << *argument_->fluid_model_dir << "/*" - << " " << *argument_->model_output_store_path; - VLOG(30) << "run command: " << ss.str(); - PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); - - // Store program - PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc, - "program desc is not transformed, should call " - "DataFlowGraphToFluidPass first."); - VLOG(30) << "store analyzed program to " - << *argument_->model_output_store_path; - const std::string program_output_path = - *argument_->model_output_store_path + "/__model__"; - std::ofstream file(program_output_path, std::ios::binary); - PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.", - program_output_path); - const std::string serialized_message = - argument_->transformed_program_desc->SerializeAsString(); - file.write(serialized_message.c_str(), serialized_message.size()); -} - -bool ModelStorePass::Finalize() { return true; } - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc deleted file mode 100644 index 3339b5044df0cf91d00aa9ddad310d4bf263bc3c..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/node.cc +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/analysis/node.h" -#include "glog/logging.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace inference { -namespace analysis { - -std::vector Value::dot_attrs() const { - return std::vector({Dot::Attr("style", "filled,rounded"), - Dot::Attr("shape", "box"), - Dot::Attr("fillcolor", "red")}); -} - -std::vector Function::dot_attrs() const { - return std::vector({Dot::Attr("style", "filled,rounded"), - Dot::Attr("shape", "diamond"), - Dot::Attr("fillcolor", "yellow")}); -} - -Node *NodeMap::Create(Node::Type type) { - switch (type) { - case Node::Type::kFunction: - nodes_.emplace_back(new Function); - break; - case Node::Type::kValue: - nodes_.emplace_back(new Value); - break; - case Node::Type::kFunctionBlock: - nodes_.emplace_back(new FunctionBlock); - break; - default: - PADDLE_THROW("Not supported node type."); - } - nodes_.back()->id_ = size() - 1; - return nodes_.back().get(); -} - -Node *NodeMap::GetMutable(size_t id) { - PADDLE_ENFORCE_GT(size(), id); - return nodes_[id].get(); -} - -const Node &NodeMap::Get(size_t id) const { - PADDLE_ENFORCE_GT(size(), id); - return *nodes_[id].get(); -} - -void NodeMap::Delete(size_t id) { - PADDLE_ENFORCE_LT(id, size()); - nodes_[id]->SetDeleted(); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h deleted file mode 100644 index af34156bc2f101465d87cb10e2155745022eb521..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/node.h +++ /dev/null @@ -1,244 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file defines the Node class and its subclasses. A Node is the basis - * analysis element in a computation graph. - * There are basically two kinds of nodes, the function node and value node. - */ -#pragma once - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/inference/analysis/device.h" -#include "paddle/fluid/inference/analysis/dot.h" -#include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/platform/variant.h" - -namespace paddle { -namespace inference { -namespace analysis { - -class NodeMap; - -// A helper class to maintain the status from Pass. -struct AnyAttr { - using any_t = - boost::variant; - // NOTE T should be a primary type or a struct combined by several primary - // types. - // NOTE the STL containers should not use here. - // Some usages - // Attr attr; - // attr.Bool() = true; - bool &Bool() { return As(); } - float &Float() { return As(); } - int32_t &Int32() { return As(); } - int64_t &Int64() { return As(); } - void *&Pointer() { return As(); } - std::string &String() { return As(); } - - template - T &As() { - if (type_index_ == typeid(AnyAttr)) { - type_index_ = typeid(T); - any_data_ = T(); - } else { - PADDLE_ENFORCE(type_index_ == typeid(T), "fetch error type"); - } - return boost::get(any_data_); - } - - private: - any_t any_data_; - std::type_index type_index_{typeid(AnyAttr)}; -}; - -/* - * Node Representation. - * - * This is a very important class for analysis. 
It is the base class of all - * nodes computed by a program that may be used as operands to other nodes. - * Node is the super class of other important classes such as Function and - * Value, some nodes can have a name. - */ -class Node { - public: - // Node type. NOTE the new node types should add here. - enum class Type { kNone = -1, kFunction, kValue, kFunctionBlock }; - - Node() = default; - - // Cast to a subclass type, Function for example. - template - Subclass &As() { - return *dynamic_cast(this); - } - - // Formatted representation of this Node. - virtual std::string repr() const { - return name() + "(" + std::to_string(id()) + ")"; - } - - // DOT node representation. One Node type can customize its own node - // representation. - virtual std::vector dot_attrs() const { - return std::vector({Dot::Attr("style", "filled")}); - } - - // Get an additional attribute and convert it to T data type. NOTE this will - // silently create a new attribute if not exists. - AnyAttr &attr(const std::string &name) const { return attrs_[name]; } - - int id() const { return id_; } - - // The Protobuf description is set/get with a void* to decouple Node interface - // from a specific kind of Protobuf message. - void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; } - void *pb_desc() const { return attr("pb_desc").Pointer(); } - - void SetPbMsg(const std::string &s) { attr("pb_msg").String() = s; } - const std::string &pb_msg() const { return attr("pb_msg").String(); } - - void SetDeleted() { deleted_ = true; } - bool deleted() const { return deleted_; } - - void SetName(const std::string &name) { name_ = name; } - const std::string &name() const { return name_; } - - void SetType(Type type) { type_ = type; } - Type type() const { return type_; } - - // Input links. - std::vector inlinks; - // Output links. - std::vector outlinks; - - // Type checks. 
- bool IsFunction() const { return type_ == Node::Type::kFunction; } - bool IsValue() const { return type_ == Node::Type::kValue; } - bool IsFunctionBlock() const { return type_ == Node::Type::kFunctionBlock; } - - virtual ~Node() {} - - friend class NodeMap; - - PADDLE_DISALLOW_COPY_AND_ASSIGN(Node); - - protected: - // The id number not the name is a node's unique identifier in the computation - // graph. - int id_{-1}; - std::string name_; - Type type_{Type::kNone}; - // Mark this node is deleted by some pass. - bool deleted_{false}; - mutable std::unordered_map attrs_; -}; - -class Function; -/* - * Value represents a value node, it has some attributes including dims, data - * type and so on. - */ -class Value : public Node { - public: - enum class DataType { kInt32, kInt64, kFloat32, kFloat64 }; - using Dims = std::vector; - - void SetDataType(DataType data_type) { data_type_ = data_type; } - DataType data_type() const { return data_type_; } - - void SetDims(const Dims &dims) { dims_ = dims; } - const Dims &dims() const { return dims_; } - - Device device() const { return device_; } - void SetDevice(Device device) { device_ = device; } - - std::vector dot_attrs() const override; - - PADDLE_DISALLOW_COPY_AND_ASSIGN(Value); - - protected: - Value() { SetType(Node::Type::kValue); } - friend class NodeMap; - - private: - DataType data_type_; - Dims dims_; - Device device_; -}; - -/* - * Function represents any kind of executable concepts that takes several Values - * as input, and outputs several Values. - */ -class Function : public Node { - public: - std::vector dot_attrs() const override; - - // Get the operator's type from Desc. - const std::string &func_type() const { return func_type_; } - // Set the operator's type. 
- void SetFuncType(const std::string &func_type) { func_type_ = func_type; } - - PADDLE_DISALLOW_COPY_AND_ASSIGN(Function); - - protected: - std::string func_type_; - Function() { SetType(Node::Type::kFunction); } - friend class NodeMap; -}; - -/* - * FunctionBlock is a Node that contains a sub-graph multiple Node. - */ -struct FunctionBlock : public Node { - std::string repr() const override { return "block-" + std::to_string(id()); } - std::vector subgraph; - - protected: - FunctionBlock() { SetType(Node::Type::kFunctionBlock); } - friend class NodeMap; -}; - -class NodeMap { - public: - // Create a new node with type. - Node *Create(Node::Type type); - - // Get a node by its id. - Node *GetMutable(size_t id); - - const Node &Get(size_t id) const; - - void Delete(size_t id); - - const std::vector> &nodes() const { return nodes_; } - - size_t size() const { return nodes_.size(); } - - private: - std::vector> nodes_; - std::unordered_map map_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/node_tester.cc b/paddle/fluid/inference/analysis/node_tester.cc deleted file mode 100644 index 9207c15373fb4264ff0e738e93ae88e1c08b554c..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/node_tester.cc +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/analysis/node.h" - -#include - -namespace paddle { -namespace inference { -namespace analysis { - -TEST(NodeAttr, bool) { - AnyAttr x; - x.Bool() = true; - ASSERT_EQ(x.Bool(), true); -} - -TEST(NodeAttr, int32) { - AnyAttr x; - x.Int32() = 32; - ASSERT_EQ(x.Int32(), 32); -} - -TEST(NodeAttr, string) { - AnyAttr x; - x.String() = "Hello"; - ASSERT_EQ(x.String(), "Hello"); -} - -TEST(Node, Attr) { - // Node is an abstract class, use Value instead for they share the same Attr - // logic. - NodeMap nodes; - auto* node = nodes.Create(Node::Type::kValue); - node->attr("v0").Int32() = 2008; - ASSERT_EQ(node->attr("v0").Int32(), 2008); - - node->attr("str").String() = "hello world"; - ASSERT_EQ(node->attr("str").String(), "hello world"); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc deleted file mode 100644 index ce390ee8313d6e3e2f0d79fb59d2225e2779180b..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/analysis/pass_manager.h" -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/string/pretty_log.h" - -namespace paddle { -namespace inference { -namespace analysis { - -bool PassManager::Initialize(Argument* argument) { - argument_ = argument; - for (auto& pass : data_) { - VLOG(30) << "Initializing pass [" << pass->repr() << "]"; - if (!pass->Initialize(argument)) { - LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; - return false; - } - } - return true; -} - -void DfgPassManager::RunAll() { - PADDLE_ENFORCE(argument_); - VLOG(30) << "Total " << data_.size() << " Analysys passes"; - for (auto& pass : data_) { - string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]", - pass->repr()); - pass->Run(argument_->main_dfg.get()); - } -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h deleted file mode 100644 index 412747c4fcce73303703f586f7a04edf4cc5ee76..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/pass_manager.h +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file defines the logic of pass management. 
The analysis for inference is - * a pipeline of Passes, a PassManager is a agency that helps to manage the - * executation of the Passes. - * - * There are two modes of Passes, the first one is called NodePass and takes - * an Node as input and output; the second one is called DFGPass and takes a - * DFG(Data Flow Graph) as input and output. It is hard to put all the passes in - * the same pipeline, there are two kinds of PassManagers, both takes a DFG as - * input and output a DFG, but the Passes inside are different: - * - * 1. NodePassManager: the passes inside are all NodePasses, it can have - * different graph trivial algorithm, for example, DFS_NodePassManager will - * trigger the passes in depth first order; - * 2. DfgPassManager: the passes inside are all DfgPasses. - */ - -#pragma once - -#include -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/inference/analysis/analysis_pass.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * PassManager is the base class for all pass managers, a pass manager has - * several Pass-es registered, and execute them in the linear order. - */ -class PassManager : public OrderedRegistry { - public: - PassManager() = default; - // Call all the passes' Initialize methods. The desc and data_flow_graph are - // globally shared, so pass them as the arguemnts for all the pass managers. - virtual bool Initialize(const Argument& argument) { return false; } - - virtual bool Initialize(Argument* argument); - - // Call all the passes' Finalize methods. - virtual bool Finalize() { - for (auto& pass : data_) { - if (!pass->Finalize()) { - LOG(ERROR) << "Failed to finalize pass [" << pass->repr() << "]"; - return false; - } - } - return true; - } - - // Run all the passes. - virtual void RunAll() = 0; - - // Short identifier. - virtual std::string repr() const = 0; - // Long description. 
- virtual std::string description() const = 0; - - virtual ~PassManager() = default; - - protected: - Argument* argument_{nullptr}; -}; - -/* - * A pass manager that process a DFG. - */ -class DfgPassManager : public PassManager { - public: - DfgPassManager() = default; - - void RunAll() override; - - virtual ~DfgPassManager() = default; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc deleted file mode 100644 index 72b0fbf7e571ec97a0ea093d01449c1d5ddb9b91..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/inference/analysis/pass_manager.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -class TestDfgPassManager final : public DfgPassManager { - public: - TestDfgPassManager() = default; - virtual ~TestDfgPassManager() = default; - // Short identifier. 
- std::string repr() const override { return "test-pass-manager"; } - // Long description. - std::string description() const override { return "test doc"; } -}; - -TEST(PassManager, DFG_pass_manager) { - TestDfgPassManager manager; - DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); - - manager.Register("fluid-to-flow-graph", new FluidToDataFlowGraphPass); - manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); - manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); - - Argument argument(FLAGS_inference_model_dir); - - ASSERT_TRUE(&argument); - ASSERT_TRUE(manager.Initialize(&argument)); - manager.RunAll(); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a30c27b1183a75de8c0bb50ef3617d747b239fae --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -0,0 +1,9 @@ +cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass) + +set(analysis_deps ${analysis_deps} + ir_graph_build_pass + ir_analysis_pass + analysis_passes + CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..233bfd6a42b7f123813d4ef5cecf353f7e88d208 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h" +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_pass_manager.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrAnalysisComposePass::RunImpl(Argument *argument) { + ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); + if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { + InitTensorRTAttrs(argument); + } + ApplyIrPasses(argument); + CollectFusionStatis(argument); +} + +std::string IrAnalysisComposePass::repr() const { + return "ir-analysis-compose-pass"; +} + +void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { + if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { + LOG(INFO) << "Initing TensorRT pass"; + argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) { + std::unordered_set teller_set( + {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "dropout", "split"}); + if (!node->IsOp()) return false; + + if (teller_set.count(node->Op()->Type())) { + return true; + } else { + return false; + } + }); + } +} + +void 
IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { + std::vector passes({ + "ir_graph_build_pass", "ir_analysis_pass", + }); + for (const auto &pass : passes) { + VLOG(2) << "Run pass " << pass; + auto *the_pass = PassRegistry::Global().Retreive(pass); + the_pass->Run(argument); + } +} + +void IrAnalysisComposePass::CollectFusionStatis(Argument *argument) { + if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) { + LOG(INFO) << "argument has no fuse statis"; + return; + } + argument->SetFusionStatis( + argument->main_graph().Get( + framework::ir::kFuseStatisAttr)); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h similarity index 53% rename from paddle/fluid/inference/analysis/model_store_pass.h rename to paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h index f14b49e09c2f8e79c6fc4accdbf17f4f7a9bb1a3..53e2ebb0038a5c105f68a0146b3da90a6ae34af8 100644 --- a/paddle/fluid/inference/analysis/model_store_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h @@ -12,42 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* - * This file defines ModelStorePass, which store the runtime DFG to a Paddle - * model in the disk, and that model can be reloaded for prediction. - */ - #pragma once + #include +#include #include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/inference/analysis/passes/passes.h" namespace paddle { namespace inference { namespace analysis { -class ModelStorePass : public DataFlowGraphPass { +/* + * The analysis pass to run a list of IR passes (like a function call). + * Currently, it should be the first pass of analysis phase. 
+ */ +class IrAnalysisComposePass : public AnalysisPass { public: - bool Initialize(Argument* argument) override { - if (!argument) { - LOG(ERROR) << "invalid argument"; - return false; - } - argument_ = argument; - return true; - } + void RunImpl(Argument* argument) override; + std::string repr() const override; - void Run(DataFlowGraph* x) override; + private: + void InitTensorRTAttrs(Argument* argument); - std::string repr() const override { return "DFG-store-pass"; } - std::string description() const override { - return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle - model in the disk, and that model can be reloaded for prediction again.)DD"; - } + void ApplyIrPasses(Argument* argument); - bool Finalize() override; + void CollectFusionStatis(Argument* argument); - private: - Argument* argument_{nullptr}; + // Assign a Scope for IR passes to modify the weights. + void AssignScopeToModify(Argument* argument); }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..e327bd39f0ae0b8fbe3b189e4bb26a23c44d910c --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include "paddle/fluid/inference/analysis/ir_pass_manager.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrAnalysisPass::RunImpl(Argument* argument) { + ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); + ARGUMENT_CHECK_FIELD(argument, main_program); + ARGUMENT_CHECK_FIELD(argument, scope); + + auto* the_graph = argument->ReleaseMainGraph(); + auto graph = std::unique_ptr(the_graph); + + // Apply passes. + IRPassManager the_ir_manager(argument); + graph = the_ir_manager.Apply(std::move(graph)); + PADDLE_ENFORCE_GT(graph->Nodes().size(), 0); + argument->SetIrAnalyzedProgram(new framework::proto::ProgramDesc( + the_ir_manager.AcquireProgram(&graph, argument->main_program()))); + argument->SetMainGraph(graph.release()); +} + +std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/node_attr_flags.h b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h similarity index 70% rename from paddle/fluid/inference/analysis/node_attr_flags.h rename to paddle/fluid/inference/analysis/passes/ir_analysis_pass.h index a3f70e5419a66969e8fb20152a8a8ace39316f57..d8a7449807585257c153d3c8958555ea2306afa3 100644 --- a/paddle/fluid/inference/analysis/node_attr_flags.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h @@ -12,20 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. -/* - * This file contains all the flags that declared in Node::Attr. - * - * The Node::Attr is designed to share information between different passes, one - * can get other's attributes in a Node by the flags in this file. 
- */ #pragma once + +#include +#include "paddle/fluid/inference/analysis/analysis_pass.h" + namespace paddle { namespace inference { namespace analysis { -#define DECLARE_NODE_ATTR(flag__) const char ATTR_##flag__[] = #flag__; - -DECLARE_NODE_ATTR(supported_by_tensorrt) // bool +/* + * Perform IR analysis passes. + * + * It is used to fuse some operators and apply other IR-level graph optimizations. + */ +class IrAnalysisPass : public AnalysisPass { + public: + void RunImpl(Argument* argument) override; + std::string repr() const override; +}; } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..a30fef08b5726c965637e2fb489bdb2036bd2a8d --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { + +extern void ReadBinaryFile(const std::string &filename, std::string *contents); + +namespace analysis { + +void IrGraphBuildPass::RunImpl(Argument *argument) { + if (!argument->scope_valid()) { + argument->SetScope(new framework::Scope); + } + + if (argument->model_dir_valid()) { + auto program = LoadModel(argument->model_dir(), argument->scope_ptr()); + argument->SetMainProgram(program.release()); + } else if (argument->model_program_path_valid() && + argument->model_params_path_valid()) { + auto program = + LoadModel(argument->model_program_path(), argument->model_params_path(), + argument->scope_ptr()); + argument->SetMainProgram(program.release()); + } else { + PADDLE_THROW( + "either model_dir or (program path and parameter path) should be set."); + } + + auto graph = std::unique_ptr(new Graph(argument->main_program())); + argument->SetMainGraph(graph.release()); + argument->main_graph().Set(framework::ir::kParamScopeAttr, + new framework::Scope *(argument->scope_ptr())); +} + +std::unique_ptr IrGraphBuildPass::LoadModel( + const std::string &path, framework::Scope *scope) { + platform::CPUPlace place; + framework::Executor exe(place); + return Load(&exe, scope, path); +} + +std::unique_ptr IrGraphBuildPass::LoadModel( + const std::string &program_path, const std::string ¶ms_path, + framework::Scope *scope) { + platform::CPUPlace place; + framework::Executor exe(place); + return Load(&exe, scope, program_path, params_path); +} + +std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h 
b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..3291e4f6ad3ca3079e672350805cab1f1e7b2413 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Load program and parameter to memory from the disk. + */ +class IrGraphBuildPass : public AnalysisPass { + public: + void RunImpl(Argument *argument) override; + + std::string repr() const override; + + private: + std::unique_ptr LoadModel(const std::string &path, + framework::Scope *scope); + std::unique_ptr LoadModel( + const std::string &program_path, const std::string ¶ms_path, + framework::Scope *scope); + + std::string model_binary_str_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc new file mode 100644 index 0000000000000000000000000000000000000000..2ef515f45f2483df8d1238b4758d6729d0299ce9 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/passes.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/passes.h" +#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc" +#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { +PassRegistry::PassRegistry() { + passes_.emplace("ir_analysis_pass", + std::unique_ptr(new IrAnalysisPass)); + passes_.emplace("ir_graph_build_pass", + std::unique_ptr(new IrGraphBuildPass)); + passes_.emplace("ir_analysis_compose_pass", + std::unique_ptr(new IrAnalysisComposePass)); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/passes/passes.h similarity index 61% rename from paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc rename to paddle/fluid/inference/analysis/passes/passes.h index 367c25805d05f8d10fb8341158760ac6356a5c48..ea07e0dcbd992c9d10c6662909798ef79a01e3a7 100644 --- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc +++ b/paddle/fluid/inference/analysis/passes/passes.h @@ -12,24 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" +#pragma once -#include -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include +#include "paddle/fluid/inference/analysis/analysis_pass.h" namespace paddle { namespace inference { namespace analysis { -TEST(FluidToIrPass, Test) { - FluidToIrPass pass; - Argument argument(FLAGS_inference_model_dir); - argument.Set(kFluidToIrPassesAttr, - new std::vector({"infer_clean_graph_pass"})); - pass.Initialize(&argument); - pass.Run(argument.main_dfg.get()); -} +struct PassRegistry { + PassRegistry(); + + AnalysisPass* Retreive(const std::string& pass_type) { + return passes_[pass_type].get(); + } + + static PassRegistry& Global() { + static auto* x = new PassRegistry; + return *x; + } + + private: + std::unordered_map> passes_; +}; } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.h b/paddle/fluid/inference/analysis/subgraph_splitter.h deleted file mode 100644 index 76e4fda0249e03c617d1b37c079dcd97f21387c1..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/subgraph_splitter.h +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file defines the the class to partition a graph. 
- */ - -#pragma once - -#include - -#include "paddle/fluid/inference/analysis/argument.h" -#include "paddle/fluid/inference/analysis/data_flow_graph.h" -#include "paddle/fluid/inference/analysis/node.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * Detect the nodes in a sub-graph that meet some conditions. This class doesn't - * modify the graph. - */ -class SubGraphSplitter { - public: - static const char *kMarkerAttrName; - // Tell whether a node is inside a sub-graph. - using NodeInsideSubgraphTeller = std::function; - - SubGraphSplitter(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller) - : graph_(graph), node_inside_subgraph_teller_(teller) {} - - std::vector> operator()(); - - protected: - // Mark the nodes inside the accepted sub-graph using - // node_inside_subgraph_teller. - void MarkNodesInsideSubGraph(); - - // Merge the marked nodes into sub-graphs and return the sub-graphs. - std::vector> ExtractSubGraphs(); - - private: - DataFlowGraph *graph_; - NodeInsideSubgraphTeller node_inside_subgraph_teller_; -}; - -/* - * SubGraphFuse - Replace some nodes with the sub-graph node they are inside. To - * some extent, the TensorRT engine is just a fusion op for a model. - */ -class SubGraphFuse { - public: - using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller; - - SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller, - Argument *argument) - : graph_(graph), - node_inside_subgraph_teller_(teller), - argument_(argument) {} - - // The main method which run all the logic. - void operator()(); - - protected: - // Remove the nodes inside sub-graphs and replace with the SubGraphNode. 
- void ReplaceNodesWithSubGraphs(); - - private: - DataFlowGraph *graph_; - NodeInsideSubgraphTeller node_inside_subgraph_teller_; - Argument *argument_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc deleted file mode 100644 index e1dc89fab5fb76d456b07c316ab1cabe6de23b26..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/analysis/subgraph_splitter.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { - if (node->type() != Node::Type::kFunction) return false; - const auto* func = static_cast(node); - if (func->func_type() == "elementwise_add" || func->func_type() == "relu" || - func->func_type() == "conv2d" || func->func_type() == "mul" || - func->func_type() == "sigmoid" || func->func_type() == "softmax") { - LOG(INFO) << "sub-graph marked " << node->repr(); - return true; - } - return false; -}; - -TEST(SubGraphSplitter, Split) { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - auto dfg = ProgramDescToDFG(desc); - LOG(INFO) << "spliter\n" << dfg.DotString(); - - ASSERT_GT(dfg.nodes.size(), 5UL); - - auto subgraphs = SubGraphSplitter(&dfg, teller)(); - - // Check the number of the marked nodes. - int marked_nodes = 0; - for (auto& node : dfg.nodes.nodes()) { - if (node->IsFunction() && - node->attr(SubGraphSplitter::kMarkerAttrName).Bool()) { - ++marked_nodes; - } - } - EXPECT_EQ(marked_nodes, 6); - - // For human debug. - for (auto& subgraph : subgraphs) { - LOG(INFO) << "subgraph size " << subgraph.size(); - for (auto* node : subgraph) { - LOG(INFO) << "node " << node->repr(); - } - } - - ASSERT_EQ(subgraphs.size(), 1UL); - // The last sub-graph has 5 Functions. 
- ASSERT_EQ(subgraphs.back().size(), 6UL); -} - -TEST(SubGraphSplitter, Fuse) { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - auto dfg = ProgramDescToDFG(desc); - Argument argument; - argument.Set("minimum_subgraph_size", new int(3)); - - size_t count0 = dfg.nodes.size(); - - SubGraphFuse fuse(&dfg, teller, &argument); - fuse(); - - int count1 = 0; - for (auto& node : dfg.nodes.nodes()) { - if (node->deleted()) { - LOG(INFO) << "deleted " << node->repr(); - } - count1 += node->deleted(); - } - - // At least one nodes should be deleted. - ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock - ASSERT_EQ(11, count1); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc deleted file mode 100644 index 174c8513f92cf869419f04cab5a54af65e9673b8..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" -#include "paddle/fluid/inference/analysis/node_attr_flags.h" -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" - -namespace paddle { -namespace inference { -namespace analysis { - -void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) { - for (auto &node : graph->nodes.nodes()) { - node->attr(ATTR_supported_by_tensorrt).Bool() = teller_(node.get()); - } -} - -class DfgDebuggerPass : public DFG_GraphvizDrawPass { - public: - explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config) - : DFG_GraphvizDrawPass(config) {} - - std::string repr() const override { - return "tensorrt-subgraph-node-mark-debugger"; - } - - bool Finalize() override { return true; } - - protected: - std::string Draw(DataFlowGraph *graph) override { - Dot dot; - // Add nodes - for (size_t i = 0; i < graph->nodes.size(); i++) { - const Node &node = graph->nodes.Get(i); - if (config_.display_deleted_node || !node.deleted()) { - auto dot_attr = node.dot_attrs(); - if (node.attr(ATTR_supported_by_tensorrt).Bool()) { - dot_attr.assign( - {Dot::Attr{"color", "green"}, Dot::Attr{"style", "filled"}}); - } - dot.AddNode(node.repr(), dot_attr); - } - } - // Add edges - for (size_t i = 0; i < graph->nodes.size(); i++) { - const Node &node = graph->nodes.Get(i); - if (!config_.display_deleted_node && node.deleted()) continue; - for (auto &in : node.inlinks) { - if (!config_.display_deleted_node && in->deleted()) continue; - dot.AddEdge(in->repr(), node.repr(), {}); - } - } - return dot.Build(); - } -}; - -AnalysisPass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const { - DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root, - "tensorrt_marked_node"); - return new DfgDebuggerPass(config); -} -bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; } - -} // namespace analysis -} // namespace 
inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h deleted file mode 100644 index c881a54c240538b68abdcb9060db69de3bf2b8bb..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops - * that supported by TensorRT engine. - */ - -#pragma once - -#include -#include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/subgraph_splitter.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * Mark the operators that TensorRT engine supports. - */ -class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { - public: - using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller; - - explicit TensorRTSubgraphNodeMarkPass(const teller_t& teller) - : teller_(teller) {} - - bool Initialize(Argument* argument) override { return true; } - - // This class get a sub-graph as input and determine whether to transform this - // sub-graph into TensorRT. 
- void Run(DataFlowGraph* graph) override; - - std::string repr() const override { return "tensorrt-sub-subgraph-mark"; } - std::string description() const override { - return "tensorrt sub-graph mark pass"; - } - - AnalysisPass* CreateGraphvizDebugerPass() const override; - bool Finalize() override; - - private: - teller_t teller_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc deleted file mode 100644 index c1d932878e559180af987594535959afdf475587..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" - -#include -#include "paddle/fluid/inference/analysis/node_attr_flags.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -TEST(TensorRTSubgraphNodeMarkPass, test) { - // init - FluidToDataFlowGraphPass pass; - Argument argument(FLAGS_inference_model_dir); - ASSERT_TRUE(pass.Initialize(&argument)); - pass.Run(argument.main_dfg.get()); - - TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) { - return node->IsFunction() && - static_cast(node)->func_type() == "mul"; - }; - TensorRTSubgraphNodeMarkPass pass1(teller); - ASSERT_TRUE(pass1.Initialize(&argument)); - pass1.Run(argument.main_dfg.get()); - - int counter{0}; - for (auto& node : argument.main_dfg->nodes.nodes()) { - counter += node->attr(ATTR_supported_by_tensorrt).Bool(); - } - ASSERT_EQ(counter, 2); - LOG(INFO) << counter << " nodes marked"; -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc deleted file mode 100644 index 3aa65f223a9e70b8ba7e387d1766ec6a97aee385..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" -#include "paddle/fluid/inference/analysis/subgraph_splitter.h" - -namespace paddle { -namespace inference { -namespace analysis { - -TensorRTSubGraphPass::TensorRTSubGraphPass( - const TensorRTSubGraphPass::NodeInsideSubgraphTeller &teller) - : node_inside_subgraph_teller_(teller) {} - -void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { - SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)(); - VLOG(40) << "debug info " - << graph->HumanReadableInfo(false /*show_values*/, - true /*show_functions*/); -} - -} // namespace analysis -} // namespace inference - -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h deleted file mode 100644 index 3545da9109d79964f36c3d7e738620cc2e0f9a6c..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/node.h" -#include "paddle/fluid/inference/analysis/subgraph_splitter.h" - -namespace paddle { -namespace inference { -namespace analysis { - -/* - * Parse the graph and replace TensorRT supported nodes with SubGraphNode - */ -class TensorRTSubGraphPass : public DataFlowGraphPass { - public: - // Tell whether to transform a sub-graph into TensorRT. - using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller; - - explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); - - bool Initialize(Argument* argument) override { - argument_ = argument; - return true; - } - - // This class get a sub-graph as input and determine whether to transform this - // sub-graph into TensorRT. - void Run(DataFlowGraph* graph) override; - - bool Finalize() override { return true; } - - std::string repr() const override { return "tensorrt-sub-graph"; } - std::string description() const override { return "tensorrt sub graph pass"; } - - private: - NodeInsideSubgraphTeller node_inside_subgraph_teller_; - Argument* argument_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc deleted file mode 100644 index 9748e24b06295a4e7c2995429e6588cd0f225fe6..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" - -#include -#include -#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" - -namespace paddle { -namespace inference { -namespace analysis { - -DEFINE_string(dot_dir, "./", ""); - -TEST(TensorRTSubGraphPass, main) { - std::unordered_set teller_set( - {"elementwise_add", "mul", "sigmoid"}); - SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) { - if (node->type() != Node::Type::kFunction) return false; - const auto* func = static_cast(node); - if (teller_set.count(func->func_type())) return true; - return false; - }; - - Argument argument(FLAGS_inference_model_dir); - argument.Set("minimum_subgraph_size", new int(0)); - argument.Set("max_batch_size", new int(3)); - argument.Set("workspace_size", new int(1 << 20)); - argument.Set("precision_mode", new std::string("FP32")); - - DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; - DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; - - DFG_GraphvizDrawPass dfg_pass(config); - DFG_GraphvizDrawPass dfg_pass1(config1); - FluidToDataFlowGraphPass pass0; - TensorRTSubGraphPass trt_pass(std::move(teller)); - - dfg_pass.Initialize(&argument); - dfg_pass1.Initialize(&argument); - pass0.Initialize(&argument); - trt_pass.Initialize(&argument); - - argument.main_dfg.reset(new DataFlowGraph); - pass0.Run(argument.main_dfg.get()); - dfg_pass.Run(argument.main_dfg.get()); - trt_pass.Run(argument.main_dfg.get()); - 
dfg_pass1.Run(argument.main_dfg.get()); - - // Check the TRT op's block desc - for (auto& node : argument.main_dfg->nodes.nodes()) { - if (node->IsFunctionBlock()) { - LOG(INFO) << "get function block"; - } - } -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index 1073a6f686eaeeaaae2d93ab044149b7df518085..d599099a8050eaeabb8e0544b1bfe3b6b46b17ec 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -18,8 +18,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/inference/analysis/data_flow_graph.h" -#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/helper.h" namespace paddle { @@ -32,29 +30,6 @@ namespace analysis { DEFINE_string(inference_model_dir, "", "inference test model dir"); -static DataFlowGraph ProgramDescToDFG( - const framework::proto::ProgramDesc& desc) { - DataFlowGraph graph; - FluidToDataFlowGraphPass pass; - Argument argument; - argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); - argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); - pass.Initialize(&argument); - pass.Run(&graph); - pass.Finalize(); - return graph; -} - -class DFG_Tester : public ::testing::Test { - protected: - void SetUp() override { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); - } - - Argument argument; -}; - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index fd05c967774b336275f4c7bd98313bc1d750502f..82f74a269a5915dfa1d97a28f5ae15a12ea0b154 100644 --- 
a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -17,17 +17,22 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}) + +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) - set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) + set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) +cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder) +cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api) + + cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -40,20 +45,10 @@ endif() cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) -if(WITH_GPU AND TENSORRT_FOUND) 
-cc_library(paddle_inference_tensorrt_subgraph_engine - SRCS api_tensorrt_subgraph_engine.cc - DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy) - if(WITH_TESTING) - inference_base_test(test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS ${inference_deps} - ARGS --dirname=${WORD2VEC_MODEL_DIR}) - endif() -endif() - if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # compile the libinference_anakin_api.a and anakin.so. - cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy) - cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope) + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy) function(anakin_target target_name) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endfunction() diff --git a/paddle/fluid/inference/api/README.md b/paddle/fluid/inference/api/README.md index 20969fac6c8f894ffb4a02b48f795e2a0dcbd096..a2d685d723bd9ab2b84969adb86e177a8754328d 100644 --- a/paddle/fluid/inference/api/README.md +++ b/paddle/fluid/inference/api/README.md @@ -2,25 +2,15 @@ Paddle inference offers the APIs in `C` and `C++` languages. -One can easily deploy a model trained by Paddle following the steps as below: +You can easily deploy a model trained by Paddle following the steps as below: 1. Optimize the native model; 2. Write some codes for deployment. +## The APIs -Let's explain the steps in detail. - -## Optimize the native Fluid Model - -The native model that get from the training phase needs to be optimized for that. 
- -- Clean the noise such as the cost operators that do not need inference; -- Prune unnecessary computation fork that has nothing to do with the output; -- Remove extraneous variables; -- Memory reuse for native Fluid executor; -- Translate the model storage format to some third-party engine's, so that the inference API can utilize the engine for acceleration; - -We have an official tool to do the optimization, call `paddle_inference_optimize --help` for more information. +All the released APIs are located in the `paddle_inference_api.h` header file. +The stable APIs are wrapped by `namespace paddle`, the unstable APIs are protected by `namespace paddle::contrib`. ## Write some codes diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ccd2dc5ab353b1634b651a4b7caa2af0da75ce4 --- /dev/null +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle_pass_builder.h" // NOLINT + +namespace paddle { + +PassStrategy *contrib::AnalysisConfig::pass_builder() const { + PADDLE_ENFORCE( + pass_builder_.get(), + "Should call constructor first, that will init the pass_builder_."); + return pass_builder_.get(); +} + +contrib::AnalysisConfig::AnalysisConfig(bool use_gpu) { + this->use_gpu = use_gpu; + if (use_gpu) { + pass_builder_.reset(new GpuPassStrategy); + } else { + pass_builder_.reset(new CpuPassStrategy); + } +} + +contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { + // fields from Config + model_dir = other.model_dir; + // fields from NativeConfig + use_gpu = other.use_gpu; + device = other.device; + fraction_of_gpu_memory = other.fraction_of_gpu_memory; + prog_file = other.prog_file; + param_file = other.param_file; + specify_input_name = other.specify_input_name; + // fields from this. + enable_ir_optim = other.enable_ir_optim; + use_feed_fetch_ops = other.use_feed_fetch_ops; + use_tensorrt_ = other.use_tensorrt_; + tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; + tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + + if (use_gpu) { + pass_builder_.reset(new GpuPassStrategy( + *static_cast(other.pass_builder()))); + } else { + pass_builder_.reset(new CpuPassStrategy( + *static_cast(other.pass_builder()))); + } +} + +contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { + // fields from Config + model_dir = other.model_dir; + // fields from NativeConfig + use_gpu = other.use_gpu; + device = other.device; + fraction_of_gpu_memory = other.fraction_of_gpu_memory; + prog_file = other.prog_file; + param_file = other.param_file; + specify_input_name = other.specify_input_name; + // fields from this. 
+ enable_ir_optim = other.enable_ir_optim; + use_feed_fetch_ops = other.use_feed_fetch_ops; + use_tensorrt_ = other.use_tensorrt_; + tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; + tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + pass_builder_ = std::move(other.pass_builder_); +} + +void contrib::AnalysisConfig::EnableMKLDNN() { +#ifdef PADDLE_WITH_MKLDNN + pass_builder()->EnableMKLDNN(); + use_mkldnn_ = true; +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; + use_mkldnn_ = false; +#endif +} + +void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, + int max_batch_size) { + use_tensorrt_ = true; + tensorrt_workspace_size_ = workspace_size; + tensorrt_max_batchsize_ = max_batch_size; + // Append after the infer_clean pass. + pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index dd295854a87c9707aaa85e1e2c6111089e3fe885..76d205b737aeb456f242037f2b375d9c537b39f3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -13,10 +13,13 @@ // limitations under the License. 
#include "paddle/fluid/inference/api/analysis_predictor.h" +#include +#include #include #include #include #include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" @@ -24,6 +27,9 @@ #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" +#if PADDLE_WITH_TENSORRT +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#endif #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" @@ -35,6 +41,17 @@ namespace paddle { using contrib::AnalysisConfig; +namespace { +bool IsPersistable(const framework::VarDesc *var) { + if (var->Persistable() && + var->GetType() != framework::proto::VarType::FEED_MINIBATCH && + var->GetType() != framework::proto::VarType::FETCH_LIST) { + return true; + } + return false; +} +} // namespace + bool AnalysisPredictor::Init( const std::shared_ptr &parent_scope, const std::shared_ptr &program) { @@ -52,36 +69,93 @@ bool AnalysisPredictor::Init( // no matter with or without MKLDNN paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); - if (config_.use_gpu) { - place_ = paddle::platform::CUDAPlace(config_.device); - LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " - "is turned false."; - config_.enable_ir_optim = false; - } else { - place_ = paddle::platform::CPUPlace(); + if (!PrepareScope(parent_scope)) { + return false; + } + if (!CreateExecutor()) { + return false; + } + if (!PrepareProgram(program)) { + return false; + } + + // Prepare executor, create local variables. 
+ if (!PrepareExecutor()) { + return true; } + + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + + return true; +} + +bool AnalysisPredictor::PrepareScope( + const std::shared_ptr &parent_scope) { if (parent_scope) { + PADDLE_ENFORCE_NOT_NULL( + parent_scope, + "Both program and parent_scope should be set in Clone mode."); scope_ = parent_scope; - sub_scope_ = &(parent_scope->NewScope()); + status_is_cloned_ = true; } else { paddle::framework::InitDevices(false); scope_.reset(new paddle::framework::Scope()); + status_is_cloned_ = false; } - - executor_.reset(new paddle::framework::NaiveExecutor(place_)); - + sub_scope_ = &scope_->NewScope(); + return true; +} +bool AnalysisPredictor::PrepareProgram( + const std::shared_ptr &program) { if (!program) { if (!LoadProgramDesc()) return false; - OptimizeInferenceProgram(); + + // Optimize the program, and load parameters and modify them in the + // scope_. + // This will change the scope_ address. + if (config_.enable_ir_optim) { + status_ir_optim_enabled_ = true; + OptimizeInferenceProgram(); + } else { + // If the parent_scope is passed, we assert that the persistable variables + // are already created, so just create the no persistable variables. + + // If not cloned, the parameters should be loaded + // OptimizeInferenceProgram. + // So in both cases, just the local variables are needed to load, not the + // parematers. + executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); + + // Load parameters + LOG(INFO) << "load parameters "; + LoadParameters(); + } } else { + // If the program is passed from external, no need to optimize it, this + // logic is used in the clone scenario. 
inference_program_ = program; } - executor_->Prepare(scope_.get(), *inference_program_, 0, + executor_->CreateVariables(*inference_program_, 0, false, sub_scope_); + + return true; +} +bool AnalysisPredictor::CreateExecutor() { + if (config_.use_gpu) { + status_use_gpu_ = true; + place_ = paddle::platform::CUDAPlace(config_.device); + } else { + place_ = paddle::platform::CPUPlace(); + } + executor_.reset(new paddle::framework::NaiveExecutor(place_)); + return true; +} +bool AnalysisPredictor::PrepareExecutor() { + executor_->Prepare(sub_scope_, *inference_program_, 0, config_.use_feed_fetch_ops); - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); + PADDLE_ENFORCE_NOT_NULL(sub_scope_); return true; } @@ -206,54 +280,40 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, return true; } +// NOTE All the members in AnalysisConfig should be copied to Argument. void AnalysisPredictor::OptimizeInferenceProgram() { - LOG(INFO) << "optimize begin"; - FLAGS_IA_enable_ir = config_.enable_ir_optim; - FLAGS_IA_enable_tensorrt_subgraph_engine = false; - FLAGS_IA_output_storage_path = ""; // Don't output the model. 
+ status_program_optimized_ = true; + + argument_.SetUseGPU(config_.use_gpu); // Analyze inference_program if (!config_.model_dir.empty()) { - argument_.fluid_model_dir.reset(new std::string(config_.model_dir)); + argument_.SetModelDir(config_.model_dir); } else { PADDLE_ENFORCE( !config_.param_file.empty(), "Either model_dir or (param_file, prog_file) should be set."); PADDLE_ENFORCE(!config_.prog_file.empty()); - argument_.fluid_model_program_path.reset( - new std::string(config_.prog_file)); - argument_.fluid_model_param_path.reset(new std::string(config_.param_file)); + argument_.SetModelProgramPath(config_.prog_file); + argument_.SetModelParamsPath(config_.param_file); } - argument_.origin_program_desc.reset( - new ProgramDesc(*inference_program_->Proto())); - - switch (config_.ir_mode) { - case contrib::AnalysisConfig::IrPassMode::kExclude: - Analyzer() - .IncludeAllIrPasses() - .SetUseMkldnn(config_._use_mkldnn) - .DisableIrPasses(config_.ir_passes) - .Run(&argument_); - break; - case contrib::AnalysisConfig::IrPassMode::kInclude: - Analyzer() - .SetUseMkldnn(config_._use_mkldnn) - .IncludeIrPasses(config_.ir_passes) - .Run(&argument_); - break; - default: - LOG(ERROR) << "Only kExclude and kInclude modes are supoorted yet."; + if (config_.use_gpu && config_.use_tensorrt_) { + argument_.SetUseTensorRT(true); + argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); + argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); } - CHECK(argument_.transformed_program_desc); - VLOG(50) << "to prepare executor"; + auto passes = config_.pass_builder()->AllPasses(); + if (!config_.enable_ir_optim) passes.clear(); + argument_.SetIrAnalysisPasses(passes); + argument_.SetScopeNotOwned(const_cast(scope_.get())); + Analyzer().Run(&argument_); + + PADDLE_ENFORCE(argument_.scope_valid()); + VLOG(5) << "to prepare executor"; + ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program); inference_program_.reset( - new 
framework::ProgramDesc(*argument_.transformed_program_desc)); - if (argument_.Has(framework::ir::kParamScopeAttr)) { - // Update scope. - scope_.reset( - argument_.Release(framework::ir::kParamScopeAttr)); - } + new framework::ProgramDesc(argument_.ir_analyzed_program())); LOG(INFO) << "== optimize end =="; } @@ -283,10 +343,12 @@ std::unique_ptr CreatePaddlePredictor< if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } - return predictor; + return std::move(predictor); } void AnalysisPredictor::PrepareFeedFetch() { + PADDLE_ENFORCE_NOT_NULL(sub_scope_); + CreateFeedFetchVar(sub_scope_); for (auto *op : inference_program_->Block(0).AllOps()) { if (op->Type() == "feed") { int idx = boost::get(op->GetAttr("col")); @@ -305,6 +367,14 @@ void AnalysisPredictor::PrepareFeedFetch() { } } +void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { + PADDLE_ENFORCE_NOT_NULL(scope); + auto *var = scope->Var("feed"); + var->GetMutable(); + var = scope->Var("fetch"); + var->GetMutable(); +} + std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); @@ -335,27 +405,98 @@ bool AnalysisPredictor::ZeroCopyRun() { bool AnalysisPredictor::LoadProgramDesc() { // Initialize the inference program - std::unique_ptr tmp_exe( - new framework::Executor(platform::CPUPlace())); + std::string filename; if (!config_.model_dir.empty()) { - // Parameters are saved in separate files sited in - // the specified `dirname`. - inference_program_ = paddle::inference::Load( - static_cast(tmp_exe.get()), scope_.get(), - config_.model_dir); + filename = config_.model_dir + "/__model__"; } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { // All parameters are saved in a single file. // The file names should be consistent with that used // in Python API `fluid.io.save_inference_model`. 
- inference_program_ = paddle::inference::Load( - static_cast(tmp_exe.get()), scope_.get(), - config_.prog_file, config_.param_file); + filename = config_.prog_file; } else { + if (config_.model_dir.empty() && config_.prog_file.empty()) { + LOG(ERROR) + << "Either model_dir or (prog_file, param_file) should be set."; + return false; + } LOG(ERROR) << string::Sprintf( "not valid model path '%s' or program path '%s'.", config_.model_dir, config_.param_file); return false; } + + std::string pb_content; + // Read binary + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + fin.seekg(0, std::ios::end); + + pb_content.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&(pb_content.at(0)), pb_content.size()); + fin.close(); + + // Create ProgramDesc + framework::proto::ProgramDesc proto; + proto.ParseFromString(pb_content); + inference_program_.reset(new framework::ProgramDesc(proto)); + return true; +} + +bool AnalysisPredictor::LoadParameters() { + PADDLE_ENFORCE_NOT_NULL(inference_program_.get(), + "The inference program should be loaded first."); + const auto &global_block = inference_program_->MutableBlock(0); + + // create a temporary program to load parameters. 
+ + std::unique_ptr load_program( + new framework::ProgramDesc()); + framework::BlockDesc *load_block = load_program->MutableBlock(0); + std::vector params; + + for (auto *var : global_block->AllVars()) { + if (IsPersistable(var)) { + VLOG(3) << "persistable variable's name: " << var->Name(); + + framework::VarDesc *new_var = load_block->Var(var->Name()); + new_var->SetShape(var->GetShape()); + new_var->SetDataType(var->GetDataType()); + new_var->SetType(var->GetType()); + new_var->SetLoDLevel(var->GetLoDLevel()); + new_var->SetPersistable(true); + + if (!config_.param_file.empty()) { + params.push_back(new_var->Name()); + } else { + // append_op + framework::OpDesc *op = load_block->AppendOp(); + op->SetType("load"); + op->SetOutput("Out", {new_var->Name()}); + op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()}); + op->CheckAttrs(); + } + } + } + + if (!config_.param_file.empty()) { + // sort paramlist to have consistent ordering + std::sort(params.begin(), params.end()); + // append just the load_combine op + framework::OpDesc *op = load_block->AppendOp(); + op->SetType("load_combine"); + op->SetOutput("Out", params); + op->SetAttr("file_path", {config_.param_file}); + op->CheckAttrs(); + } + + // Use NaiveExecutor to Load parameters. 
+ platform::CPUPlace place; + framework::NaiveExecutor e(place); + e.Prepare(scope_.get(), *load_program, 0, false); + e.Run(); + VLOG(3) << "get " << scope_->LocalVarNames().size() << " vars after load"; + return true; } @@ -385,3 +526,27 @@ std::unique_ptr CreatePaddlePredictor( } } // namespace paddle + +#if PADDLE_WITH_TENSORRT +USE_TRT_CONVERTER(elementwise_add_weight); +USE_TRT_CONVERTER(elementwise_add_tensor); +USE_TRT_CONVERTER(elementwise_sub_tensor); +USE_TRT_CONVERTER(elementwise_div_tensor); +USE_TRT_CONVERTER(elementwise_mul_tensor); +USE_TRT_CONVERTER(elementwise_max_tensor); +USE_TRT_CONVERTER(elementwise_min_tensor); +USE_TRT_CONVERTER(elementwise_pow_tensor); +USE_TRT_CONVERTER(mul); +USE_TRT_CONVERTER(conv2d); +USE_TRT_CONVERTER(relu); +USE_TRT_CONVERTER(sigmoid); +USE_TRT_CONVERTER(tanh); +USE_TRT_CONVERTER(fc); +USE_TRT_CONVERTER(pool2d); +USE_TRT_CONVERTER(softmax); +USE_TRT_CONVERTER(batch_norm); +USE_TRT_CONVERTER(concat); +USE_TRT_CONVERTER(dropout); +USE_TRT_CONVERTER(pad); +USE_TRT_CONVERTER(split); +#endif diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index a9f4cce6dfa1c92301f57a7b1dd024a61f99d5ab..cf81b7db738d899566ddf32c5e5a40475c8e7bc7 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -23,7 +23,10 @@ #include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/string/printf.h" - +#ifdef PADDLE_WITH_TESTING +#include +#include +#endif namespace paddle { using inference::analysis::Argument; @@ -54,6 +57,7 @@ class AnalysisPredictor : public PaddlePredictor { bool ZeroCopyRun() override; + void CreateFeedFetchVar(framework::Scope *scope); void PrepareFeedFetch(); void OptimizeInferenceProgram(); @@ -62,11 +66,17 @@ class AnalysisPredictor : public PaddlePredictor { std::unique_ptr Clone() override; - framework::Scope 
*scope() { return executor_->scope(); } + framework::Scope *scope() { return scope_.get(); } framework::ProgramDesc &program() { return *inference_program_; } protected: + bool PrepareProgram(const std::shared_ptr &program); + bool PrepareScope(const std::shared_ptr &parent_scope); + bool CreateExecutor(); + bool PrepareExecutor(); + bool LoadProgramDesc(); + bool LoadParameters(); bool SetFeed(const std::vector &input_datas, framework::Scope *scope); @@ -77,6 +87,14 @@ class AnalysisPredictor : public PaddlePredictor { PaddleTensor *output_data); ~AnalysisPredictor(); +// Some more detailed tests, they are made the friends of the predictor, so that +// the all the details can be tested. +#if PADDLE_WITH_TESTING + FRIEND_TEST(AnalysisPredictor, analysis_off); + FRIEND_TEST(AnalysisPredictor, analysis_on); + FRIEND_TEST(AnalysisPredictor, with_gpu); +#endif + private: contrib::AnalysisConfig config_; Argument argument_; @@ -92,6 +110,13 @@ class AnalysisPredictor : public PaddlePredictor { // concurrency problems, so cache them. std::vector feed_tensors_; details::TensorArrayBatchCleaner tensor_array_batch_cleaner_; + + private: + // Some status here that help to determine the status inside the predictor. + bool status_program_optimized_{false}; + bool status_is_cloned_{false}; + bool status_use_gpu_{false}; + bool status_ir_optim_enabled_{false}; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index f75c45f3a0438bc437e716160af8c5eab5b10fce..d67305670c91bb0814b8771332641e96974ade9d 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -12,16 +12,85 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/inference/api/analysis_predictor.h" #include #include +#include // NOLINT +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" DEFINE_string(dirname, "", "dirname to tests."); namespace paddle { -namespace inference { using contrib::AnalysisConfig; +TEST(AnalysisPredictor, analysis_off) { + AnalysisConfig config(false); + config.model_dir = FLAGS_dirname; + config.enable_ir_optim = false; + + auto _predictor = CreatePaddlePredictor(config); + auto* predictor = static_cast(_predictor.get()); + + // Without analysis, the scope_ and sub_scope_ are created by predictor + // itself. + ASSERT_TRUE(predictor->scope_); + ASSERT_TRUE(predictor->sub_scope_); + ASSERT_EQ(predictor->scope_->parent(), nullptr); + ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get()); + // ir is turned off, so program shouldn't be optimized. + ASSERT_FALSE(predictor->status_program_optimized_); + LOG(INFO) << "scope parameters " << predictor->scope_->LocalVarNames().size(); + + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector outputs; + ASSERT_TRUE(predictor->Run(inputs, &outputs)); +} + +TEST(AnalysisPredictor, analysis_on) { + AnalysisConfig config(false); + config.model_dir = FLAGS_dirname; + config.enable_ir_optim = true; + + auto _predictor = CreatePaddlePredictor(config); + auto* predictor = static_cast(_predictor.get()); + + ASSERT_TRUE(predictor->scope_); + ASSERT_TRUE(predictor->sub_scope_); + ASSERT_EQ(predictor->scope_->parent(), nullptr); + ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get()); + // ir is turned on, so program should be optimized. + ASSERT_TRUE(predictor->status_program_optimized_); + // 2. 
Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector outputs; + ASSERT_TRUE(predictor->Run(inputs, &outputs)); + + for (auto& output : outputs) { + LOG(INFO) << inference::DescribeTensor(output); + } + + // compare with NativePredictor + auto naive_predictor = CreatePaddlePredictor(config); + std::vector naive_outputs; + ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs)); + ASSERT_EQ(naive_outputs.size(), 1UL); + inference::CompareTensor(outputs.front(), naive_outputs.front()); +} + TEST(AnalysisPredictor, ZeroCopy) { AnalysisConfig config; config.model_dir = FLAGS_dirname; @@ -61,5 +130,59 @@ TEST(AnalysisPredictor, ZeroCopy) { LOG(INFO) << "output_data: " << out_data; } -} // namespace inference +TEST(AnalysisPredictor, Clone) { + AnalysisConfig config; + config.model_dir = FLAGS_dirname; + config.use_feed_fetch_ops = true; + config.enable_ir_optim = true; + + std::vector> predictors; + predictors.emplace_back(CreatePaddlePredictor(config)); + + LOG(INFO) << "************** to clone ************************"; + const int num_threads = 3; + for (int i = 1; i < num_threads; i++) { + predictors.emplace_back(predictors.front()->Clone()); + } + + auto* root_scope = + static_cast(predictors[0].get())->scope(); + ASSERT_FALSE(root_scope->kids().empty()); + LOG(INFO) << "***** scope ******\n" + << framework::GenScopeTreeDebugInfo(root_scope); + + // 2. 
Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector outputs; + predictors[0]->Run(inputs, &outputs); + + LOG(INFO) << "Run with single thread"; + for (int i = 0; i < num_threads; i++) { + LOG(INFO) << "run predictor " << i; + ASSERT_TRUE(predictors[i]->Run(inputs, &outputs)); + } + + LOG(INFO) << "Run with multiple threads"; + std::vector threads; + for (int i = 0; i < num_threads; i++) { + threads.emplace_back([&predictors, &inputs, i] { + LOG(INFO) << "thread #" << i << " running"; + std::vector outputs; + for (int j = 0; j < 10; j++) { + ASSERT_TRUE(predictors[i]->Run(inputs, &outputs)); + } + }); + } + + for (auto& t : threads) { + t.join(); + } +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 20fab8078fedf837564496aa296648bf5970a348..9be059c73e20ebeeff2c4b6e8e5502e4a56fd0d6 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index 04536ea3a53bbbc9293d92e69a23567e4bfd84c0..6a8b81cc57281b12cd3a4c89c863b20a824ce34a 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -19,11 +19,13 @@ limitations under the License. 
*/ #pragma once +#define WITH_ANAKIN + #include #include "framework/core/net/net.h" #include "framework/graph/graph.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_anakin_config.h" #include "saber/core/shape.h" #include "saber/saber_types.h" diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 5152b8670ddb206f0927c03149684af4a096df42..014bdc6a379744463e535df97af4c9c2e1651656 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -292,7 +292,14 @@ TEST(inference_api_native, image_classification_gpu) { // TEST(inference_api_native, image_classification_gpu_threads) { // MainThreadsImageClassification(true /*use_gpu*/); // } - #endif +TEST(PassBuilder, Delete) { + contrib::AnalysisConfig config(false); + config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); + const auto& passes = config.pass_builder()->AllPasses(); + auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass"); + ASSERT_EQ(it, passes.end()); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc deleted file mode 100644 index 94b3933497daac1a4db1787994ea1bc33ec4e74f..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/api/api_impl.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/operators/tensorrt_engine_op.h" - -namespace paddle { - -using inference::analysis::Argument; -using inference::Singleton; -using inference::analysis::Analyzer; -using framework::proto::ProgramDesc; -using paddle::contrib::MixedRTConfig; - -class TensorRTSubgraphPredictor : public NativePaddlePredictor { - public: - explicit TensorRTSubgraphPredictor(const MixedRTConfig& config) - : NativePaddlePredictor(config), config_(config) {} - - bool Init(const std::shared_ptr& parent_scope) { - FLAGS_IA_enable_tensorrt_subgraph_engine = true; - VLOG(30) << "Predictor::init()"; - if (config_.use_gpu) { - place_ = paddle::platform::CUDAPlace(config_.device); - } else { - place_ = paddle::platform::CPUPlace(); - } - if (parent_scope) { - scope_ = parent_scope; - sub_scope_ = &(parent_scope->NewScope()); - } else { - paddle::framework::InitDevices(false); - scope_.reset(new paddle::framework::Scope()); - } - - executor_.reset(new paddle::framework::Executor(place_)); - - // Initialize the inference program - if (!config_.model_dir.empty()) { - // Parameters are saved in separate files sited in - // the specified `dirname`. 
- inference_program_ = paddle::inference::Load( - executor_.get(), scope_.get(), config_.model_dir); - } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { - // All parameters are saved in a single file. - // The file names should be consistent with that used - // in Python API `fluid.io.save_inference_model`. - inference_program_ = paddle::inference::Load( - executor_.get(), scope_.get(), config_.prog_file, config_.param_file); - } else { - LOG(ERROR) << "fail to load inference model."; - return false; - } - - OptimizeInferenceProgram(); - ctx_ = executor_->Prepare(*inference_program_, 0); - - VLOG(50) << "to create variables"; - executor_->CreateVariables(*inference_program_, - sub_scope_ ? sub_scope_ : scope_.get(), 0); - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); - return true; - } - - bool Run(const std::vector& inputs, - std::vector* output_data, - int batch_size = -1) override { - PADDLE_ENFORCE_GT(batch_size, 0, - "TensorRT engine needs the argument batch_size set"); - FLAGS_tensorrt_engine_batch_size = batch_size; - return NativePaddlePredictor::Run(inputs, output_data, batch_size); - } - - void OptimizeInferenceProgram() { - // Analyze inference_program - Argument argument; - - argument.Set("minimum_subgraph_size", - new int(config_.minimum_subgraph_size)); - argument.Set("max_batch_size", new int(config_.max_batch_size)); - argument.Set("workspace_size", new int(config_.workspace_size)); - argument.Set("precision_mode", - new std::string(config_.precision_mode)); - - if (!config_.model_dir.empty()) { - argument.fluid_model_dir.reset(new std::string(config_.model_dir)); - } else { - PADDLE_ENFORCE( - !config_.param_file.empty(), - "Either model_dir or (param_file, prog_file) should be set."); - PADDLE_ENFORCE(!config_.prog_file.empty()); - argument.fluid_model_program_path.reset( - new std::string(config_.prog_file)); - argument.fluid_model_param_path.reset( - new std::string(config_.param_file)); - } - 
argument.origin_program_desc.reset( - new ProgramDesc(*inference_program_->Proto())); - Singleton::Global().Run(&argument); - CHECK(argument.transformed_program_desc); - VLOG(50) << "transformed program:\n" - << argument.transformed_program_desc->SerializeAsString(); - VLOG(50) << "to prepare executor"; - inference_program_.reset( - new framework::ProgramDesc(*argument.transformed_program_desc)); - } - - private: - MixedRTConfig config_; -}; - -template <> -std::unique_ptr -CreatePaddlePredictor( - const MixedRTConfig& config) { - VLOG(30) << "create TensorRTSubgraphPredictor"; - if (config.use_gpu) { - // 1. GPU memeroy - PADDLE_ENFORCE_GT( - config.fraction_of_gpu_memory, 0.f, - "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); - PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); - std::vector flags; - if (config.fraction_of_gpu_memory >= 0.0f || - config.fraction_of_gpu_memory <= 0.95f) { - flags.push_back("dummpy"); - std::string flag = "--fraction_of_gpu_memory_to_use=" + - std::to_string(config.fraction_of_gpu_memory); - flags.push_back(flag); - VLOG(30) << "set flag: " << flag; - framework::InitGflags(flags); - } - } - - std::unique_ptr predictor( - new TensorRTSubgraphPredictor(config)); - if (!dynamic_cast(predictor.get()) - ->Init(nullptr)) { - return nullptr; - } - return std::move(predictor); -} - -template <> -std::unique_ptr CreatePaddlePredictor( - const MixedRTConfig& config) { - return CreatePaddlePredictor(config); -} - -} // namespace paddle - -USE_TRT_CONVERTER(elementwise_add_weight); -USE_TRT_CONVERTER(elementwise_add_tensor); -USE_TRT_CONVERTER(elementwise_sub_tensor); -USE_TRT_CONVERTER(elementwise_div_tensor); -USE_TRT_CONVERTER(elementwise_mul_tensor); -USE_TRT_CONVERTER(elementwise_max_tensor); -USE_TRT_CONVERTER(elementwise_min_tensor); -USE_TRT_CONVERTER(elementwise_pow_tensor); -USE_TRT_CONVERTER(mul); -USE_TRT_CONVERTER(conv2d); -USE_TRT_CONVERTER(relu); -USE_TRT_CONVERTER(sigmoid); 
-USE_TRT_CONVERTER(tanh); -USE_TRT_CONVERTER(fc); -USE_TRT_CONVERTER(pool2d); -USE_TRT_CONVERTER(softmax); -USE_TRT_CONVERTER(batch_norm); -USE_TRT_CONVERTER(concat); -USE_TRT_CONVERTER(dropout); -USE_TRT_CONVERTER(pad); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc deleted file mode 100644 index 89c9a65cb06ba565f0e0cbdb9b6031c6adbcb64e..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" - -namespace paddle { - -using contrib::MixedRTConfig; - -DEFINE_string(dirname, "", "Directory of the inference model."); - -void CompareTensorRTWithFluid(bool enable_tensorrt) { - FLAGS_IA_enable_tensorrt_subgraph_engine = enable_tensorrt; - - //# 1. Create PaddlePredictor with a config. 
- NativeConfig config0; - config0.model_dir = FLAGS_dirname; - config0.use_gpu = true; - config0.fraction_of_gpu_memory = 0.3; - config0.device = 0; - - MixedRTConfig config1; - config1.model_dir = FLAGS_dirname; - config1.use_gpu = true; - config1.fraction_of_gpu_memory = 0.3; - config1.device = 0; - config1.max_batch_size = 10; - - auto predictor0 = CreatePaddlePredictor(config0); - auto predictor1 = CreatePaddlePredictor(config1); - - for (int batch_id = 0; batch_id < 1; batch_id++) { - //# 2. Prepare input. - std::vector data(20); - for (int i = 0; i < 20; i++) data[i] = i; - - PaddleTensor tensor; - tensor.shape = std::vector({10, 1}); - tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)); - tensor.dtype = PaddleDType::INT64; - - // For simplicity, we set all the slots with the same data. - std::vector slots(4, tensor); - - //# 3. Run - std::vector outputs0; - std::vector outputs1; - CHECK(predictor0->Run(slots, &outputs0)); - CHECK(predictor1->Run(slots, &outputs1, 10)); - - //# 4. Get output. 
- ASSERT_EQ(outputs0.size(), 1UL); - ASSERT_EQ(outputs1.size(), 1UL); - - const size_t num_elements = outputs0.front().data.length() / sizeof(float); - const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); - EXPECT_EQ(num_elements, num_elements1); - - auto *data0 = static_cast(outputs0.front().data.data()); - auto *data1 = static_cast(outputs1.front().data.data()); - - ASSERT_GT(num_elements, 0UL); - for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { - EXPECT_NEAR(data0[i], data1[i], 1e-3); - } - } -} - -TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) { - CompareTensorRTWithFluid(false); -} - -TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) { - CompareTensorRTWithFluid(true); -} - -} // namespace paddle diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index 5446fd4d4256c10442a53ea09a447cf308cbd681..3dd1d3c838c4b1bcdefdadff16b02dbfb4a02ee9 100644 --- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -23,7 +23,7 @@ limitations under the License. */ #include #include //NOLINT -#include "paddle/include/paddle_inference_api.h" +#include "utils.h" // NOLINT DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_bool(use_gpu, false, "Whether use gpu."); diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 6460514f3f80cac3c5e52560ab61b5cc7fd74636..0eb620ea516d28fb9598af8dbd297e84580a99f9 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -36,14 +36,13 @@ namespace demo { */ void Main() { std::unique_ptr predictor; - paddle::contrib::MixedRTConfig config; + paddle::contrib::AnalysisConfig config(true); config.param_file = FLAGS_modeldir + "/__params__"; config.prog_file = FLAGS_modeldir + "/__model__"; - config.use_gpu = true; config.device = 0; - config.max_batch_size = 1; + config.EnableTensorRtEngine(); config.fraction_of_gpu_memory = 0.1; // set by yourself - predictor = CreatePaddlePredictor(config); + predictor = CreatePaddlePredictor(config); VLOG(30) << "begin to process data"; // Just a single batch of data. diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index d747f855803a6997d08957b5d35a56a0fe4160c5..bc8891455dc8e4a30ddfcc5f89792296e59c2548 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -17,7 +17,7 @@ limitations under the License. */ */ #include -#include // use glog instead of CHECK to avoid importing other paddle header files. 
+#include #include "utils.h" // NOLINT #ifdef PADDLE_WITH_CUDA @@ -40,20 +40,17 @@ using contrib::AnalysisConfig; */ void Main(bool use_gpu) { std::unique_ptr predictor, analysis_predictor; - AnalysisConfig config; + AnalysisConfig config(use_gpu); config.param_file = FLAGS_modeldir + "/__params__"; config.prog_file = FLAGS_modeldir + "/__model__"; - config.use_gpu = use_gpu; config.device = 0; if (FLAGS_use_gpu) { config.fraction_of_gpu_memory = 0.1; // set by yourself } - VLOG(30) << "init predictor"; predictor = CreatePaddlePredictor(config); - analysis_predictor = CreatePaddlePredictor(config); + analysis_predictor = CreatePaddlePredictor(config); - VLOG(30) << "begin to process data"; // Just a single batch of data. std::string line; std::ifstream file(FLAGS_data); @@ -68,13 +65,10 @@ void Main(bool use_gpu) { PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); input.dtype = PaddleDType::FLOAT32; - VLOG(30) << "run executor"; std::vector output, analysis_output; predictor->Run({input}, &output, 1); - VLOG(30) << "output.size " << output.size(); auto& tensor = output.front(); - VLOG(30) << "output: " << SummaryTensor(tensor); // compare with reference result CheckOutput(FLAGS_refer, tensor); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 14698f6dfc8885ec1d35f1912bad10a9caa13db4..0f540699b8ffea94c3f3aaf3354a0462e0e674a9 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -51,7 +51,7 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) { } template -T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { auto *tensor = static_cast(FindTensor()); auto *res = tensor->data(); @@ -67,8 +67,10 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { return res; } -template float *ZeroCopyTensor::data(PaddlePlace *place, int 
*size); -template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template float *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; template float *ZeroCopyTensor::mutable_data(PaddlePlace place); template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); @@ -84,7 +86,7 @@ void *ZeroCopyTensor::FindTensor() const { return tensor; } -std::vector ZeroCopyTensor::shape() { +std::vector ZeroCopyTensor::shape() const { auto *tensor = static_cast(FindTensor()); PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_); return framework::vectorize(tensor->dims()); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc index 2d5b561d801cd9e734cab13b28e7285493e30f94..12071e09f8442f2c52a06b7c3fe4bed2c28b524a 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -24,18 +24,20 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) { } template -T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { return nullptr; } -template float *ZeroCopyTensor::data(PaddlePlace *place, int *size); -template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template float *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; template float *ZeroCopyTensor::mutable_data(PaddlePlace place); template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { return nullptr; } -std::vector ZeroCopyTensor::shape() { return {}; } +std::vector ZeroCopyTensor::shape() const { return {}; } void ZeroCopyTensor::SetLoD(const std::vector> &x) {} diff --git a/paddle/fluid/inference/api/helper.h 
b/paddle/fluid/inference/api/helper.h index af21c0095c28b26c0ef4afc83572a9681d49d497..6f9d663121004470d57c17b8154d725fdf2b9689 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,9 +15,14 @@ #pragma once #include +#if !defined(_WIN32) #include +#else +#endif + #include #include // NOLINT +#include #include #include #include @@ -125,6 +130,51 @@ static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, return size; } +static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) { + if (a.dtype != b.dtype) { + LOG(ERROR) << "dtype not match"; + return false; + } + + if (a.lod.size() != b.lod.size()) { + LOG(ERROR) << "lod not match"; + return false; + } + for (size_t i = 0; i < a.lod.size(); i++) { + if (a.lod[i].size() != b.lod[i].size()) { + LOG(ERROR) << "lod not match"; + return false; + } + for (size_t j = 0; j < a.lod[i].size(); j++) { + if (a.lod[i][j] != b.lod[i][j]) { + LOG(ERROR) << "lod not match"; + return false; + } + } + } + + if (a.shape.size() != b.shape.size()) { + LOG(INFO) << "shape not match"; + return false; + } + for (size_t i = 0; i < a.shape.size(); i++) { + if (a.shape[i] != b.shape[i]) { + LOG(ERROR) << "shape not match"; + return false; + } + } + + auto *adata = static_cast(a.data.data()); + auto *bdata = static_cast(b.data.data()); + for (int i = 0; i < VecReduceToInt(a.shape); i++) { + if (adata[i] != bdata[i]) { + LOG(ERROR) << "data not match"; + return false; + } + } + return true; +} + static std::string DescribeTensor(const PaddleTensor &tensor) { std::stringstream os; os << "Tensor [" << tensor.name << "]\n"; @@ -157,6 +207,26 @@ static std::string DescribeTensor(const PaddleTensor &tensor) { return os.str(); } +static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) { + std::stringstream os; + os << "Tensor [" << tensor.name() << "]\n"; + + os << " - shape: " << to_string(tensor.shape()) << '\n'; + os << " - lod: "; + for (auto &l : tensor.lod()) { + os << 
to_string(l) << "; "; + } + os << "\n"; + os << " - data: "; + PaddlePlace place; + int size; + const auto *data = tensor.data(&place, &size); + for (int i = 0; i < size; i++) { + os << data[i] << " "; + } + return os.str(); +} + static void PrintTime(int batch_size, int repeat, int num_threads, int tid, double latency, int epoch = 1) { LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat diff --git a/paddle/fluid/inference/analysis/analyzer_main.cc b/paddle/fluid/inference/api/paddle_anakin_config.h similarity index 56% rename from paddle/fluid/inference/analysis/analyzer_main.cc rename to paddle/fluid/inference/api/paddle_anakin_config.h index 5e1fe3eb797cdced56a61aa2db0c3d18601824f8..0e91c2624bed4459b936ac4477d73ae954e55bcc 100644 --- a/paddle/fluid/inference/analysis/analyzer_main.cc +++ b/paddle/fluid/inference/api/paddle_anakin_config.h @@ -11,23 +11,25 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#pragma once -/* - * This file implements analysizer -- an executation help to analyze and - * optimize trained model. - */ -#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include +#include +#include +#include +#include -int main(int argc, char** argv) { - google::ParseCommandLineFlags(&argc, &argv, true); - using paddle::inference::analysis::Analyzer; - using paddle::inference::analysis::Argument; +#include "paddle_api.h" // NOLINT - Argument argument; - Analyzer analyzer; - analyzer.Run(&argument); +namespace paddle { +namespace contrib { +// Configurations for Anakin engine. 
+struct AnakinConfig : public PaddlePredictor::Config { + enum TargetType { NVGPU = 0, X86 }; + int device; + std::string model_file; + int max_batch_size{-1}; + TargetType target_type; +}; - return 0; -} +} // namespace contrib +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h new file mode 100644 index 0000000000000000000000000000000000000000..2ac736df7ccd54babe582ca1383903c191069d33 --- /dev/null +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -0,0 +1,79 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include + +// Here we include some header files with relative paths, for that in deploy, +// the abstract path of this header file will be changed. +#include "paddle_api.h" // NOLINT +#include "paddle_pass_builder.h" // NOLINT + +namespace paddle { + +class AnalysisPredictor; +// == +// +// ----------------------------------------------------------------------------------- +// NOTE: The following APIs are not mature yet, we are still working on them. +namespace contrib { + +// NOTE WIP, not stable yet. 
+struct AnalysisConfig : public NativeConfig { + explicit AnalysisConfig(bool use_gpu = false); + explicit AnalysisConfig(const AnalysisConfig& other); + explicit AnalysisConfig(AnalysisConfig&& other); + + // Determine whether to perform graph optimization. + bool enable_ir_optim = true; + + // Get a pass builder for customize the passes in IR analysis phase. + PassStrategy* pass_builder() const; + + // NOT stable yet. + bool use_feed_fetch_ops{true}; + + void EnableTensorRtEngine(int workspace_size = 1 << 20, + int max_batch_size = 1); + bool use_tensorrt() const { return use_tensorrt_; } + + // NOTE this is just for internal development, please not use it. + // NOT stable yet. + void EnableMKLDNN(); + bool use_mkldnn() const { return use_mkldnn_; } + + friend class ::paddle::AnalysisPredictor; + + protected: + bool use_tensorrt_{false}; + bool use_mkldnn_{false}; + int tensorrt_workspace_size_; + int tensorrt_max_batchsize_; + std::unique_ptr pass_builder_; +}; + +// Configurations for Anakin engine. +struct AnakinConfig : public PaddlePredictor::Config { + enum TargetType { NVGPU = 0, X86 }; + int device; + std::string model_file; + int max_batch_size{-1}; + TargetType target_type; +}; + +} // namespace contrib +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h new file mode 100644 index 0000000000000000000000000000000000000000..0a2a2a1a23401b5aa4d3402da6f7a3369280d8f5 --- /dev/null +++ b/paddle/fluid/inference/api/paddle_api.h @@ -0,0 +1,220 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include + +namespace paddle { + +// Data type. +enum PaddleDType { + FLOAT32, + INT64, + // TODO(Superjomn) support more data types if needed. +}; + +/* + * Memory menage for PaddleTensor. + * The PaddleBuf holds a buffer for data input or output. The memory can be + * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf + * should be reused for better performance. + * + * For user allocated memory, the following API can be used: + * - PaddleBuf(void* data, size_t length) to set an external memory by + * specifying + * the memory address and length. + * - Reset(void* data, size_t length) to reset the PaddleBuf with an external + * memory. + * ATTENTION, for user allocated memory, deallocation should be done by users + * externally after the program finished. The PaddleBuf won't do any allocation + * or deallocation. + * + * To have the PaddleBuf allocate and manage the memory: + * - PaddleBuf(size_t length) will allocate a memory of size `length`. + * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION + * if the allocated memory is larger than `length`, nothing will done. + */ +class PaddleBuf { + public: + // PaddleBuf allocate memory internally, and manage it. + explicit PaddleBuf(size_t length) + : data_(new char[length]), length_(length), memory_owned_(true) {} + // Set external memory, the PaddleBuf won't manage it. 
+ PaddleBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_{false} {} + // Copy only available when memory is managed externally. + explicit PaddleBuf(const PaddleBuf&); + + // Resize the memory. + void Resize(size_t length); + // Reset to external memory, with address and length set. + void Reset(void* data, size_t length); + // Tell whether the buffer is empty. + bool empty() const { return length_ == 0; } + // Get the memory address. + void* data() const { return data_; } + // Get the memory length. + size_t length() const { return length_; } + + ~PaddleBuf() { Free(); } + PaddleBuf& operator=(const PaddleBuf&); + PaddleBuf& operator=(PaddleBuf&&); + PaddleBuf() = default; + PaddleBuf(PaddleBuf&& other); + + private: + void Free(); + void* data_{nullptr}; // pointer to the data memory. + size_t length_{0}; // number of memory bytes. + bool memory_owned_{true}; +}; + +// Basic input and output data structure for PaddlePredictor. +struct PaddleTensor { + PaddleTensor() = default; + std::string name; // variable name. + std::vector shape; + PaddleBuf data; // blob of data. + PaddleDType dtype; + std::vector> lod; // Tensor+LoD equals LoDTensor +}; + +enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; +// Tensor without copy, currently only supports AnalysisPredictor. +class ZeroCopyTensor { + public: + void Reshape(const std::vector& shape); + + // Get the memory in CPU or GPU with specific data type, should Reshape first + // to tell the data size. + // Once can directly call this data to feed the data. + // This is for write the input tensor. + template + T* mutable_data(PaddlePlace place); + // Get the memory directly, will return the place and memory size by pointer. + // This is for reading the output tensor. 
+ template + T* data(PaddlePlace* place, int* size) const; + + std::vector shape() const; + + void SetLoD(const std::vector>& x); + std::vector> lod() const; + const std::string& name() const { return name_; } + + protected: + explicit ZeroCopyTensor(void* scope) : scope_{scope} {} + void SetName(const std::string& name) { name_ = name; } + void* FindTensor() const; + + private: + std::string name_; + bool input_or_output_; + friend class AnalysisPredictor; + void* scope_{nullptr}; +}; + +/* + * A simple Inference API for Paddle. + */ +class PaddlePredictor { + public: + struct Config; + PaddlePredictor() = default; + PaddlePredictor(const PaddlePredictor&) = delete; + PaddlePredictor& operator=(const PaddlePredictor&) = delete; + + // Predict an record. + // The caller should be responsible for allocating and releasing the memory of + // `inputs`. `inputs` should be available until Run returns. Caller should be + // responsible for the output tensor's buffer, either allocated or passed from + // outside. + virtual bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) = 0; + + // Zero copy input and output optimization. + // Get the input or output tensors, and operate on their memory directly, + // without copy. + virtual std::unique_ptr GetInputTensor( + const std::string& name) { + return nullptr; + } + virtual std::unique_ptr GetOutputTensor( + const std::string& name) { + return nullptr; + } + virtual bool ZeroCopyRun() { return false; } + + // Clone a predictor that share the model weights, the Cloned predictor should + // be thread-safe. + virtual std::unique_ptr Clone() = 0; + + // Destroy the Predictor. + virtual ~PaddlePredictor() = default; + + // The common configs for all the predictors. + struct Config { + std::string model_dir; // path to the model directory. + }; +}; + +struct NativeConfig : public PaddlePredictor::Config { + // GPU related fields. 
+ bool use_gpu{false}; + int device{0}; + float fraction_of_gpu_memory{-1.f}; // Change to a float in (0,1] if needed. + + // Specify the exact path of program and parameter files. + std::string prog_file; + std::string param_file; + + // Specify the variable's name of each input if input tensors don't follow the + // `feeds` and `fetches` of the phase `save_inference_model`. + bool specify_input_name{false}; +}; + +// A factory to help create different predictors. +// +// Usage: +// +// NativeConfig config; +// ... // change the configs. +// auto native_predictor = CreatePaddlePredictor(config); +// +// FOR EXTENSION DEVELOPER: +// Different predictors are designated by config type. Similar configs can be +// merged, but there shouldn't be a huge config containing different fields for +// more than one kind of predictors. +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +// NOTE The following APIs are too trivial, we will discard it in the following +// versions. +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. + kAnalysis, // More optimization. + kAnakin // Use Anakin for inference, not mature yet. +}; + +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +int PaddleDtypeSize(PaddleDType dtype); + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index a755ccb93bdee018dfeaf91157e7971b4d4cd832..92fb51d647cf4e2c8a4914d8df2e8b7b6318d1d1 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -26,265 +26,9 @@ limitations under the License. */ #include #include -namespace paddle { - -// Data type. -enum PaddleDType { - FLOAT32, - INT64, - // TODO(Superjomn) support more data types if needed. -}; - -/* - * Memory menage for PaddleTensor. 
- * The PaddleBuf holds a buffer for data input or output. The memory can be - * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf - * should be reused for better performance. - * - * For user allocated memory, the following API can be used: - * - PaddleBuf(void* data, size_t length) to set an external memory by - * specifying - * the memory address and length. - * - Reset(void* data, size_t length) to reset the PaddleBuf with an external - * memory. - * ATTENTION, for user allocated memory, deallocation should be done by users - * externally after the program finished. The PaddleBuf won't do any allocation - * or deallocation. - * - * To have the PaddleBuf allocate and manage the memory: - * - PaddleBuf(size_t length) will allocate a memory of size `length`. - * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION - * if the allocated memory is larger than `length`, nothing will done. - */ -class PaddleBuf { - public: - // PaddleBuf allocate memory internally, and manage it. - explicit PaddleBuf(size_t length) - : data_(new char[length]), length_(length), memory_owned_(true) {} - // Set external memory, the PaddleBuf won't manage it. - PaddleBuf(void* data, size_t length) - : data_(data), length_(length), memory_owned_{false} {} - // Copy only available when memory is managed externally. - explicit PaddleBuf(const PaddleBuf&); - - // Resize the memory. - void Resize(size_t length); - // Reset to external memory, with address and length set. - void Reset(void* data, size_t length); - // Tell whether the buffer is empty. - bool empty() const { return length_ == 0; } - // Get the memory address. - void* data() const { return data_; } - // Get the memory length. 
- size_t length() const { return length_; } - - ~PaddleBuf() { Free(); } - PaddleBuf& operator=(const PaddleBuf&); - PaddleBuf& operator=(PaddleBuf&&); - PaddleBuf() = default; - PaddleBuf(PaddleBuf&& other); - - private: - void Free(); - void* data_{nullptr}; // pointer to the data memory. - size_t length_{0}; // number of memory bytes. - bool memory_owned_{true}; -}; - -// Basic input and output data structure for PaddlePredictor. -struct PaddleTensor { - PaddleTensor() = default; - std::string name; // variable name. - std::vector shape; - PaddleBuf data; // blob of data. - PaddleDType dtype; - std::vector> lod; // Tensor+LoD equals LoDTensor -}; - -enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; -// Tensor without copy, currently only supports AnalysisPredictor. -class ZeroCopyTensor { - public: - void Reshape(const std::vector& shape); - - // Get the memory in CPU or GPU with specific data type, should Reshape first - // to tell the data size. - // Once can directly call this data to feed the data. - // This is for write the input tensor. - template - T* mutable_data(PaddlePlace place); - // Get the memory directly, will return the place and memory size by pointer. - // This is for reading the output tensor. - template - T* data(PaddlePlace* place, int* size); - - std::vector shape(); - - void SetLoD(const std::vector>& x); - std::vector> lod() const; - - protected: - explicit ZeroCopyTensor(void* scope) : scope_{scope} {} - void SetName(const std::string& name) { name_ = name; } - void* FindTensor() const; - - private: - std::string name_; - bool input_or_output_; - friend class AnalysisPredictor; - void* scope_{nullptr}; -}; - -/* - * A simple Inference API for Paddle. - */ -class PaddlePredictor { - public: - struct Config; - PaddlePredictor() = default; - PaddlePredictor(const PaddlePredictor&) = delete; - PaddlePredictor& operator=(const PaddlePredictor&) = delete; - - // Predict an record. 
- // The caller should be responsible for allocating and releasing the memory of - // `inputs`. `inputs` should be available until Run returns. Caller should be - // responsible for the output tensor's buffer, either allocated or passed from - // outside. - virtual bool Run(const std::vector& inputs, - std::vector* output_data, - int batch_size = -1) = 0; - - // Zero copy input and output optimization. - // Get the input or output tensors, and operate on their memory directly, - // without copy. - virtual std::unique_ptr GetInputTensor( - const std::string& name) { - return nullptr; - } - virtual std::unique_ptr GetOutputTensor( - const std::string& name) { - return nullptr; - } - virtual bool ZeroCopyRun() { return false; } - - // Clone a predictor that share the model weights, the Cloned predictor should - // be thread-safe. - virtual std::unique_ptr Clone() = 0; - - // Destroy the Predictor. - virtual ~PaddlePredictor() = default; - - // The common configs for all the predictors. - struct Config { - std::string model_dir; // path to the model directory. - }; -}; - -struct NativeConfig : public PaddlePredictor::Config { - // GPU related fields. - bool use_gpu{false}; - int device{0}; - float fraction_of_gpu_memory{-1.f}; // Change to a float in (0,1] if needed. - - // Specify the exact path of program and parameter files. - std::string prog_file; - std::string param_file; - - // Specify the variable's name of each input if input tensors don't follow the - // `feeds` and `fetches` of the phase `save_inference_model`. - bool specify_input_name{false}; -}; - -// A factory to help create different predictors. -// -// Usage: -// -// NativeConfig config; -// ... // change the configs. -// auto native_predictor = CreatePaddlePredictor(config); -// -// FOR EXTENSION DEVELOPER: -// Different predictors are designated by config type. Similar configs can be -// merged, but there shouldn't be a huge config containing different fields for -// more than one kind of predictors. 
-template -std::unique_ptr CreatePaddlePredictor(const ConfigT& config); - -// NOTE The following APIs are too trivial, we will discard it in the following -// versions. -enum class PaddleEngineKind { - kNative = 0, // Use the native Fluid facility. - kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. - kAnalysis, // More optimization. - kAnakin // Use Anakin for inference, not mature yet. -}; - -template -std::unique_ptr CreatePaddlePredictor(const ConfigT& config); - -// == -// -// ----------------------------------------------------------------------------------- -// NOTE: The following APIs are not mature yet, we are still working on them. - -namespace contrib { - -// Accelerate GPU computation with TensorRT engine. -struct MixedRTConfig : public NativeConfig { - // Determine whether a subgraph will be executed by TRT. - int min_subgraph_size{1}; - // While TensorRT allows an engine optimized for a given max batch size - // to run at any smaller size, the performance for those smaller - // sizes may not be as well-optimized. Therefore, Max batch is best - // equivalent to the runtime batch size. - int max_batch_size{1}; - // For workspace_size, refer it from here: - // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting - int workspace_size{1 << 30}; - // We transform the Ops that can be converted into TRT layer in the model, - // and aggregate these Ops into subgraphs for TRT execution. - // We set this variable to control the minimum number of nodes in the - // subgraph, 3 as default value. - int minimum_subgraph_size = 3; - // Reserved configuration - // We just support "FP32" now, "FP16" and "INT8" will be supported. - std::string precision_mode = "FP32"; -}; - -// NOTE WIP, not stable yet. -struct AnalysisConfig : public NativeConfig { - enum class IrPassMode { - kSystem, // Use system default passes, not customize. - kInclude, // Specify the passes in `ir_passes`. 
- kExclude // Specify the disabled passes in `ir_passes`. - }; - - // Determine whether to perform graph optimization. - bool enable_ir_optim = true; - // Manually determine the IR passes to run. - IrPassMode ir_mode{IrPassMode::kExclude}; - // passes to be excluded/included - std::vector ir_passes{"embedding_fc_lstm_fuse_pass"}; - - // NOT stable yet. - bool use_feed_fetch_ops{true}; - - // NOTE this is just for internal development, please not use it. - // NOT stable yet. - bool _use_mkldnn{false}; -}; - -// Configurations for Anakin engine. -struct AnakinConfig : public PaddlePredictor::Config { - enum TargetType { NVGPU = 0, X86 }; - int device; - std::string model_file; - int max_batch_size{-1}; - TargetType target_type; -}; - -} // namespace contrib - -int PaddleDtypeSize(PaddleDType dtype); - -} // namespace paddle +#include "paddle_api.h" // NOLINT +#ifndef WITH_ANAKIN +#include "paddle_analysis_config.h" // NOLINT +#else +#include "paddle_anakin_config.h" // NOLINT +#endif diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc new file mode 100644 index 0000000000000000000000000000000000000000..bc3ce72f0832c4bf029f86e023bd9ff11f6578bd --- /dev/null +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/api/paddle_pass_builder.h" +#include + +namespace paddle { + +void PaddlePassBuilder::AppendPass(const std::string &pass_type) { + passes_.push_back(pass_type); +} + +void PaddlePassBuilder::TurnOnDebug() { + std::vector passes; + auto it = std::begin(passes_); + while (it != std::end(passes_)) { + if (*it != "graph_viz_pass") { + it = passes_.insert(it + 1, "graph_viz_pass"); + } else { + ++it; + } + } +} + +std::string PaddlePassBuilder::DebugString() { + std::stringstream ss; + ss << "Passes to apply:\n"; + for (auto &pass : passes_) { + ss << " - " << pass << '\n'; + } + return ss.str(); +} + +void PaddlePassBuilder::DeletePass(const std::string &pass_type) { + auto it = std::begin(passes_); + while (it != std::end(passes_)) { + if (*it == pass_type) { + it = passes_.erase(it); + } else { + ++it; + } + } +} + +void PaddlePassBuilder::InsertPass(size_t idx, const std::string &pass_type) { + passes_.insert(std::begin(passes_) + idx, pass_type); +} + +void PaddlePassBuilder::DeletePass(size_t idx) { + passes_.erase(std::begin(passes_) + idx); +} + +void GpuPassStrategy::EnableMKLDNN() { + LOG(ERROR) << "GPU not support MKLDNN yet"; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h new file mode 100644 index 0000000000000000000000000000000000000000..80658d30850aaa7212903828c5c963da5f37ca65 --- /dev/null +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -0,0 +1,131 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +namespace paddle { +/* + * This is a pass builder based on string. It is part of inference API. + */ +class PaddlePassBuilder { + public: + explicit PaddlePassBuilder(const std::vector &passes) + : passes_(passes) {} + + void AppendPass(const std::string &pass_type); + + void InsertPass(size_t idx, const std::string &pass_type); + + // Delete the `idx`-th pass. + void DeletePass(size_t idx); + + // Delete all the passes that have type `pass_type`. + void DeletePass(const std::string &pass_type); + + // Visualize the computation graph after each pass by generating a DOT + // language file, one can draw them with the Graphviz toolkit. + void TurnOnDebug(); + + // Human-readable information. + std::string DebugString(); + + const std::vector &AllPasses() const { return passes_; } + + protected: + std::vector passes_; +}; + +/* + * Pass strategy to help control the IR passes. + */ +class PassStrategy : public PaddlePassBuilder { + public: + explicit PassStrategy(const std::vector &passes) + : PaddlePassBuilder(passes) {} + + // The MKLDNN control exists in both CPU and GPU mode, because there can be + // still some CPU kernels running in CPU mode. + virtual void EnableMKLDNN() = 0; + + virtual ~PassStrategy() = default; +}; + +/* + * The CPU passes controller, it is used in AnalysisPredictor with CPU mode. 
+ */ +class CpuPassStrategy : public PassStrategy { + public: + CpuPassStrategy() : PassStrategy({}) { + // NOTE the large fusions should be located in the front, so that they will + // not be damaged by smaller ones. + passes_.assign({ + "infer_clean_graph_pass", // + "attention_lstm_fuse_pass", // + "seqconv_eltadd_relu_fuse_pass", // + // "embedding_fc_lstm_fuse_pass", // + "fc_lstm_fuse_pass", // + "mul_lstm_fuse_pass", // + "fc_gru_fuse_pass", // + "mul_gru_fuse_pass", // + "seq_concat_fc_fuse_pass", // + "fc_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + }); + } + + virtual ~CpuPassStrategy() = default; + + void EnableMKLDNN() override { +// TODO(Superjomn) Consider the way to mix CPU with GPU. +#ifdef PADDLE_WITH_MKLDNN + passes_.insert(passes_.begin(), "mkldnn_placement_pass"); + + for (auto &pass : + std::vector({"depthwise_conv_mkldnn_pass", // + "conv_bias_mkldnn_fuse_pass", // + "conv_relu_mkldnn_fuse_pass", // + "conv_elementwise_add_mkldnn_fuse_pass"})) { + passes_.push_back(pass); + } +#endif + } + + CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {} +}; + +/* + * The GPU passes strategy, it is used in AnalysisPredictor with GPU mode. + */ +class GpuPassStrategy : public PassStrategy { + public: + GpuPassStrategy() : PassStrategy({}) { + passes_.assign({ + "infer_clean_graph_pass", "conv_bn_fuse_pass", + }); + } + + GpuPassStrategy(const GpuPassStrategy &other) + : PassStrategy(other.AllPasses()) {} + + void EnableMKLDNN() override; + + virtual ~GpuPassStrategy() = default; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index a610687a5b11999a7cb7426dbe961e5972ee1746..e09705e3c69eb2b2370bd1ad2d9cf178ef041ee6 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,5 @@ nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context) nv_test(test_tensorrt 
SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) +add_subdirectory(plugin) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 0a35e10f6936313928ab21a6f17c40335e8fc882..ed4c398cee518af3211cab4e982082c46ebb36c2 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,8 +1,9 @@ # Add TRT tests nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc -batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc - DEPS tensorrt_engine operator scope framework_proto op_registry) +batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc +pad_op.cc split_op.cc + DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter) @@ -28,6 +29,8 @@ nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL) nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL) - nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL) +nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc + DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin +split_op concat_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index e73c5bbf57501e4ff3c080a46d91685035652bfa..0b756534ec6fbf27a3e92bf39fb7544d9785ca48 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ 
b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -27,7 +27,7 @@ class ActivationOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); - LOG(INFO) + VLOG(3) << "convert a fluid Activation op to tensorrt activation layer whose " "type is " << op_type_; diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 3330af2da6c97ad153dcecd86be4b441eac62b5e..d017bac66dd99a4b54c44ec786de61d1e66b8981 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -23,7 +23,7 @@ class BatchNormOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - LOG(INFO) << "convert a fluid batch norm op to tensorrt batch_norm"; + VLOG(3) << "convert a fluid batch norm op to tensorrt batch_norm"; framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index 60c16e35ed39a4cb14cbc16ebacaba2bb72bcc81..525ba9dc341c8c1343553ac9523611f79ac3aa2d 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -19,13 +19,13 @@ namespace inference { namespace tensorrt { /* - * MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights. 
+ * ConcatOp */ class ConcatOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) << "convert a fluid mul op to tensorrt mul layer without bias"; + VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 7bcf2dd1eeb17e802c5647df31945284ae08fa95..43950b8c048b4e1b8974956948caa639812b2f78 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -37,8 +37,7 @@ class Conv2dOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - LOG(INFO) - << "convert a fluid conv2d op to tensorrt conv layer without bias"; + VLOG(3) << "convert a fluid conv2d op to tensorrt conv layer without bias"; framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1); diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index df86a68dac521257aadac5ab1ed4020475a382f3..ddbc724e3b2a48b75df17f9bda691a1fd3883c32 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -25,7 +25,7 @@ class DropoutOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) << "convert a fluid dropout op to tensorrt dropout layer"; + VLOG(3) << "convert a fluid dropout op to tensorrt dropout layer"; framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); diff --git 
a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 0a6ce568f194f03c7259e1ebf28dd6ce4df2d594..671bcd8aa9a9fff34644a056499961cf6ab81287 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -26,7 +26,7 @@ class ElementwiseWeightOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. framework::OpDesc op_desc(op, nullptr); - LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight @@ -108,7 +108,7 @@ class ElementwiseTensorOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
framework::OpDesc op_desc(op, nullptr); - LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer"; + VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer"; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index bc1d9ee2811c447cd84f7e5b415b6bd53433b986..eef4fab4e86f05fa80bc614371f1aa43e433407e 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -52,7 +52,7 @@ class FcOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) << "convert a fluid fc op to tensorrt fc layer without bias"; + VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias"; framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc index babd56d62392299a58ecc16b7441029f80022498..5b6aaad49833cedbd8d1ee0ec5d24c7f983190e6 100644 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -25,7 +25,7 @@ class MulOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) << "convert a fluid mul op to tensorrt mul layer without bias"; + VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index c3699428d29e4d21f4eafbcf1fd0e255c91fabe6..4afcb0aecec9d07b52d2fd701fae8750067a6041 100644 --- 
a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -25,7 +25,7 @@ class PadOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) << "convert a fluid transpose op to tensorrt tranpose layer"; + VLOG(3) << "convert a fluid transpose op to tensorrt tranpose layer"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index d943d699f2cbe6726b3e634d0bd08e7cff3057d6..48850020840a49bd309c007943f14b2f7eec5e2d 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -25,7 +25,7 @@ class Pool2dOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) + VLOG(3) << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 174cdbe53b2698d05b680d27f1f94bee38d1dfa5..80bfb2d190a5637032e7c18fbac7f22b3a9e81e1 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -25,7 +25,7 @@ class SoftMaxOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(40) + VLOG(3) << "convert a fluid softmax op to tensorrt softmax layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..12179cccc76f8b0f595f41c135290dc0f3b50ad7 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * SplitOp. + */ +class SplitOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid split op to tensorrt split layer"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + auto input_dims = input->getDimensions(); + int input_num = op_desc.Input("X").size(); + size_t output_num = op_desc.Output("Out").size(); + + // Get Attrs + PADDLE_ENFORCE(input_num == 1); + int axis = boost::get(op_desc.GetAttr("axis")); + std::vector output_lengths = + boost::get>(op_desc.GetAttr("sections")); + PADDLE_ENFORCE(axis != 0); + if (axis < 0) { + axis += input_dims.nbDims; + } else { + axis -= 1; + } + + PADDLE_ENFORCE(output_lengths.size() == output_num); + + // + SplitPlugin* plugin = new SplitPlugin(axis, output_lengths); + nvinfer1::IPluginLayer* layer = + engine_->AddPlugin(&input, input_num, plugin); + + 
std::string layer_name = "split (Output: "; + for (size_t i = 0; i < output_num; i++) { + auto output_name = op_desc.Output("Out")[i]; + layer->getOutput(i)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(i)); + layer_name += output_name; + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } + layer->setName((layer_name + ")").c_str()); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(split, SplitOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f81d011552c152c2df79e1a272f34b954ae2a3a1 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(split_op, test) { + std::unordered_set parameters({""}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2)); + validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("split"); + desc.SetInput("X", {"split_input"}); + desc.SetOutput("Out", {"split_out1", "split_out2"}); + + int num = 0; + int axis = 1; + std::vector output_lengths = {2, 1}; + desc.SetAttr("axis", axis); + desc.SetAttr("num", num); + desc.SetAttr("sections", output_lengths); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(split); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 9e0f95844761db7571c5313726d34685a9aa66b2..fdd8b56b0ce5c9b5cb6395bcb437aae5ae27829b 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -61,6 +61,7 @@ TensorRTEngine::~TensorRTEngine() { } void TensorRTEngine::FreezeNetwork() { + VLOG(3) << "TRT to freeze network"; freshDeviceId(); PADDLE_ENFORCE(infer_builder_ != nullptr, "Call InitNetwork first to initialize network."); @@ -254,6 +255,12 @@ void TensorRTEngine::freshDeviceId() { cudaSetDevice(device_); } +nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( + nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) { + owned_plugin_.emplace_back(plugin); + return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin); +} + } // namespace tensorrt } // namespace inference } // 
namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 828181200e300c370bbfa234c3c23ae44810878c..335acdf653e55cc7f3ceccdba88992851c8e0310 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { @@ -125,6 +126,8 @@ class TensorRTEngine : public EngineBase { void SetRuntimeBatch(size_t batch_size); int GetRuntimeBatch(); int GetDevice() { return device_; } + nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, + int nbInputs, PluginTensorRT*); // A pointer to CPU memory is needed of the TRT weight. // Before TRT runs, fluid loads weight into GPU storage. @@ -164,8 +167,10 @@ class TensorRTEngine : public EngineBase { std::unordered_map buffer_sizes_; std::unordered_map itensor_map_; + // The specific GPU id that the TensorRTEngine bounded to. 
int device_; + std::vector> owned_plugin_; // TensorRT related internal members template diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index b6e7968108403c9c9c192759c44eac040d1c5073..fc7ca7714e9325d2b6bce6189300aa339c81c2ba 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -52,7 +52,7 @@ class NaiveLogger : public nvinfer1::ILogger { void log(nvinfer1::ILogger::Severity severity, const char* msg) override { switch (severity) { case Severity::kINFO: - LOG(INFO) << msg; + VLOG(3) << msg; break; case Severity::kWARNING: LOG(WARNING) << msg; diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..71b7a551619a43e5300ad3205418d1174c7019ff --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -0,0 +1 @@ +nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu DEPS enforce) diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.h b/paddle/fluid/inference/tensorrt/plugin/serialize.h new file mode 100644 index 0000000000000000000000000000000000000000..50c0b17d78327e22b0aa81fdac6958e80a30dfe8 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/serialize.h @@ -0,0 +1,111 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include + +template +inline void SerializeValue(void** buffer, T const& value); + +template +inline void DeserializeValue(void const** buffer, size_t* buffer_size, + T* value); + +namespace { + +template +struct Serializer {}; + +template +struct Serializer::value || + std::is_enum::value || + std::is_pod::value>::type> { + static size_t SerializedSize(T const& value) { return sizeof(T); } + static void Serialize(void** buffer, T const& value) { + std::memcpy(*buffer, &value, sizeof(T)); + reinterpret_cast(*buffer) += sizeof(T); + } + static void Deserialize(void const** buffer, size_t* buffer_size, T* value) { + assert(*buffer_size >= sizeof(T)); + std::memcpy(value, *buffer, sizeof(T)); + reinterpret_cast(*buffer) += sizeof(T); + *buffer_size -= sizeof(T); + } +}; + +template <> +struct Serializer { + static size_t SerializedSize(const char* value) { return strlen(value) + 1; } + static void Serialize(void** buffer, const char* value) { + std::strcpy(static_cast(*buffer), value); + reinterpret_cast(*buffer) += strlen(value) + 1; + } + static void Deserialize(void const** buffer, size_t* buffer_size, + const char** value) { + *value = static_cast(*buffer); + size_t data_size = strnlen(*value, *buffer_size) + 1; + assert(*buffer_size >= data_size); + reinterpret_cast(*buffer) += data_size; + *buffer_size -= data_size; + } +}; + +template +struct Serializer, + typename std::enable_if::value || + std::is_enum::value || + std::is_pod::value>::type> { + static size_t SerializedSize(std::vector const& value) { + return sizeof(value.size()) + value.size() * sizeof(T); + } + static void Serialize(void** buffer, std::vector const& value) { + SerializeValue(buffer, value.size()); + size_t nbyte = value.size() * sizeof(T); + std::memcpy(*buffer, value.data(), nbyte); + reinterpret_cast(*buffer) += nbyte; + } + static void Deserialize(void const** buffer, size_t* buffer_size, + std::vector* value) { + size_t size; + 
DeserializeValue(buffer, buffer_size, &size); + value->resize(size); + size_t nbyte = value->size() * sizeof(T); + assert(*buffer_size >= nbyte); + std::memcpy(value->data(), *buffer, nbyte); + reinterpret_cast(*buffer) += nbyte; + *buffer_size -= nbyte; + } +}; + +} // namespace + +template +inline size_t SerializedSize(T const& value) { + return Serializer::SerializedSize(value); +} + +template +inline void SerializeValue(void** buffer, T const& value) { + return Serializer::Serialize(buffer, value); +} + +template +inline void DeserializeValue(void const** buffer, size_t* buffer_size, + T* value) { + return Serializer::Deserialize(buffer, buffer_size, value); +} diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..bd6a44dcc14d50cddb879763a93abf4297494ec9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -0,0 +1,81 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +nvinfer1::Dims SplitPlugin::getOutputDimensions(int index, + const nvinfer1::Dims* inputDims, + int nbInputs) { + assert(nbInputs == 1); + assert(index < this->getNbOutputs()); + nvinfer1::Dims const& input_dims = inputDims[0]; + nvinfer1::Dims output_dims = input_dims; + output_dims.d[axis_] = output_length_.at(index); + return output_dims; +} + +int SplitPlugin::initialize() { + std::vector segment_offsets(1, 0); + for (int i = 0; i < this->getNbOutputs(); ++i) { + segment_offsets.push_back(segment_offsets.back() + output_length_[i]); + } + segment_offsets_ = segment_offsets; + nvinfer1::Dims dims = this->getInputDims(0); + nx_ = 1; + for (int i = dims.nbDims - 1; i > axis_; --i) { + nx_ *= dims.d[i]; + } + ny_ = dims.d[axis_]; + nz_ = 1; + for (int i = axis_ - 1; i >= 0; --i) { + nz_ *= dims.d[i]; + } + return 0; +} + +int SplitPlugin::enqueue(int batchSize, const void* const* inputs, + void** outputs, void* workspace, cudaStream_t stream) { + auto const& input_dims = this->getInputDims(0); + int input_size = 0; + float const* idata = reinterpret_cast(inputs[0]); + float** odatas = reinterpret_cast(outputs); + + // kernel impl here. 
+ int inputBatchOffset = nx_ * ny_ * nz_; + for (size_t i = 0; i < this->getNbOutputs(); i++) { + for (size_t j = 0; j < batchSize; j++) { + cudaMemcpyAsync( + odatas[i] + + j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * + sizeof(float), + inputs[0] + + (inputBatchOffset * j + segment_offsets_[i] * nx_) * + sizeof(float), + (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float), + cudaMemcpyDeviceToDevice, stream); + } + } + + return cudaGetLastError() != cudaSuccess; +} + +} // tensorrt +} // inference +} // paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..7281e40c331550de472df49c57b1d9a5226842d5 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -0,0 +1,74 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class SplitPlugin : public PluginTensorRT { + int axis_; + std::vector output_length_; + int nx_, ny_, nz_; + std::vector segment_offsets_; + + protected: + virtual size_t getSerializationSize() override { + return SerializedSize(axis_) + SerializedSize(output_length_) + + getBaseSerializationSize(); + } + + // TRT will call this func when we need to serialize the configuration of + // tensorrt. + // It should not be called by users. + virtual void serialize(void *buffer) override { + serializeBase(buffer); + SerializeValue(&buffer, axis_); + SerializeValue(&buffer, output_length_); + } + + public: + SplitPlugin(int axis, std::vector const &output_lengths) + : axis_(axis), output_length_(output_lengths) { + assert(axis <= nvinfer1::Dims::MAX_DIMS); + } + + // It was used for tensorrt deserialization. + // It should not be called by users. 
+ SplitPlugin(void const *serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &axis_); + DeserializeValue(&serialData, &serialLength, &output_length_); + } + + SplitPlugin *clone() const override { + return new SplitPlugin(axis_, output_length_); + } + + virtual const char *getPluginType() const override { return "split"; } + virtual int getNbOutputs() const override { return output_length_.size(); } + virtual nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims *inputs, + int nbInputDims) override; + virtual int initialize() override; + virtual int enqueue(int batchSize, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream) override; +}; + +} // tensorrt +} // inference +} // paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc new file mode 100644 index 0000000000000000000000000000000000000000..08016d84b15bc750738f3183d8d61a5c90862288 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +void PluginTensorRT::serializeBase(void*& buffer) { + SerializeValue(&buffer, input_dims_); + SerializeValue(&buffer, max_batch_size_); + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, data_format_); +} + +void PluginTensorRT::deserializeBase(void const*& serialData, + size_t& serialLength) { + DeserializeValue(&serialData, &serialLength, &input_dims_); + DeserializeValue(&serialData, &serialLength, &max_batch_size_); + DeserializeValue(&serialData, &serialLength, &data_type_); + DeserializeValue(&serialData, &serialLength, &data_format_); +} + +size_t PluginTensorRT::getBaseSerializationSize() { + return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) + + SerializedSize(data_type_) + SerializedSize(data_format_)); +} + +bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const { + return ((type == nvinfer1::DataType::kFLOAT) && + (format == nvinfer1::PluginFormat::kNCHW)); +} + +void PluginTensorRT::configureWithFormat(const nvinfer1::Dims* inputDims, + int nbInputs, + const nvinfer1::Dims* outputDims, + int nbOutputs, nvinfer1::DataType type, + nvinfer1::PluginFormat format, + int maxBatchSize) { + data_type_ = type; + data_format_ = format; + input_dims_.assign(inputDims, inputDims + nbInputs); + max_batch_size_ = maxBatchSize; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..4d85e955a49b7dcccae158ea06b76419419797cf --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -0,0 +1,80 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "NvInfer.h" + +#include "paddle/fluid/inference/tensorrt/plugin/serialize.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class PluginTensorRT : public nvinfer1::IPluginExt { + public: + PluginTensorRT() {} + PluginTensorRT(const void* serialized_data, size_t length) {} + nvinfer1::Dims const& getInputDims(int index) const { + return input_dims_.at(index); + } + size_t getMaxBatchSize() const { return max_batch_size_; } + nvinfer1::DataType getDataType() const { return data_type_; } + nvinfer1::PluginFormat getDataFormat() const { return data_format_; } + virtual const char* getPluginVersion() const { return "1"; } + size_t getWorkspaceSize(int) const override { return 0; } + void terminate() override {} + virtual ~PluginTensorRT() {} + // Check format support. The default is FLOAT32 and NCHW. + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const override; + void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, + const nvinfer1::Dims* outputDims, int nbOutputs, + nvinfer1::DataType type, + nvinfer1::PluginFormat format, + int maxBatchSize) override; + + // *NOTE* The following functions need to be overrided in the subclass. 
+ virtual nvinfer1::IPluginExt* clone() const = 0; + virtual const char* getPluginType() const = 0; + // Initialize the layer for execution. This is called when the engine is + // created. + int initialize() override { return 0; } + // Serialize the layer config to buffer. + virtual void serialize(void* buffer) = 0; + virtual size_t getSerializationSize() = 0; + virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) = 0; + + protected: + // Deserialize input_dims, max_batch_size, data_type, data_format + void deserializeBase(void const*& serialData, size_t& serialLength); + size_t getBaseSerializationSize(); + // Serialize input_dims, max_batch_size, data_type, data_format + void serializeBase(void*& buffer); + + std::vector input_dims_; + size_t max_batch_size_; + nvinfer1::DataType data_type_; + nvinfer1::PluginFormat data_format_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 2ca84c80058b35840aff5d072cdc99ecf5165f8e..74569057913d1db9a7184ab61ba655b3a66e49bd 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -45,11 +45,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 # DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") -inference_analysis_test(test_analyzer_dam SRCS analyzer_dam_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS - --infer_model=${DAM_INSTALL_DIR}/model - --infer_data=${DAM_INSTALL_DIR}/data.txt - --use_analysis=0) +inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc) # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") @@ -108,7 +104,7 @@ if(WITH_GPU AND TENSORRT_FOUND) 
if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") endif() - cc_test(test_trt_models SRCS trt_models_tester.cc - ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models - DEPS paddle_inference_tensorrt_subgraph_engine) + inference_analysis_test(test_trt_models SRCS trt_models_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index ceac5dc7e14365c77cf1cbbbc16e4bf3ebfced73..b369cba5c8b3f8aadd1123d6b7345fad6e47bd0f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -69,7 +69,7 @@ struct DataRecord { num_lines++; std::vector data; split(line, ',', &data); - CHECK_EQ(data.size(), 2 * MAX_TURN_NUM + 3); + CHECK_EQ(data.size(), (size_t)(2 * MAX_TURN_NUM + 3)); // load turn data std::vector turns_tmp[MAX_TURN_NUM]; for (int i = 0; i < MAX_TURN_NUM; ++i) { @@ -178,7 +178,8 @@ TEST(Analyzer_dam, profile) { std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { PADDLE_ENFORCE_GT(outputs.size(), 0); @@ -196,15 +197,13 @@ TEST(Analyzer_dam, fuse_statis) { contrib::AnalysisConfig cfg; SetConfig(&cfg); - if (FLAGS_use_analysis) { - int num_ops; - auto predictor = CreatePaddlePredictor(cfg); - auto fuse_statis = GetFuseStatis( - static_cast(predictor.get()), &num_ops); - ASSERT_TRUE(fuse_statis.count("fc_fuse")); - EXPECT_EQ(fuse_statis.at("fc_fuse"), 317); - EXPECT_EQ(num_ops, 2020); - } + int num_ops; + 
auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 317); + EXPECT_EQ(num_ops, 2020); } // Compare result of NativeConfig and AnalysisConfig @@ -215,9 +214,8 @@ TEST(Analyzer_dam, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - if (FLAGS_use_analysis) { - CompareNativeAndAnalysis(cfg, input_slots_all); - } + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 5fb551810fd4d1c56547a8aa581cb6c4587df031..310852e2f7cb284bda3041911d0059b55ee3b477 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -133,7 +133,8 @@ TEST(Analyzer_LAC, profile) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result @@ -175,7 +176,8 @@ TEST(Analyzer_LAC, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } } // namespace analysis diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index d91f7c314d0a936da6f5b0c41920c905af5cd0ee..3a5f844de3cae7eb9b6e3555c5219c6cf8ee1919 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -121,7 +121,8 @@ TEST(Analyzer_Chinese_ner, profile) { std::vector> input_slots_all; 
SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result @@ -160,7 +161,8 @@ TEST(Analyzer_Chinese_ner, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index e5c8dfd22a006d5271248c5b083aab2c22417502..2b936175ed3f8ec24826485027048c82df0461ab 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -37,12 +37,16 @@ void SetInput(std::vector> *inputs) { void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); - cfg._use_mkldnn = use_mkldnn; + + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); } TEST(Analyzer_resnet50, profile) { profile(); } @@ -65,11 +69,14 @@ TEST(Analyzer_resnet50, fuse_statis) { void compare(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); - cfg._use_mkldnn = use_mkldnn; + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } TEST(Analyzer_resnet50, compare) { compare(); } diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 
e0416ff953b61f56a2ca1a45cb382d40a6cffa4a..1ae2b4b03a1b2a66b3ddc8cb66d9575751a52297 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -210,7 +210,6 @@ void SetConfig(AnalysisConfig *cfg) { cfg->device = 0; cfg->specify_input_name = true; cfg->enable_ir_optim = true; - cfg->ir_passes.clear(); // Do not exclude any pass. } void SetInput(std::vector> *inputs) { @@ -226,14 +225,16 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_rnn1, profile) { - contrib::AnalysisConfig cfg; + contrib::AnalysisConfig cfg(false); SetConfig(&cfg); - cfg.use_gpu = false; + cfg.fraction_of_gpu_memory = 0.1; + cfg.pass_builder()->TurnOnDebug(); std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); } // Check the fuse status @@ -260,7 +261,8 @@ TEST(Analyzer_rnn1, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } // Test Multi-Thread. 
@@ -271,32 +273,8 @@ TEST(Analyzer_rnn1, multi_thread) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, 4 /* multi_thread */); -} - -bool CompareTensors(const framework::Scope &a_scope, - const framework::Scope &b_scope, - const std::vector &tensors) { - for (auto &x : tensors) { - auto *a_var = a_scope.FindVar(x); - auto *b_var = b_scope.FindVar(x); - if (a_var && b_var) { - if (a_var->Type() == typeid(framework::LoDTensor) || - a_var->Type() == typeid(framework::Tensor)) { - LOG(INFO) << "comparing tensor " << x; - auto &a_t = a_var->Get(); - auto &b_t = b_var->Get(); - if (!inference::CompareTensor(a_t, b_t)) { - LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x); - } - } else { - LOG(INFO) << "skip no tensor " << x; - } - } else { - LOG(INFO) << "skip tensor " << x; - } - } - return true; + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, 4 /* multi_thread */); } // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing @@ -307,7 +285,6 @@ TEST(Analyzer_rnn1, ZeroCopy) { config.use_feed_fetch_ops = false; PaddlePlace place; - int output_size{0}; auto predictor = CreatePaddlePredictor(config); @@ -353,86 +330,22 @@ TEST(Analyzer_rnn1, ZeroCopy) { Timer timer; double total_time{0}; - double native_total_time{0}; - double analysis_total_time{0.}; - for (int i = 0; i < FLAGS_repeat; i++) { timer.tic(); predictor->ZeroCopyRun(); total_time += timer.toc(); } + LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor); - auto *output_data = output_tensor->data(&place, &output_size); - ASSERT_GT(output_size, 0); // more than one output! - - for (int i = 0; i < FLAGS_repeat; i++) { - // Run native predictor. 
- timer.tic(); - ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); - native_total_time += timer.toc(); - } - - for (int i = 0; i < FLAGS_repeat; i++) { - timer.tic(); - ASSERT_TRUE( - analysis_predictor->Run(native_inputs.front(), &analysis_outputs)); - analysis_total_time += timer.toc(); - } - - if (!FLAGS_with_precision_check) { - return; - } - int native_output_size = VecReduceToInt(native_outputs.front().shape); - - EXPECT_EQ(native_output_size, output_size); + ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); + LOG(INFO) << "native output " << DescribeTensor(native_outputs.front()); - // Compare tensors between analysis and zerocopy - auto *p0 = static_cast(predictor.get()); - auto *p1 = static_cast(analysis_predictor.get()); - auto *p2 = static_cast(native_predictor.get()); - - std::vector tensor_names; - for (auto &var_desc : p0->program().Block(0).AllVars()) { - tensor_names.push_back(var_desc->Name()); - } - - LOG(INFO) << "Comparing tensors"; - ASSERT_TRUE( - CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"})); - ASSERT_TRUE( - CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"})); - - LOG(INFO) << "output1 " << inference::LoDTensorSummary( - p0->scope() - ->FindVar("final_output.tmp_1") - ->Get()); - LOG(INFO) << "output2 " << inference::LoDTensorSummary( - p1->scope() - ->FindVar("final_output.tmp_1") - ->Get()); - LOG(INFO) << "output3 " << inference::LoDTensorSummary( - p2->scope() - ->FindVar("final_output.tmp_1") - ->Get()); - - for (int i = 0; i < output_size; i++) { - LOG(INFO) << output_data[i] << " " - << static_cast(native_outputs.front().data.data())[i] - << " " - << static_cast(analysis_outputs.front().data.data())[i]; - EXPECT_NEAR(output_data[i], - static_cast(native_outputs.front().data.data())[i], - 1e-3); + int output_size{0}; + auto *zero_copy_data = output_tensor->data(&place, &output_size); + auto *native_data = 
static_cast(native_outputs.front().data.data()); + for (size_t i = 0; i < output_size / sizeof(float); i++) { + EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3); } - - LOG(INFO) << "batch_size: " << FLAGS_batch_size; - - LOG(INFO) << "zero average time: " - << total_time / (FLAGS_repeat * FLAGS_batch_size); - LOG(INFO) << "analysis average time: " - << analysis_total_time / (FLAGS_repeat * FLAGS_batch_size); - LOG(INFO) << "native average time: " - << native_total_time / (FLAGS_repeat * FLAGS_batch_size); } TEST(Analyzer_rnn1, ZeroCopyMultiThread) { diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc index e0eb919bd896d73a557001982a436fc93f087a74..e2985006f0ed858e778bf4737be3aaee0e056021 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -132,7 +132,8 @@ TEST(Analyzer_rnn2, profile) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result @@ -153,7 +154,8 @@ TEST(Analyzer_rnn2, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index f590ef27967e47ffcb3a97e80dd147efdd1906e6..858191184a377a26042c98e17d5b8df782575efc 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -161,7 +161,8 @@ TEST(Analyzer_seq_conv1, profile) { std::vector> input_slots_all; 
SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result @@ -199,7 +200,8 @@ TEST(Analyzer_seq_conv1, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index ca19475bda372398d425b0fa6f9a732cd79a8166..34a241f070fdc62d1b1e94835fb1dad405baafa9 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -74,7 +74,8 @@ TEST(Analyzer_Text_Classification, profile) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1) { // Get output @@ -101,20 +102,20 @@ TEST(Analyzer_Text_Classification, compare) { std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) { AnalysisConfig cfg; SetConfig(&cfg); // Enable embedding_fc_lstm_fuse_pass (disabled by default) - auto it = std::find(cfg.ir_passes.begin(), cfg.ir_passes.end(), - "embedding_fc_lstm_fuse_pass"); - if (it != cfg.ir_passes.end()) cfg.ir_passes.erase(it); + cfg.pass_builder()->InsertPass(2, "embedding_fc_lstm_fuse_pass"); std::vector> input_slots_all; SetInput(&input_slots_all); - 
CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index b2cd49af9aa580482fad84b6b23cb19f954e22fc..956a235edcefb7d688983c3b63b187e284efb02a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -58,7 +58,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->enable_ir_optim = true; cfg->specify_input_name = true; // TODO(TJ): fix fusion gru - cfg->ir_passes.push_back("fc_gru_fuse_pass"); + cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); } void SetInput(std::vector> *inputs) { @@ -84,12 +84,15 @@ void SetInput(std::vector> *inputs) { void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); - cfg._use_mkldnn = use_mkldnn; + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { const float ocr_result_data[] = { @@ -125,11 +128,14 @@ TEST(Analyzer_vis, fuse_statis) { void compare(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); - cfg._use_mkldnn = use_mkldnn; + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } std::vector> input_slots_all; SetInput(&input_slots_all); - CompareNativeAndAnalysis(cfg, input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); } TEST(Analyzer_vis, compare) { compare(); } diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h new file mode 100644 index 0000000000000000000000000000000000000000..aa0c4b1d049bc276cda2f58ac1edd8102fb3fd88 --- /dev/null +++ 
b/paddle/fluid/inference/tests/api/config_printer.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { +namespace inference { + +thread_local int num_spaces = 0; + +static std::string GenSpaces(int num_spaces) { + std::ostringstream os; + for (int i = 0; i < num_spaces; ++i) { + os << " "; + } + return os.str(); +} + +std::ostream &operator<<(std::ostream &os, + const PaddlePredictor::Config &config) { + os << GenSpaces(num_spaces) << "PaddlePredictor::Config {\n"; + num_spaces++; + os << GenSpaces(num_spaces) << "model_dir: " << config.model_dir << "\n"; + num_spaces--; + os << GenSpaces(num_spaces) << "}\n"; + return os; +} + +std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { + os << GenSpaces(num_spaces) << "NativeConfig {\n"; + num_spaces++; + os << *reinterpret_cast(&config); + os << GenSpaces(num_spaces) << "use_gpu: " << config.use_gpu << "\n"; + os << GenSpaces(num_spaces) << "device: " << config.device << "\n"; + os << GenSpaces(num_spaces) + << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n"; + os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; + os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; + os << GenSpaces(num_spaces) + << "specify_input_name: " << config.specify_input_name << 
"\n"; + num_spaces--; + os << GenSpaces(num_spaces) << "}\n"; + return os; +} + +std::ostream &operator<<(std::ostream &os, + const contrib::AnalysisConfig &config) { + os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n"; + num_spaces++; + os << *reinterpret_cast(&config); + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim + << "\n"; + os << GenSpaces(num_spaces) + << "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n"; + os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt() + << "\n"; + os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n"; + num_spaces--; + os << GenSpaces(num_spaces) << "}\n"; + return os; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 8c5888d8da7b33eeca77311c10dd818648e8e524..a4046914132cc713a707fc2a4d12087383d77fe5 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -19,12 +19,16 @@ #include #include // NOLINT #include + #include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" + +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/platform/profiler.h" @@ -37,10 +41,18 @@ DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); DEFINE_bool(use_analysis, true, "Running the inference program in analysis mode."); +DECLARE_bool(profile); + namespace paddle { namespace inference { -using 
contrib::AnalysisConfig; +void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { + if (use_analysis) { + LOG(INFO) << *reinterpret_cast(config); + return; + } + LOG(INFO) << *config; +} void CompareResult(const std::vector &outputs, const std::vector &ref_outputs) { @@ -76,42 +88,58 @@ void CompareResult(const std::vector &outputs, } std::unique_ptr CreateTestPredictor( - const AnalysisConfig &config, bool use_analysis = true) { + const PaddlePredictor::Config *config, bool use_analysis = true) { if (use_analysis) { - return CreatePaddlePredictor(config); - } else { - return CreatePaddlePredictor(config); + return CreatePaddlePredictor( + *(reinterpret_cast(config))); } + return CreatePaddlePredictor( + *(reinterpret_cast(config))); } size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } std::unordered_map GetFuseStatis(PaddlePredictor *predictor, int *num_ops) { + std::unordered_map res; auto *analysis_predictor = static_cast(predictor); - auto &fuse_statis = analysis_predictor->analysis_argument() - .Get>( - framework::ir::kFuseStatisAttr); - for (auto &item : fuse_statis) { + auto *fusion_status = + analysis_predictor->analysis_argument().fusion_statis_ptr(); + if (!fusion_status) { + return res; + } + for (auto &item : *fusion_status) { LOG(INFO) << "fused " << item.first << " " << item.second; } int num = 0; for (auto &node : - analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { - if (node->IsFunction()) { + analysis_predictor->analysis_argument().main_graph().Nodes()) { + if (node->IsOp()) { ++num; } } *num_ops = num; - return fuse_statis; + return *fusion_status; } void SetFakeImageInput(std::vector> *inputs, - const std::string &dirname) { + const std::string &dirname, bool is_combined = true, + std::string model_filename = "model", + std::string params_filename = "params") { // Set fake_image_data PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); - std::vector> 
feed_target_shapes = - GetFeedTargetShapes(dirname, true, "model", "params"); + std::vector> feed_target_shapes = GetFeedTargetShapes( + dirname, is_combined, model_filename, params_filename); + std::ostringstream os; + for (size_t i = 0; i < feed_target_shapes.size(); ++i) { + os << "feed target " << i << ": {" << feed_target_shapes[i][0]; + for (size_t j = 1; j < feed_target_shapes[i].size(); ++j) { + os << ", " << feed_target_shapes[i][j]; + } + os << "}\n"; + } + LOG(INFO) << os.str(); + int dim1 = feed_target_shapes[0][1]; int dim2 = feed_target_shapes[0][2]; int dim3 = feed_target_shapes[0][3]; @@ -135,25 +163,43 @@ void SetFakeImageInput(std::vector> *inputs, } void TestOneThreadPrediction( - const AnalysisConfig &config, + const PaddlePredictor::Config *config, const std::vector> &inputs, std::vector *outputs, bool use_analysis = true) { int batch_size = FLAGS_batch_size; int num_times = FLAGS_repeat; auto predictor = CreateTestPredictor(config, use_analysis); - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - for (size_t j = 0; j < inputs.size(); j++) { - predictor->Run(inputs[j], outputs); + + // warmup run + LOG(INFO) << "Warm up run..."; + { + Timer warmup_timer; + warmup_timer.tic(); + predictor->Run(inputs[0], outputs, batch_size); + PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1); +#if !defined(_WIN32) + if (FLAGS_profile) { + paddle::platform::ResetProfiler(); } +#endif + } + + LOG(INFO) << "Run " << num_times << " times..."; + { + Timer run_timer; + run_timer.tic(); + for (int i = 0; i < num_times; i++) { + for (size_t j = 0; j < inputs.size(); j++) { + predictor->Run(inputs[j], outputs, batch_size); + } + } + PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times, + inputs.size()); } - PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times, - inputs.size()); } void TestMultiThreadPrediction( - const AnalysisConfig &config, + const PaddlePredictor::Config *config, const std::vector> &inputs, 
std::vector *outputs, int num_threads, bool use_analysis = true) { @@ -161,11 +207,12 @@ void TestMultiThreadPrediction( int num_times = FLAGS_repeat; std::vector threads; std::vector> predictors; - // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled - // because AttentionLSTM's hard code nodeid will be damanged. - for (int tid = 0; tid < num_threads; ++tid) { - predictors.emplace_back(CreateTestPredictor(config, use_analysis)); + predictors.emplace_back(CreateTestPredictor(config, use_analysis)); + for (int tid = 1; tid < num_threads; ++tid) { + predictors.emplace_back(predictors.front()->Clone()); } + + size_t total_time{0}; for (int tid = 0; tid < num_threads; ++tid) { threads.emplace_back([&, tid]() { #ifdef PADDLE_WITH_MKLDNN @@ -173,17 +220,21 @@ void TestMultiThreadPrediction( #endif // Each thread should have local inputs and outputs. // The inputs of each thread are all the same. - std::vector> inputs_tid = inputs; std::vector outputs_tid; + auto &predictor = predictors[tid]; + LOG(INFO) << "running thread " << tid; Timer timer; timer.tic(); for (int i = 0; i < num_times; i++) { - for (size_t j = 0; j < inputs_tid.size(); j++) { - predictors[tid]->Run(inputs_tid[j], &outputs_tid); + for (const auto &input : inputs) { + ASSERT_TRUE(predictor->Run(input, &outputs_tid)); } } - PrintTime(batch_size, num_times, num_threads, tid, - timer.toc() / num_times, inputs_tid.size()); + + auto time = timer.toc(); + total_time += time; + PrintTime(batch_size, num_times, num_threads, tid, time / num_times, + inputs.size()); }); } for (int i = 0; i < num_threads; ++i) { @@ -191,12 +242,11 @@ void TestMultiThreadPrediction( } } -void TestPrediction(const AnalysisConfig &config, +void TestPrediction(const PaddlePredictor::Config *config, const std::vector> &inputs, std::vector *outputs, int num_threads, bool use_analysis = FLAGS_use_analysis) { - LOG(INFO) << "use_analysis: " << use_analysis - << ", use_mkldnn: " << config._use_mkldnn; + PrintConfig(config, 
use_analysis); if (num_threads == 1) { TestOneThreadPrediction(config, inputs, outputs, use_analysis); } else { @@ -206,9 +256,9 @@ void TestPrediction(const AnalysisConfig &config, } void CompareNativeAndAnalysis( - const AnalysisConfig &config, + const PaddlePredictor::Config *config, const std::vector> &inputs) { - LOG(INFO) << "use_mkldnn: " << config._use_mkldnn; + PrintConfig(config, true); std::vector native_outputs, analysis_outputs; TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 75840a9c437d956da4f542a38b2532ea20ee96c5..922feba10fec5d1d13b47dbce064fce2e01d8998 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -1,108 +1,149 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include #include #include -#include "paddle/fluid/inference/analysis/analyzer.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" + +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { -using paddle::contrib::MixedRTConfig; - -DEFINE_string(dirname, "", "Directory of the inference model."); - -NativeConfig GetConfigNative() { - NativeConfig config; - config.model_dir = FLAGS_dirname; - // LOG(INFO) << "dirname " << config.model_dir; - config.fraction_of_gpu_memory = 0.45; - config.use_gpu = true; - config.device = 0; - return config; +namespace inference { + +DEFINE_bool(use_tensorrt, true, "Test the performance of TensorRT engine."); +DEFINE_string(prog_filename, "", "Name of model file."); +DEFINE_string(param_filename, "", "Name of parameters file."); + +template +void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu, + bool use_tensorrt = false, int batch_size = -1) { + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + config->prog_file = model_dir + "/" + FLAGS_prog_filename; + config->param_file = model_dir + "/" + FLAGS_param_filename; + } else { + config->model_dir = model_dir; + } + if (use_gpu) { + config->use_gpu = true; + config->device = 0; + config->fraction_of_gpu_memory = 0.15; + } } -MixedRTConfig GetConfigTRT() { - MixedRTConfig config; - config.model_dir = FLAGS_dirname; - config.use_gpu = true; - config.fraction_of_gpu_memory = 0.2; - config.device = 0; - config.max_batch_size = 3; - return config; +template <> +void SetConfig(contrib::AnalysisConfig* config, + 
std::string model_dir, bool use_gpu, + bool use_tensorrt, int batch_size) { + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + config->prog_file = model_dir + "/" + FLAGS_prog_filename; + config->param_file = model_dir + "/" + FLAGS_param_filename; + } else { + config->model_dir = model_dir; + } + if (use_gpu) { + config->use_gpu = true; + config->device = 0; + config->fraction_of_gpu_memory = 0.15; + if (use_tensorrt) { + config->EnableTensorRtEngine(1 << 10, batch_size); + config->pass_builder()->DeletePass("conv_bn_fuse_pass"); + config->pass_builder()->DeletePass("fc_fuse_pass"); + config->pass_builder()->TurnOnDebug(); + } else { + config->enable_ir_optim = true; + } + } } -void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) { - NativeConfig config0 = GetConfigNative(); - config0.model_dir = model_dirname; - - MixedRTConfig config1 = GetConfigTRT(); - config1.model_dir = model_dirname; - config1.max_batch_size = batch_size; - - auto predictor0 = CreatePaddlePredictor(config0); - auto predictor1 = CreatePaddlePredictor(config1); - // Prepare inputs - int height = 224; - int width = 224; - float *data = new float[batch_size * 3 * height * width]; - memset(data, 0, sizeof(float) * (batch_size * 3 * height * width)); - data[0] = 1.0f; - - // Prepare inputs - PaddleTensor tensor; - tensor.name = "input_0"; - tensor.shape = std::vector({batch_size, 3, height, width}); - tensor.data = PaddleBuf(static_cast(data), - sizeof(float) * (batch_size * 3 * height * width)); - tensor.dtype = PaddleDType::FLOAT32; - std::vector paddle_tensor_feeds(1, tensor); - - // Prepare outputs - std::vector outputs0; - std::vector outputs1; - CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0)); - - CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size)); - - // Get output. 
- ASSERT_EQ(outputs0.size(), 1UL); - ASSERT_EQ(outputs1.size(), 1UL); - - const size_t num_elements = outputs0.front().data.length() / sizeof(float); - const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); - EXPECT_EQ(num_elements, num_elements1); - - auto *data0 = static_cast(outputs0.front().data.data()); - auto *data1 = static_cast(outputs1.front().data.data()); - - ASSERT_GT(num_elements, 0UL); - for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { - EXPECT_NEAR(data0[i], data1[i], 1e-3); +void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { + std::vector> inputs_all; + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, + FLAGS_param_filename); + } else { + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + } + + std::vector outputs; + if (use_analysis || use_tensorrt) { + contrib::AnalysisConfig config(true); + SetConfig(&config, model_dir, true, use_tensorrt, + FLAGS_batch_size); + TestPrediction(reinterpret_cast(&config), + inputs_all, &outputs, FLAGS_num_threads, true); + } else { + NativeConfig config; + SetConfig(&config, model_dir, true, false); + TestPrediction(reinterpret_cast(&config), + inputs_all, &outputs, FLAGS_num_threads, false); } } -TEST(trt_models_test, mobilenet) { - CompareTensorRTWithFluid(1, FLAGS_dirname + "/mobilenet"); +void compare(std::string model_dir, bool use_tensorrt) { + std::vector> inputs_all; + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, + FLAGS_param_filename); + } else { + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + } + + std::vector native_outputs; + NativeConfig native_config; + SetConfig(&native_config, model_dir, true, false, + FLAGS_batch_size); + TestOneThreadPrediction( + reinterpret_cast(&native_config), inputs_all, + 
&native_outputs, false); + + std::vector analysis_outputs; + contrib::AnalysisConfig analysis_config(true); + SetConfig(&analysis_config, model_dir, true, + use_tensorrt, FLAGS_batch_size); + TestOneThreadPrediction( + reinterpret_cast(&analysis_config), inputs_all, + &analysis_outputs, true); + + CompareResult(native_outputs, analysis_outputs); } -TEST(trt_models_test, resnet50) { - CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnet50"); +TEST(TensorRT_mobilenet, compare) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + compare(model_dir, /* use_tensorrt */ true); } -TEST(trt_models_test, resnext50) { - CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnext50"); +TEST(TensorRT_resnet50, compare) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + compare(model_dir, /* use_tensorrt */ true); } +TEST(TensorRT_resnext50, compare) { + std::string model_dir = FLAGS_infer_model + "/resnext50"; + compare(model_dir, /* use_tensorrt */ true); +} + +TEST(TensorRT_resnext50, profile) { + std::string model_dir = FLAGS_infer_model + "/resnext50"; + profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); +} + +TEST(TensorRT_mobilenet, analysis) { + std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; + compare(model_dir, /* use_tensorrt */ false); +} + +} // namespace inference } // namespace paddle + +USE_PASS(tensorrt_subgraph_pass); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index edefeed67eb048387fbee2bf067553a8023001c2..5c06cad64eca296b0a9b57bd22d2a94cd7d00838 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "glog/logging.h" @@ -21,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/string/printf.h" DEFINE_bool(init_allocated_mem, false, "It is a mistake that the values of the memory allocated by " @@ -145,12 +147,18 @@ void* Alloc(const platform::CUDAPlace& place, platform::SetDeviceId(place.device); size_t avail, total; platform::GpuMemoryUsage(&avail, &total); - LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " - << place.device << ", available " << avail << " bytes"; + LOG(WARNING) << "Cannot allocate " << string::HumanReadableSize(size) + << " in GPU " << place.device << ", available " + << string::HumanReadableSize(avail); LOG(WARNING) << "total " << total; - LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize(); - LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize(); - LOG(WARNING) << "GPU memory used: " << Used(place); + LOG(WARNING) << "GpuMinChunkSize " + << string::HumanReadableSize( + buddy_allocator->GetMinChunkSize()); + LOG(WARNING) << "GpuMaxChunkSize " + << string::HumanReadableSize( + buddy_allocator->GetMaxChunkSize()); + LOG(WARNING) << "GPU memory used: " + << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); } if (FLAGS_init_allocated_mem) { diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 1db2d54218f11eea10a2702fec28665a4c446f72..857bd87bb063be887ef8da8de9b92cb811f0596f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -5,6 +5,8 @@ list(REMOVE_DUPLICATES GENERAL_OPS) set(DEPS_OPS "") set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h) file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n") + +set(PART_CUDA_KERNEL_FILES) function(op_library TARGET) # op_library is a function to create op library. 
The interface is same as # cc_library. But it handle split GPU/CPU code and link some common library @@ -37,6 +39,12 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu + ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) + list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) list(APPEND hip_cu_srcs ${TARGET}.hip.cu) endif() @@ -86,7 +94,8 @@ function(op_library TARGET) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" - "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") + "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op" + "fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() @@ -300,8 +309,10 @@ op_library(flatten_op DEPS reshape_op) op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) +if (NOT WIN32) op_library(crf_decoding_op DEPS jit_kernel) op_library(fusion_lstm_op DEPS jit_kernel) +endif(NOT WIN32) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) @@ -317,8 +328,8 @@ op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) -op_library(tensor_array_to_tensor_op DEPS concat_op) op_library(concat_op DEPS 
concat_and_split) +op_library(tensor_array_to_tensor_op DEPS concat_op) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) @@ -327,6 +338,8 @@ foreach(src ${GENERAL_OPS}) endforeach() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") + + if (NOT WIN32) add_subdirectory(reader) endif(NOT WIN32) @@ -353,3 +366,14 @@ if(NOT WIN32) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) + +if(WITH_GPU) + foreach(CUDA_KERNEL_FILE ${PART_CUDA_KERNEL_FILES}) + file(READ ${CUDA_KERNEL_FILE} TARGET_CONTENT) + string(REGEX MATCH "REGISTER_OP_CUDA_KERNEL\\(\\n?([^,]+),.*" MATCHED ${TARGET_CONTENT}) + if (MATCHED) + string(STRIP ${CMAKE_MATCH_1} MATCHED) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MATCHED}, CUDA);\n") + endif() + endforeach() +endif() diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc index 0784920064a879963cd9725cd9acf4cec7b874ce..cb98bc514083ad113fdebfbac043a9516fd9435a 100644 --- a/paddle/fluid/operators/auc_op.cc +++ b/paddle/fluid/operators/auc_op.cc @@ -53,7 +53,7 @@ class AucOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Predict")->type()), - ctx.device_context()); + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 3083e622c3066879e107f930a45bcec36d347f80..3a4086274d8a4bf6725df9f3195cec2446ceae6c 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -50,12 +50,18 @@ static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache"; static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static_cast(1024) * 1024 * 1024; -static constexpr size_t kNUM_CUDNN_FWD_ALGS = - 
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; +#if CUDNN_VERSION_MIN(6, 0, 5) +static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; +#else +// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc. +static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7; +static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; +static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; +#endif template class CUDNNConvOpKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index d5eec148f9b4f76866ec9fca98a596b9bc2860ef..e5c3f0eeb385e1a15fdbb12a989603996420efe3 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -22,6 +22,7 @@ iou_similarity_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) +detection_library(density_prior_box_op SRCS density_prior_box_op.cc) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) detection_library(target_assign_op SRCS target_assign_op.cc diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..99df15c3226b4305a28a3912398d6d1c766daa73 --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -0,0 +1,175 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/density_prior_box_op.h" + +namespace paddle { +namespace operators { + +class DensityPriorBoxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of DensityPriorBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Image"), + "Input(Image) of DensityPriorBoxOp should not be null."); + + auto image_dims = ctx->GetInputDim("Image"); + auto input_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW."); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + PADDLE_ENFORCE_LT(input_dims[2], image_dims[2], + "The height of input must smaller than image."); + + PADDLE_ENFORCE_LT(input_dims[3], image_dims[3], + "The width of input must smaller than image."); + auto variances = ctx->Attrs().Get>("variances"); + + auto fixed_sizes = ctx->Attrs().Get>("fixed_sizes"); + auto fixed_ratios = ctx->Attrs().Get>("fixed_ratios"); + auto densities = ctx->Attrs().Get>("densities"); + + PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(), + "The number of fixed_sizes and densities must be equal."); + size_t num_priors = 0; + if ((fixed_sizes.size() > 0) && (densities.size() > 0)) { + for (size_t i = 0; i < densities.size(); ++i) { + if (fixed_ratios.size() > 0) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + } + } + std::vector dim_vec(4); + dim_vec[0] = 
input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + platform::CPUPlace()); + } +}; + +class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "(Tensor, default Tensor), " + "the input feature data of DensityPriorBoxOp, the layout is NCHW."); + AddInput("Image", + "(Tensor, default Tensor), " + "the input image data of DensityPriorBoxOp, the layout is NCHW."); + AddOutput("Boxes", + "(Tensor, default Tensor), the output prior boxes of " + "DensityPriorBoxOp. The layout is [H, W, num_priors, 4]. " + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + AddOutput("Variances", + "(Tensor, default Tensor), the expanded variances of " + "DensityPriorBoxOp. The layout is [H, W, num_priors, 4]. 
" + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + AddAttr>("variances", + "(vector) List of variances to be " + "encoded in density prior boxes.") + .AddCustomChecker([](const std::vector& variances) { + PADDLE_ENFORCE_EQ(variances.size(), 4, + "Must and only provide 4 variance."); + for (size_t i = 0; i < variances.size(); ++i) { + PADDLE_ENFORCE_GT(variances[i], 0.0, + "variance[%d] must be greater than 0.", i); + } + }); + AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") + .SetDefault(true); + + AddAttr( + "step_w", + "Density prior boxes step across width, 0.0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_w) { + PADDLE_ENFORCE_GE(step_w, 0.0, "step_w should be larger than 0."); + }); + AddAttr( + "step_h", + "Density prior boxes step across height, 0.0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_h) { + PADDLE_ENFORCE_GE(step_h, 0.0, "step_h should be larger than 0."); + }); + + AddAttr("offset", + "(float) " + "Density prior boxes center offset.") + .SetDefault(0.5); + AddAttr>("fixed_sizes", + "(vector) List of fixed sizes " + "of generated density prior boxes.") + .SetDefault(std::vector{}) + .AddCustomChecker([](const std::vector& fixed_sizes) { + for (size_t i = 0; i < fixed_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(fixed_sizes[i], 0.0, + "fixed_sizes[%d] should be larger than 0.", i); + } + }); + + AddAttr>("fixed_ratios", + "(vector) List of fixed ratios " + "of generated density prior boxes.") + .SetDefault(std::vector{}) + .AddCustomChecker([](const std::vector& fixed_ratios) { + for (size_t i = 0; i < fixed_ratios.size(); ++i) { + PADDLE_ENFORCE_GT(fixed_ratios[i], 0.0, + "fixed_ratios[%d] should be larger than 0.", i); + } + }); + + AddAttr>("densities", + "(vector) List of densities " + "of generated density prior boxes.") + .SetDefault(std::vector{}) + .AddCustomChecker([](const std::vector& 
densities) { + for (size_t i = 0; i < densities.size(); ++i) { + PADDLE_ENFORCE_GT(densities[i], 0, + "densities[%d] should be larger than 0.", i); + } + }); + AddComment(R"DOC( + Density Prior box operator + Each position of the input produce N density prior boxes, N is determined by + the count of fixed_ratios, densities, the calculation of N is as follows: + for density in densities: + N += size(fixed_ratios)*density^2 + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(density_prior_box, ops::DensityPriorBoxOp, + ops::DensityPriorBoxOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(density_prior_box, ops::DensityPriorBoxOpKernel, + ops::DensityPriorBoxOpKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9a52077e9cf90b278549a077af161bd4e282d972 --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.h @@ -0,0 +1,146 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/operators/detection/prior_box_op.h" + +namespace paddle { +namespace operators { + +template +class DensityPriorBoxOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto variances = ctx.Attr>("variances"); + auto clip = ctx.Attr("clip"); + + auto fixed_sizes = ctx.Attr>("fixed_sizes"); + auto fixed_ratios = ctx.Attr>("fixed_ratios"); + auto densities = ctx.Attr>("densities"); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + int num_priors = 0; + if (fixed_sizes.size() > 0 && densities.size() > 0) { + for (size_t i = 0; i < densities.size(); ++i) { + if (fixed_ratios.size() > 0) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + } + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); + + int step_average = static_cast((step_width + step_height) * 0.5); + + for (int h = 0; h < feature_height; ++h) { + for (int w = 0; w < feature_width; ++w) { + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + int idx = 0; + // Generate density prior boxes with fixed sizes. 
+ for (size_t s = 0; s < fixed_sizes.size(); ++s) { + auto fixed_size = fixed_sizes[s]; + int density = densities[s]; + // Generate density prior boxes with fixed ratios. + if (fixed_ratios.size() > 0) { + for (size_t r = 0; r < fixed_ratios.size(); ++r) { + float ar = fixed_ratios[r]; + int shift = step_average / density; + float box_width_ratio = fixed_size * sqrt(ar); + float box_height_ratio = fixed_size / sqrt(ar); + for (int di = 0; di < density; ++di) { + for (int dj = 0; dj < density; ++dj) { + float center_x_temp = + center_x - step_average / 2. + shift / 2. + dj * shift; + float center_y_temp = + center_y - step_average / 2. + shift / 2. + di * shift; + e_boxes(h, w, idx, 0) = + (center_x_temp - box_width_ratio / 2.) / img_width >= 0 + ? (center_x_temp - box_width_ratio / 2.) / img_width + : 0; + e_boxes(h, w, idx, 1) = + (center_y_temp - box_height_ratio / 2.) / img_height >= 0 + ? (center_y_temp - box_height_ratio / 2.) / img_height + : 0; + e_boxes(h, w, idx, 2) = + (center_x_temp + box_width_ratio / 2.) / img_width <= 1 + ? (center_x_temp + box_width_ratio / 2.) / img_width + : 1; + e_boxes(h, w, idx, 3) = + (center_y_temp + box_height_ratio / 2.) / img_height <= 1 + ? (center_y_temp + box_height_ratio / 2.) 
/ img_height + : 1; + idx++; + } + } + } + } + } + } + } + if (clip) { + platform::Transform trans; + ClipFunctor clip_func; + trans(ctx.template device_context(), + boxes->data(), boxes->data() + boxes->numel(), + boxes->data(), clip_func); + } + framework::Tensor var_t; + var_t.mutable_data( + framework::make_ddim({1, static_cast(variances.size())}), + ctx.GetPlace()); + + auto var_et = framework::EigenTensor::From(var_t); + + for (size_t i = 0; i < variances.size(); ++i) { + var_et(0, i) = variances[i]; + } + + int box_num = feature_height * feature_width * num_priors; + auto var_dim = vars->dims(); + vars->Resize({box_num, static_cast(variances.size())}); + + auto e_vars = framework::EigenMatrix::From(*vars); + + e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1)); + + vars->Resize(var_dim); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index c82930cc4994c3854e60f40ae9909a90d82cbff6..2d262f932aed9761143f7983c9a38f7a97c374ea 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -15,6 +15,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using paddle::platform::float16; namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index 93204216f947e5203863a3493005faa0c03ae4af..7bb6934e1496cc989eee8ba82f56959522803bfb 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -111,6 +111,17 @@ class RowwiseTransformIterator return *this; } + RowwiseTransformIterator &operator+(int n) { + while (n-- > 0) { + ++i_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + + return *this; + } + bool operator==(const RowwiseTransformIterator &rhs) const { return (ptr_ + i_) == &(*rhs); @@ -149,6 +160,21 @@ class MidWiseTransformIterator return *this; } + MidWiseTransformIterator &operator+(int n) { + while (n-- > 0) { + ++j_; + if (UNLIKELY(j_ == post_)) { + ++i_; + j_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } + } + } + + return *this; + } + bool operator==(const MidWiseTransformIterator &rhs) const { return (ptr_ + i_) == &(*rhs); diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index fa4dec9cf118cef9b836943fd4eae90d23e6218a..e80249fc87855311479b35af61f872182292795a 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -27,11 +27,9 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { "Out(Output) of Fully Connected should not be null."); PADDLE_ENFORCE(ctx->HasInput("W"), "W(Input) of Fully Connected should not be null."); - // NCHW + auto in_dims = ctx->GetInputDim("Input"); - // IO, I=C*H*W auto w_dims = ctx->GetInputDim("W"); - std::vector output_shape({in_dims[0], w_dims[1]}); if (ctx->HasInput("Bias")) { auto bias_dims = ctx->GetInputDim("Bias"); @@ -44,14 +42,32 @@ void 
FCOp::InferShape(framework::InferShapeContext* ctx) const { "The shape of Bias must be [1, dim]."); } } - PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4, - "Fully Connected input should be 2-D or 4-D tensor."); + + if (ctx->Attrs().Get("use_mkldnn")) { + PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4, + "Fully Connected input should be 2-D or 4-D tensor."); + } PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Fully Connected input should be 2-D tensor."); - PADDLE_ENFORCE_EQ(framework::product(in_dims) / in_dims[0], w_dims[0], - "Fully Connected input and weigth size do not match."); + int in_num_col_dims = ctx->Attrs().Get("in_num_col_dims"); + PADDLE_ENFORCE_GT( + in_dims.size(), in_num_col_dims, + "The input tensor Input's rank of FCOp should be larger than " + "in_num_col_dims."); + + auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims); + PADDLE_ENFORCE_EQ( + in_mat_dims[1], w_dims[0], + "Fully Connected input and weigth size do not match. %s, %s"); + + std::vector output_dims; + output_dims.reserve(static_cast(in_num_col_dims + 1)); + for (int i = 0; i < in_num_col_dims; ++i) { + output_dims.push_back(in_dims[i]); + } + output_dims.push_back(w_dims[1]); - ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); ctx->ShareLoD("Input", "Out"); } @@ -101,12 +117,15 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType( } void FCOpMaker::Make() { - AddInput("Input", - "(Tensor), The input tensor of fully connected operator with format " - "(NCHW). 
"); + AddInput("Input", "(Tensor), The input tensor of fully connected operator."); AddInput("W", "(Tensor), The weight fc op with shape (I, O)."); AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O") .AsDispensable(); + AddAttr("in_num_col_dims", + "(int, default 1), The fc op can take tensors with more than " + "two dimensions as its inputs.") + .SetDefault(1) + .EqualGreaterThan(1); AddOutput("Out", "(Tensor) The output tensor of fully connected operator. "); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") @@ -131,13 +150,15 @@ class FCOpKernel : public framework::OpKernel { auto output = ctx.Output("Out"); auto in_dims = input->dims(); auto w_dims = w->dims(); + auto out_dims = output->dims(); + int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; const T* input_data = input->data(); const T* w_data = w->data(); T* output_data = output->mutable_data(ctx.GetPlace()); auto blas = math::GetBlas(ctx); math::FCCompute( - blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data, + blas, M, w_dims[1], w_dims[0], input_data, w_data, output_data, bias ? 
bias->data() : NULL); // TODO(TJ): fuse act diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index d74d4db92528d69492ab7b90a98d3775dadc35d1..e4df59c5d51c390cf593add0c5562665c91f33f6 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -50,7 +50,9 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { // PADDLE_ENFORCE(platform::is_gpu_place(place)); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); + int index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h index d72e07d76c97e9e455e54980207d7c02842cc04b..dc08ee5efacde5e232d751b13aaf11f51237634a 100644 --- a/paddle/fluid/operators/gather.h +++ b/paddle/fluid/operators/gather.h @@ -38,7 +38,8 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); int64_t index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index f84ff206fffddef1030b7ed439e887bdfef342a6..95aa9b573c795159079bdb5401b34d7a61252115 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -31,7 +31,8 @@ class GatherOp : public framework::OperatorWithKernel { "Output(Out) of GatherOp should not be null."); auto index_dims = ctx->GetInputDim("Index"); - PADDLE_ENFORCE(index_dims.size() == 1); + PADDLE_ENFORCE(index_dims.size() == 1 || + (index_dims.size() == 2 && index_dims[1] == 1)); int batch_size = 
ctx->GetInputDim("Index")[0]; framework::DDim output_dims(ctx->GetInputDim("X")); output_dims[0] = batch_size; @@ -53,6 +54,7 @@ class GatherGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); } protected: @@ -75,7 +77,7 @@ Gather Operator. $Out = X[Index]$ -Out is obtained by gathering entries of the outer-most dimension +Out is obtained by gathering entries of the outer-most dimension of X indexed by Index and concatenate them together. Example: diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 4e91a3dcd272c8d368cb8c43e7e1fb4c98265db4..08a6043eb07a6e44d46428ee195f6cb28c2ee77c 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -63,7 +63,8 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx, Tensor ones; ones.mutable_data({n, h, w}, ctx.GetPlace()); auto ones_t = EigenTensor::From(ones).setConstant(1.0); - Tensor half_xmax, half_ymax; + Tensor half_xmax; + Tensor half_ymax; half_xmax.mutable_data({n, h, w}, ctx.GetPlace()); auto half_xmax_t = EigenTensor::From(half_xmax).setConstant(0.5 * x_max); diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index b9ebe71a3d7ae270a10a45f4805652415078b363..b2c2c7954b79658e66f1524a81bcad0b7bf22c35 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -38,7 +38,7 @@ class HashOp : public framework::OperatorWithKernel { std::vector out_dims; out_dims.reserve(dims.size() + 1); // copy all dims except the last one - for (size_t i = 0u; i != dims.size() - 1; ++i) { + for (int i = 0u; i != dims.size() - 1; ++i) { out_dims.emplace_back(dims[i]); } int num_hash = ctx->Attrs().Get("num_hash"); diff --git a/paddle/fluid/operators/load_op.cc 
b/paddle/fluid/operators/load_op.cc index 51219504ffa2a778b56351f759e8a8dfb951ad91..df1edc5c2e994b3093d6f6e7e4f6e0e5b2abb469 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -40,8 +40,9 @@ class LoadOp : public framework::OperatorBase { auto out_var_name = Output("Out"); auto *out_var = scope.FindVar(out_var_name); - PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", - out_var_name); + PADDLE_ENFORCE(out_var != nullptr, + "Output variable %s cannot be found in scope %p", + out_var_name, &scope); if (out_var->IsType()) { LoadLodTensor(fin, place, out_var); diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc index de3f0990e109cacd49c4d888bbc1f797fb196e01..a6843f20a59a23bd4e875b0f96524cc8d7aa46d6 100644 --- a/paddle/fluid/operators/lookup_sparse_table_op.cc +++ b/paddle/fluid/operators/lookup_sparse_table_op.cc @@ -45,6 +45,7 @@ class LookupSparseTableOp : public framework::OperatorBase { auto out_var = scope.FindVar(Output("Out")); auto w_var = scope.FindVar(Input("W")); auto ids_var = scope.FindVar(Input("Ids")); + auto is_test = Attr("is_test"); PADDLE_ENFORCE(out_var->IsType(), "The type of Out var should be LodTensor."); @@ -65,7 +66,7 @@ class LookupSparseTableOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()), framework::proto::VarType::FP32, "The sparse table only support FP32"); - w_t->Get(ids_t, out_t, true); + w_t->Get(ids_t, out_t, true, is_test); } }; @@ -91,6 +92,10 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker { "(bool default false)" "Whether create new value if for nonexistent key.") .SetDefault(true); + AddAttr("is_test", + "In test mode, lookup_sparse_table will " + "return a 0 for unknown id") + .SetDefault(false); AddComment(R"DOC( Lookup Sprase Tablel Operator. 
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 52b459a6a2e56b7c256efdb535b4652c64bae23c..61c3cb34a2472c0ba7d2a7ea5abf8e826a793951 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/lrn_op.h" #include +#include "paddle/fluid/operators/math/blas.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -29,34 +30,43 @@ struct LRNFunctor { const framework::Tensor& input, framework::Tensor* out, framework::Tensor* mid, int N, int C, int H, int W, int n, T k, T alpha, T beta) { - auto x_v = framework::EigenVector::Flatten(input); - - const int start = -(n - 1) / 2; - const int end = start + n; - - auto e_mid = framework::EigenTensor::From(*mid); - e_mid = e_mid.constant(k); - - auto e_x = framework::EigenTensor::From(input); - for (int m = 0; m < N; m++) { - for (int i = 0; i < C; i++) { - for (int c = start; c < end; c++) { - int ch = i + c; - if (ch >= 0 && ch < C) { - auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto r = e_x.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - s += alpha * r.square(); - } - } + const T* idata = input.data(); + auto place = ctx.GetPlace(); + auto blas = math::GetBlas(ctx); + T* odata = out->mutable_data(place); + T* mdata = mid->mutable_data(place); + Tensor squared; + T* sdata = squared.mutable_data({1, C + n - 1, H, W}, place); + std::memset(sdata, 0, sizeof(T) * squared.numel()); + for (int i = 0; i < mid->numel(); ++i) { + mdata[i] = k; + } + int img_size = H * W; + int fea_size = C * img_size; + int pre_pad = (n - 1) / 2; + // compute batches one by one + for (int i = 0; i < N; ++i) { + blas.VSQR(fea_size, idata + i * fea_size, sdata + pre_pad * img_size); + // init the first channel of mid + for (int c = 0; c < n; ++c) { + blas.AXPY(img_size, alpha, sdata + c * img_size, mdata + i 
* fea_size); + } + for (int c = 1; c < C; ++c) { + // copy previous scale + int mid_offset = i * fea_size + c * img_size; + std::memcpy(mdata + mid_offset, mdata + mid_offset - img_size, + img_size * sizeof(T)); + // add last + blas.AXPY(img_size, alpha, sdata + (c + n - 1) * img_size, + mdata + mid_offset); + // sub rest + blas.AXPY(img_size, -alpha, sdata + (c - 1) * img_size, + mdata + mid_offset); } } - - auto out_e = framework::EigenVector::Flatten(*out); - out_e = x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); + // compute the final output + blas.VPOW(mid->numel(), mdata, -beta, odata); + blas.VMUL(mid->numel(), odata, idata, odata); } }; template struct LRNFunctor; @@ -156,6 +166,9 @@ class LRNOp : public framework::OperatorWithKernel { auto x_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4."); + int n = ctx->Attrs().Get("n"); + PADDLE_ENFORCE(n > 0 && n % 2 == 1, "n should be positive odd value"); + ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); ctx->SetOutputDim("MidOut", x_dim); diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index 0fd3175e8579df9e61368cc151a94fa45e433884..12d39c3815395896343238b536110aecac66a376 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -60,7 +60,6 @@ class LRNKernel : public framework::OpKernel { T beta = ctx.Attr("beta"); T k = ctx.Attr("k"); - PADDLE_ENFORCE(n > 0, "n should >= 0"); PADDLE_ENFORCE(alpha >= 0.0, "alpha should >= 0.0"); PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0"); PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0"); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index d9bf2a8aafb9264aa62f3f7404746a53bbb4155f..a4100585592496a03c2e1b5f5c132103a0aa9a8d 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -75,12 +75,13 @@ if(WITH_GPU) endif() 
cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) - -set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) -set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) -if(WITH_XBYAK) - list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) - list(APPEND JIT_KERNEL_DEPS xbyak) -endif() -cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) -cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) +if (NOT WIN32) + set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) + set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) + if(WITH_XBYAK) + list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) + list(APPEND JIT_KERNEL_DEPS xbyak) + endif() + cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) + cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) +endif (NOT WIN32) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index da185d93c09f9b06bd5968b9c8e93176f9ef014b..5d0d562030d2a20e4a1cefd3c36c6533fd35dc96 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -152,6 +152,12 @@ class Blas { template void VEXP(int n, const T* x, T* y) const; + template + void VSQR(int n, const T* x, T* y) const; + + template + void VPOW(int n, const T* x, T alpha, T* y) const; + template void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta, T* C) const; @@ -238,6 +244,16 @@ class BlasT : private Blas { Base()->template VEXP(args...); } + template + void VSQR(ARGS... args) const { + Base()->template VSQR(args...); + } + + template + void VPOW(ARGS... args) const { + Base()->template VPOW(args...); + } + template void GEMV(ARGS... 
args) const { Base()->template GEMV(args...); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index e1df78d11e41c5f74e244643f40c6d0581fa6a4a..59454669be9e0f92a6fc0db52445307d88e1c7d8 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once +#include #include #include #include "paddle/fluid/operators/math/math_function.h" @@ -102,6 +103,16 @@ struct CBlas { static void VEXP(ARGS... args) { platform::dynload::vsExp(args...); } + + template + static void VSQR(ARGS... args) { + platform::dynload::vsSqr(args...); + } + + template + static void VPOW(ARGS... args) { + platform::dynload::vsPowx(args...); + } }; template <> @@ -182,6 +193,16 @@ struct CBlas { static void VEXP(ARGS... args) { platform::dynload::vdExp(args...); } + + template + static void VSQR(ARGS... args) { + platform::dynload::vdSqr(args...); + } + + template + static void VPOW(ARGS... args) { + platform::dynload::vdPowx(args...); + } }; #else @@ -241,6 +262,8 @@ struct CBlas { } static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } + static void VSQR(...) { PADDLE_THROW("float16 VSQR not supported on CPU"); } + static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; static void SCAL(...) 
{ PADDLE_THROW("float16 SCAL not supported on CPU"); }; #ifdef PADDLE_WITH_MKLML @@ -398,6 +421,31 @@ void Blas::VEXP(int n, const T *x, T *y) const { #endif } +template <> +template +void Blas::VSQR(int n, const T *x, T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VSQR(n, x, y); +#else + for (int i = 0; i < n; ++i) { + y[i] = std::sqrt(x[i]); + } +#endif +} + +template <> +template +void Blas::VPOW(int n, const T *x, T a, + T *y) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VPOW(n, x, a, y); +#else + for (int i = 0; i < n; ++i) { + y[i] = std::pow(x[i], a); + } +#endif +} + template <> template T Blas::DOT(int n, const T *x, const T *y) const { diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 6b3eecfbd11471b5d95dcb10c91acc536f78cb85..e46f60f764ab9f1c292db339a5b38b976de5a11a 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -118,6 +118,39 @@ void VXXJitCode::generate() { ret(); } +bool ReluJitCode::init(int d) { return MayIUse(avx); } + +void ReluJitCode::generate() { + int offset = 0; + vxorps(ymm_zero, ymm_zero, ymm_zero); + for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) { + vmovups(ymm_src, ptr[param1 + offset]); + vmaxps(ymm_dst, ymm_zero, ymm_src); + vmovups(ptr[param2 + offset], ymm_dst); + offset += sizeof(float) * AVX_FLOAT_BLOCK; + } + int rest = num_ % AVX_FLOAT_BLOCK; + if (rest >= 4) { + vmovups(xmm_src, ptr[param1 + offset]); + vmaxps(xmm_dst, xmm_zero, xmm_src); + vmovups(ptr[param2 + offset], xmm_dst); + offset += sizeof(float) * 4; + rest -= 4; + } + if (rest >= 2) { + vmovups(xmm_src, ptr[param1 + offset]); + vmaxps(xmm_dst, xmm_zero, xmm_src); + vmovq(ptr[param2 + offset], xmm_dst); + offset += sizeof(float) * 2; + rest -= 2; + } + if (rest > 0) { + vmovups(xmm_src, ptr[param1 + offset]); + vmaxps(xmm_dst, xmm_zero, xmm_src); + vmovss(ptr[param2 + offset], xmm_dst); + } + ret(); +} } // namespace gen } // namespace jitkernel } // namespace 
math diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index aaedb0ae10323eeddfba9512d9e47c7a22320610..3c242870a24c5bb29d34d4b99406c5df8cec6763 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -85,6 +85,29 @@ class VXXJitCode : public JitCode { ymm_t ymm_zero = ymm_t(3); }; +class ReluJitCode : public JitCode { + public: + DECLARE_JIT_CODE(ReluJitCode); + explicit ReluJitCode(int d, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), num_(d) {} + static bool init(int d); + void generate() override; + + private: + int num_; + reg64_t param1{abi_param1}; + reg64_t param2{abi_param2}; + + xmm_t xmm_zero = xmm_t(0); + xmm_t xmm_src = xmm_t(1); + xmm_t xmm_dst = xmm_t(1); + + ymm_t ymm_zero = ymm_t(0); + ymm_t ymm_src = ymm_t(1); + ymm_t ymm_dst = ymm_t(1); +}; + } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index e9b259282cd00cc2afc46634423ec09590bf5dd3..cd3a45e66773c89e45e80ab77ebd925abd6cbe53 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -97,37 +97,38 @@ class VAddBiasKernel : public Kernel { template class VActKernel : public Kernel { public: - virtual void Compute(const T *x, T *y) const = 0; + virtual void ComputeDeprecated(const T *x, T *y) const = 0; }; template class VReluKernel : public VActKernel { public: - virtual void Compute(const T *x, T *y) const = 0; + virtual void ComputeDeprecated(const T *x, T *y) const = 0; + void (*Compute)(const T *, T *, int); }; template class VIdentityKernel : public VActKernel { public: - virtual void Compute(const T *x, T *y) const = 0; + virtual void ComputeDeprecated(const T *x, T *y) const = 0; }; template class VExpKernel : public VActKernel { public: - virtual void Compute(const T *x, T *y) const = 0; + virtual void 
ComputeDeprecated(const T *x, T *y) const = 0; }; template class VSigmoidKernel : public VActKernel { public: - virtual void Compute(const T *x, T *y) const = 0; + virtual void ComputeDeprecated(const T *x, T *y) const = 0; }; template class VTanhKernel : public VActKernel { public: - virtual void Compute(const T *x, T *y) const = 0; + virtual void ComputeDeprecated(const T *x, T *y) const = 0; }; template diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index c4bfbcf925a2bbdc39f8468049c58e126d3eba1b..cf46a210afbd4903dc3841f27765c390f721c763 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -71,6 +71,13 @@ void VAddBiasRefer(const T* a, const T* x, T* y, int n) { } } +template +void VReluRefer(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + #ifdef PADDLE_WITH_MKLML template void VMulMKL(const T* x, const T* y, T* z, int n); @@ -344,124 +351,60 @@ bool VAddBiasKernelImpl::useJIT(int d) { } #endif -#undef DECLARE_STATIC_FUNC - -REGISTER_JITKERNEL(vmul, VMulKernel); -REGISTER_JITKERNEL(vadd, VAddKernel); -REGISTER_JITKERNEL(vaddrelu, VAddReluKernel); -REGISTER_JITKERNEL(vscal, VScalKernel); -REGISTER_JITKERNEL(vaddbias, VAddBiasKernel); - /* VRelu JitKernel */ -template +template class VReluKernelImpl : public VReluKernel { public: - explicit VReluKernelImpl(int d) : VReluKernel() { this->num_ = d; } - void Compute(const T* x, T* y) const override { - for (int i = 0; i < this->num_; ++i) { - y[i] = x[i] > 0 ? x[i] : 0; + DECLARE_STATIC_FUNC; + explicit VReluKernelImpl(int d) : VReluKernel() { + this->num_ = d; // TODO(TJ): remove me when ComputeDeprecated done +#ifdef PADDLE_WITH_XBYAK + if (useJIT(d)) { + size_t sz = 96 /*init*/ + + d / AVX_FLOAT_BLOCK * 4 /* instructions*/ * + 8 /*everage byte for each instruction*/; + jitcode_.reset(new gen::ReluJitCode(d, sz > 4096 ? 
sz : 4096)); + this->Compute = jitcode_->getCode(); + return; } - } -}; - -#define INTRI8_FLOAT(isa) \ - template <> \ - void VReluKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 tmp = _mm256_loadu_ps(x); \ - tmp = _mm256_max_ps(tmp, _mm256_setzero_ps()); \ - _mm256_storeu_ps(y, tmp); \ - } - -#define INTRI16_FLOAT(isa) \ - template <> \ - void VReluKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 zeros = _mm256_setzero_ps(); \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - tmp0 = _mm256_max_ps(tmp0, zeros); \ - tmp1 = _mm256_max_ps(tmp1, zeros); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ - } +#endif -#define INTRI_GT8LT16_FLOAT(isa) \ - template <> \ - VReluKernelImpl::VReluKernelImpl(int d) \ - : VReluKernel() { \ - this->num_ = d; \ - this->end_ = AVX_FLOAT_BLOCK; \ - this->rest_ = d - AVX_FLOAT_BLOCK; \ - } \ - template <> \ - void VReluKernelImpl::Compute(const float* x, \ - float* y) const { \ - __m256 zeros = _mm256_setzero_ps(); \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + this->rest_); \ - tmp0 = _mm256_max_ps(tmp0, zeros); \ - tmp1 = _mm256_max_ps(tmp1, zeros); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + this->rest_, tmp1); \ + this->Compute = VReluRefer; } - -#define INTRI_GT16_FLOAT(isa) \ - template <> \ - VReluKernelImpl::VReluKernelImpl(int d) \ - : VReluKernel() { \ - this->num_ = d; \ - this->end_ = d - d % AVX_FLOAT_BLOCK; \ - this->rest_ = d - AVX_FLOAT_BLOCK; \ - } \ - template <> \ - void VReluKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 zeros = _mm256_setzero_ps(); \ - for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ - __m256 tmp = _mm256_loadu_ps(x + i); \ - tmp = _mm256_max_ps(tmp, zeros); \ - _mm256_storeu_ps(y + i, tmp); \ - } \ - __m256 tmp = _mm256_loadu_ps(x + this->rest_); \ - tmp = _mm256_max_ps(tmp, zeros); \ - _mm256_storeu_ps(y + this->rest_, tmp); \ + void 
ComputeDeprecated(const T* x, T* y) const override { + VReluRefer(x, y, this->num_); } +#ifdef PADDLE_WITH_XBYAK -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -INTRI16_FLOAT(jit::avx); -INTRI_GT8LT16_FLOAT(jit::avx); -INTRI_GT16_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -INTRI16_FLOAT(jit::avx2); -INTRI_GT8LT16_FLOAT(jit::avx2); -INTRI_GT16_FLOAT(jit::avx2); + private: + std::unique_ptr jitcode_{nullptr}; #endif -#ifdef __AVX512F__ -// TODO(TJ): refine avx512 -INTRI8_FLOAT(jit::avx512f); -INTRI16_FLOAT(jit::avx512f); -INTRI_GT8LT16_FLOAT(jit::avx512f); -INTRI_GT16_FLOAT(jit::avx512f); +}; + +#ifdef PADDLE_WITH_XBYAK +template <> +bool VReluKernelImpl::useJIT(int d) { + return gen::ReluJitCode::init(d); +} #endif -#undef INTRI8_FLOAT -#undef INTRI16_FLOAT -#undef INTRI_GT8LT16_FLOAT -#undef INTRI_GT16_FLOAT +#undef DECLARE_STATIC_FUNC + +REGISTER_JITKERNEL(vmul, VMulKernel); +REGISTER_JITKERNEL(vadd, VAddKernel); +REGISTER_JITKERNEL(vaddrelu, VAddReluKernel); +REGISTER_JITKERNEL(vscal, VScalKernel); +REGISTER_JITKERNEL(vaddbias, VAddBiasKernel); +REGISTER_JITKERNEL(vrelu, VReluKernel); /* An empty JitKernel */ template class VIdentityKernelImpl : public VIdentityKernel { public: explicit VIdentityKernelImpl(int d) : VIdentityKernel() { this->num_ = d; } - void Compute(const T* x, T* y) const override {} + void ComputeDeprecated(const T* x, T* y) const override {} }; -REGISTER_JITKERNEL_DEPRECATED(vrelu, VReluKernel); REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel); } // namespace jitkernel diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index c55e54a13f539014c0f582436ca1a105d0b0fedd..2ac9e1092362f60ea3d89da0c971a365b45f39ea 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -35,7 +35,7 @@ template class VExpKernelImpl : public VExpKernel { public: explicit VExpKernelImpl(int d) : VExpKernel() { this->num_ = d; } - 
void Compute(const T* x, T* y) const override { + void ComputeDeprecated(const T* x, T* y) const override { for (int i = 0; i < this->num_; ++i) { y[i] = std::exp(x[i]); } @@ -43,18 +43,18 @@ class VExpKernelImpl : public VExpKernel { }; #ifdef PADDLE_WITH_MKLML -#define MKL_FLOAT(isa, block) \ - template <> \ - void VExpKernelImpl::Compute(const float* x, float* y) \ - const { \ - platform::dynload::vsExp(this->num_, x, y); \ +#define MKL_FLOAT(isa, block) \ + template <> \ + void VExpKernelImpl::ComputeDeprecated(const float* x, \ + float* y) const { \ + platform::dynload::vsExp(this->num_, x, y); \ } -#define MKL_DOUBLE(isa, block) \ - template <> \ - void VExpKernelImpl::Compute(const double* x, double* y) \ - const { \ - platform::dynload::vdExp(this->num_, x, y); \ +#define MKL_DOUBLE(isa, block) \ + template <> \ + void VExpKernelImpl::ComputeDeprecated( \ + const double* x, double* y) const { \ + platform::dynload::vdExp(this->num_, x, y); \ } FOR_EACH_ISA(MKL_FLOAT, kLT8); FOR_EACH_ISA(MKL_FLOAT, kGT8LT16); @@ -211,24 +211,24 @@ __m256 ExpAVX2(__m256 x) { } // namespace detail -#define INTRI8_FLOAT(isa, expisa) \ - template <> \ - void VExpKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 tmp = _mm256_loadu_ps(x); \ - _mm256_storeu_ps(y, expisa(tmp)); \ +#define INTRI8_FLOAT(isa, expisa) \ + template <> \ + void VExpKernelImpl::ComputeDeprecated(const float* x, \ + float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + _mm256_storeu_ps(y, expisa(tmp)); \ } -#define INTRI16_FLOAT(isa, expisa) \ - template <> \ - void VExpKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - tmp0 = expisa(tmp0); \ - tmp1 = expisa(tmp1); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ +#define INTRI16_FLOAT(isa, expisa) \ + template <> \ + void VExpKernelImpl::ComputeDeprecated(const float* x, \ + float* y) const { \ + __m256 tmp0 = 
_mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + tmp0 = expisa(tmp0); \ + tmp1 = expisa(tmp1); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ } #ifdef __AVX__ @@ -260,14 +260,14 @@ class VSigmoidKernelImpl : public VSigmoidKernel { this->num_ = d; vexp_ = KernelPool::Instance().template Get>(d); } - void Compute(const T* x, T* y) const override { + void ComputeDeprecated(const T* x, T* y) const override { const T min = SIGMOID_THRESHOLD_MIN; const T max = SIGMOID_THRESHOLD_MAX; for (int i = 0; i < this->num_; ++i) { y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = static_cast(0) - y[i]; } - vexp_->Compute(y, y); + vexp_->ComputeDeprecated(y, y); for (int i = 0; i < this->num_; ++i) { y[i] = static_cast(1) / (static_cast(1) + y[i]); } @@ -285,30 +285,30 @@ class VSigmoidKernelImpl : public VSigmoidKernel { tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \ tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp) -#define INTRI8_FLOAT(isa, expisa) \ - template <> \ - void VSigmoidKernelImpl::Compute(const float* x, float* y) \ - const { \ - /* TODO(TJ): try to use static const*/ \ - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ - __m256 tmp = _mm256_loadu_ps(x); \ - INTRI_SIGMOID(tmp, min, max, expisa); \ - _mm256_storeu_ps(y, tmp); \ +#define INTRI8_FLOAT(isa, expisa) \ + template <> \ + void VSigmoidKernelImpl::ComputeDeprecated( \ + const float* x, float* y) const { \ + /* TODO(TJ): try to use static const*/ \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_SIGMOID(tmp, min, max, expisa); \ + _mm256_storeu_ps(y, tmp); \ } -#define INTRI16_FLOAT(isa, expisa) \ - template <> \ - void VSigmoidKernelImpl::Compute(const float* x, \ - float* y) const { \ - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ - __m256 min = 
_mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - INTRI_SIGMOID(tmp0, min, max, expisa); \ - INTRI_SIGMOID(tmp1, min, max, expisa); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ +#define INTRI16_FLOAT(isa, expisa) \ + template <> \ + void VSigmoidKernelImpl::ComputeDeprecated( \ + const float* x, float* y) const { \ + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + INTRI_SIGMOID(tmp0, min, max, expisa); \ + INTRI_SIGMOID(tmp1, min, max, expisa); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ } #define INTRI_GT8LT16_FLOAT(isa, expisa) \ @@ -322,8 +322,8 @@ class VSigmoidKernelImpl : public VSigmoidKernel { KernelPool::Instance().template Get>(this->rest_); \ } \ template <> \ - void VSigmoidKernelImpl::Compute(const float* x, \ - float* y) const { \ + void VSigmoidKernelImpl::ComputeDeprecated( \ + const float* x, float* y) const { \ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ __m256 tmp = _mm256_loadu_ps(x); \ @@ -335,7 +335,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel { y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? 
max_ : x[i]); \ y[i] = 0.f - y[i]; \ } \ - vexp_->Compute(y + this->end_, y + this->end_); \ + vexp_->ComputeDeprecated(y + this->end_, y + this->end_); \ for (int i = this->end_; i < this->num_; ++i) { \ y[i] = 1.f / (1.f + y[i]); \ } \ @@ -352,8 +352,8 @@ class VSigmoidKernelImpl : public VSigmoidKernel { KernelPool::Instance().template Get>(this->rest_); \ } \ template <> \ - void VSigmoidKernelImpl::Compute(const float* x, \ - float* y) const { \ + void VSigmoidKernelImpl::ComputeDeprecated( \ + const float* x, float* y) const { \ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \ for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ @@ -367,7 +367,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel { y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \ y[i] = 0.f - y[i]; \ } \ - vexp_->Compute(y + this->end_, y + this->end_); \ + vexp_->ComputeDeprecated(y + this->end_, y + this->end_); \ for (int i = this->end_; i < this->num_; ++i) { \ y[i] = 1.f / (1.f + y[i]); \ } \ @@ -408,10 +408,10 @@ class VTanhKernelImpl : public VTanhKernel { vsigmoid_ = KernelPool::Instance().template Get>(d); vaddbias_ = KernelPool::Instance().template Get>(d); } - void Compute(const T* x, T* y) const override { + void ComputeDeprecated(const T* x, T* y) const override { const T a = static_cast(2), b = static_cast(-1); vscal_->Compute(&a, x, y, this->num_); - vsigmoid_->Compute(y, y); + vsigmoid_->ComputeDeprecated(y, y); vscal_->Compute(&a, y, y, this->num_); vaddbias_->Compute(&b, y, y, this->num_); } @@ -430,25 +430,25 @@ class VTanhKernelImpl : public VTanhKernel { tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \ tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f)) -#define INTRI8_FLOAT(isa, expisa) \ - template <> \ - void VTanhKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 tmp = _mm256_loadu_ps(x); \ - INTRI_VTANH(tmp, expisa); \ - _mm256_storeu_ps(y, tmp); \ +#define INTRI8_FLOAT(isa, 
expisa) \ + template <> \ + void VTanhKernelImpl::ComputeDeprecated(const float* x, \ + float* y) const { \ + __m256 tmp = _mm256_loadu_ps(x); \ + INTRI_VTANH(tmp, expisa); \ + _mm256_storeu_ps(y, tmp); \ } -#define INTRI16_FLOAT(isa, expisa) \ - template <> \ - void VTanhKernelImpl::Compute(const float* x, float* y) \ - const { \ - __m256 tmp0 = _mm256_loadu_ps(x); \ - __m256 tmp1 = _mm256_loadu_ps(x + 8); \ - INTRI_VTANH(tmp0, expisa); \ - INTRI_VTANH(tmp1, expisa); \ - _mm256_storeu_ps(y, tmp0); \ - _mm256_storeu_ps(y + 8, tmp1); \ +#define INTRI16_FLOAT(isa, expisa) \ + template <> \ + void VTanhKernelImpl::ComputeDeprecated(const float* x, \ + float* y) const { \ + __m256 tmp0 = _mm256_loadu_ps(x); \ + __m256 tmp1 = _mm256_loadu_ps(x + 8); \ + INTRI_VTANH(tmp0, expisa); \ + INTRI_VTANH(tmp1, expisa); \ + _mm256_storeu_ps(y, tmp0); \ + _mm256_storeu_ps(y + 8, tmp1); \ } #define INTRI_GT8LT16_FLOAT(isa, expisa) \ @@ -466,8 +466,8 @@ class VTanhKernelImpl : public VTanhKernel { this->rest_); \ } \ template <> \ - void VTanhKernelImpl::Compute(const float* x, \ - float* y) const { \ + void VTanhKernelImpl::ComputeDeprecated( \ + const float* x, float* y) const { \ __m256 tmp = _mm256_loadu_ps(x); \ INTRI_VTANH(tmp, expisa); \ _mm256_storeu_ps(y, tmp); \ @@ -475,40 +475,40 @@ class VTanhKernelImpl : public VTanhKernel { y += AVX_FLOAT_BLOCK; \ const float a = 2.f, b = -1.f; \ vscal_->Compute(&a, x, y, this->num_); \ - vsigmoid_->Compute(y, y); \ + vsigmoid_->ComputeDeprecated(y, y); \ vscal_->Compute(&a, y, y, this->num_); \ vaddbias_->Compute(&b, y, y, this->num_); \ } -#define INTRI_GT16_FLOAT(isa, expisa) \ - template <> \ - VTanhKernelImpl::VTanhKernelImpl(int d) \ - : VTanhKernel() { \ - this->num_ = d; \ - this->rest_ = d % AVX_FLOAT_BLOCK; \ - this->end_ = d - this->rest_; \ - vscal_ = \ - KernelPool::Instance().template Get>(this->rest_); \ - vsigmoid_ = KernelPool::Instance().template Get>( \ - this->rest_); \ - vaddbias_ = KernelPool::Instance().template 
Get>( \ - this->rest_); \ - } \ - template <> \ - void VTanhKernelImpl::Compute(const float* x, float* y) \ - const { \ - for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ - __m256 tmp = _mm256_loadu_ps(x + i); \ - INTRI_VTANH(tmp, expisa); \ - _mm256_storeu_ps(y + i, tmp); \ - } \ - x += this->end_; \ - y += this->end_; \ - const float a = 2.f, b = -1.f; \ - vscal_->Compute(&a, x, y, this->num_); \ - vsigmoid_->Compute(y, y); \ - vscal_->Compute(&a, y, y, this->num_); \ - vaddbias_->Compute(&b, y, y, this->num_); \ +#define INTRI_GT16_FLOAT(isa, expisa) \ + template <> \ + VTanhKernelImpl::VTanhKernelImpl(int d) \ + : VTanhKernel() { \ + this->num_ = d; \ + this->rest_ = d % AVX_FLOAT_BLOCK; \ + this->end_ = d - this->rest_; \ + vscal_ = \ + KernelPool::Instance().template Get>(this->rest_); \ + vsigmoid_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + vaddbias_ = KernelPool::Instance().template Get>( \ + this->rest_); \ + } \ + template <> \ + void VTanhKernelImpl::ComputeDeprecated(const float* x, \ + float* y) const { \ + for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \ + __m256 tmp = _mm256_loadu_ps(x + i); \ + INTRI_VTANH(tmp, expisa); \ + _mm256_storeu_ps(y + i, tmp); \ + } \ + x += this->end_; \ + y += this->end_; \ + const float a = 2.f, b = -1.f; \ + vscal_->Compute(&a, x, y, this->num_); \ + vsigmoid_->ComputeDeprecated(y, y); \ + vscal_->Compute(&a, y, y, this->num_); \ + vaddbias_->Compute(&b, y, y, this->num_); \ } #ifdef __AVX__ diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index ba3e917377cf12192a068a9d71238442e12d5e5e..926221f0a75c461e275a72f16b4339ae28a8e988 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -175,26 +175,26 @@ class LSTMKernelImpl : public LSTMKernel { void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, T* checked) const override { // gates: W_ch, W_ih, 
W_fh, W_oh - act_gate_d3_->Compute(gates + d_, gates + d_); + act_gate_d3_->ComputeDeprecated(gates + d_, gates + d_); /* C_t = C_t-1 * fgated + cand_gated * igated */ - act_cand_d_->Compute(gates, gates); + act_cand_d_->ComputeDeprecated(gates, gates); vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->Compute(ct, gates + d2_); + act_cell_d_->ComputeDeprecated(ct, gates + d2_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { /* C_t = igated * cgated*/ - act_gate_d_->Compute(gates + d_, gates + d_); - act_cand_d_->Compute(gates, gates); + act_gate_d_->ComputeDeprecated(gates + d_, gates + d_); + act_cand_d_->ComputeDeprecated(gates, gates); vmul_d_->Compute(gates, gates + d_, ct, d_); /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->Compute(gates + d3_, gates + d3_); - act_cell_d_->Compute(ct, gates + d2_); + act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_); + act_cell_d_->ComputeDeprecated(ct, gates + d2_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } @@ -292,32 +292,32 @@ class PeepholeKernelImpl : public LSTMKernel { vmul_d_->Compute(wp_data, ct_1, checked, d_); vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_); vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_); - act_gate_d2_->Compute(gates + d_, gates + d_); + act_gate_d2_->ComputeDeprecated(gates + d_, gates + d_); /* C_t = C_t-1 * fgated + cand_gated * igated*/ - act_cand_d_->Compute(gates, gates); + act_cand_d_->ComputeDeprecated(gates, gates); vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); /* get ogated*/ vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); - 
act_gate_d_->Compute(gates + d3_, gates + d3_); + act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_); /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->Compute(ct, gates + d2_); + act_cell_d_->ComputeDeprecated(ct, gates + d2_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { /* C_t = igated * cgated*/ - act_gate_d_->Compute(gates + d_, gates + d_); - act_cand_d_->Compute(gates, gates); + act_gate_d_->ComputeDeprecated(gates + d_, gates + d_); + act_cand_d_->ComputeDeprecated(gates, gates); vmul_d_->Compute(gates, gates + d_, ct, d_); /* get outgated, put W_oc * C_t on igated */ vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->Compute(gates + d3_, gates + d3_); - act_cell_d_->Compute(ct, gates + d2_); + act_gate_d_->ComputeDeprecated(gates + d3_, gates + d3_); + act_cell_d_->ComputeDeprecated(ct, gates + d2_); vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); } @@ -376,20 +376,20 @@ class GRUKernelImpl : public GRUKernel { } void ComputeH1(T* gates, T* ht) const override { - act_gate_d_->Compute(gates, gates); - act_state_d_->Compute(gates + d2_, gates + d2_); + act_gate_d_->ComputeDeprecated(gates, gates); + act_state_d_->ComputeDeprecated(gates + d2_, gates + d2_); vmul_d_->Compute(gates, gates + d2_, ht, d_); } void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override { // W: {W_update, W_reset; W_state} - act_gate_d2_->Compute(gates, gates); + act_gate_d2_->ComputeDeprecated(gates, gates); vmul_d_->Compute(ht_1, gates + d_, ht, d_); } void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override { T* y = gates + d2_; - act_state_d_->Compute(y, y); + act_state_d_->ComputeDeprecated(y, y); // out = zt*ht~ + (1-zt)*ht_1 for (int i = 0; i < d_; ++i) { ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; diff --git 
a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 7dc3e600b564d95b46070ff4436b2d0de2f3e105..5e1f91ffae03796be2817d0461900c2512938c77 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -92,7 +92,7 @@ TEST(JitKernel, vrelu) { #endif auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(x_data, ztgt_data); + ker->Compute(x_data, ztgt_data, d); } auto ttgte = GetCurrentUS(); VLOG(30) << "Vec size " << d @@ -181,7 +181,7 @@ TEST(JitKernel, vexp) { auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(x_data, ztgt_data); + ker->ComputeDeprecated(x_data, ztgt_data); } auto ttgte = GetCurrentUS(); @@ -222,7 +222,7 @@ void vsigmoid_better( y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); y[i] = 0.f - y[i]; } - vexp->Compute(y, y); + vexp->ComputeDeprecated(y, y); for (int i = 0; i < n; ++i) { y[i] = 1.f / (1.f + y[i]); } @@ -253,7 +253,7 @@ TEST(JitKernel, vsigmoid) { auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(x_data, ztgt_data); + ker->ComputeDeprecated(x_data, ztgt_data); } auto ttgte = GetCurrentUS(); @@ -287,7 +287,7 @@ void vtanh_better( const int n, const float* x, float* y) { const float a = 2.f, b = -1.f; vscal->Compute(&a, x, y, n); - vsigmoid->Compute(y, y); + vsigmoid->ComputeDeprecated(y, y); vscal->Compute(&a, y, y, n); vaddbias->Compute(&b, y, y, n); } @@ -321,7 +321,7 @@ TEST(JitKernel, vtanh) { auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->Compute(x_data, ztgt_data); + ker->ComputeDeprecated(x_data, ztgt_data); } auto ttgte = GetCurrentUS(); @@ -344,8 +344,8 @@ void lstm_ctht_ref( const std::shared_ptr< const paddle::operators::math::jitkernel::VExpKernel>& vexp_1, const int d, float* gates, const float* ct_1, float* ct, float* ht) { - vsigmoid_3d->Compute(gates + d, 
gates + d); - vtanh_d->Compute(gates, gates); + vsigmoid_3d->ComputeDeprecated(gates + d, gates + d); + vtanh_d->ComputeDeprecated(gates, gates); const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; const float min = SIGMOID_THRESHOLD_MIN; const float max = SIGMOID_THRESHOLD_MAX; @@ -355,7 +355,7 @@ void lstm_ctht_ref( // H_t = act_cell(C_t) * ogated float tmp = ct[k] * 2; tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); - vexp_1->Compute(&tmp, &tmp); + vexp_1->ComputeDeprecated(&tmp, &tmp); tmp = 2.f / (1.f + tmp) - 1.f; ht[k] = tmp * o[k]; } @@ -373,13 +373,13 @@ void lstm_ctht_better( const paddle::operators::math::jitkernel::VAddKernel>& vadd_d, const int d, float* gates, const float* ct_1, float* ct, float* ht) { int d2 = d * 2; - vsigmoid_3d->Compute(gates + d, gates + d); - vtanh_d->Compute(gates, gates); + vsigmoid_3d->ComputeDeprecated(gates + d, gates + d); + vtanh_d->ComputeDeprecated(gates, gates); vmul_d->Compute(gates, gates + d, gates + d, d); vmul_d->Compute(ct_1, gates + d2, gates + d2, d); vadd_d->Compute(gates + d, gates + d2, ct, d); /* H_t = act_cell(C_t) * ogated */ - vtanh_d->Compute(ct, gates + d2); + vtanh_d->ComputeDeprecated(ct, gates + d2); vmul_d->Compute(gates + d2, gates + d * 3, ht, d); } @@ -736,7 +736,7 @@ void vaddrelu_better( const paddle::operators::math::jitkernel::VReluKernel>& vrelu, const float* x, const float* y, float* z, int d) { vadd->Compute(x, y, z, d); - vrelu->Compute(z, z); + vrelu->ComputeDeprecated(z, z); } TEST(JitKernel, vaddrelu) { diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 9577a4cb9d275df9604b7578f8685e4d2938a5e9..5978c1d6056001142854583840b8bfcb54d475d1 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -244,7 +244,7 @@ typename std::enable_if< std::is_same::value>::type elementwise_add_to(const DeviceContext& ctx, BlasT* 
blas, size_t data_len, const T* in, T* out) { - for (int64_t i = 0; i < data_len; i++) { + for (size_t i = 0; i < data_len; i++) { out[i] += in[i]; } } diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 2bc008dd34ffcfe93a00bd4a8cde61626d91e235..5535523e798912ff80eeb5d753914c7d8d70a05f 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -70,11 +70,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { EXPECT_EQ(in_grad.lod(), lod); if (paddle::platform::is_cpu_place(*place)) { - for (int64_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) { + for (size_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) { int64_t begin = in_grad.lod()[0][i]; int64_t end = in_grad.lod()[0][i + 1]; paddle::framework::Tensor tmp = in_grad.Slice(begin, end); - for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) { + for (size_t j = 0; j != tmp.numel() / second_dim; ++j) { for (int64_t m = 0; m != second_dim; ++m) { EXPECT_EQ(tmp.data()[m + j * second_dim], out_grad.data()[m + i * second_dim]); @@ -82,11 +82,11 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { } } } else { - for (int64_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) { + for (size_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) { int64_t begin = cpu_in_grad.lod()[0][i]; int64_t end = cpu_in_grad.lod()[0][i + 1]; paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end); - for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) { + for (size_t j = 0; j != tmp.numel() / second_dim; ++j) { for (int64_t m = 0; m != second_dim; ++m) { EXPECT_EQ(tmp.data()[m + j * second_dim], cpu_out_grad.data()[m + i * second_dim]); diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc index 78c65af24a8c5fa57e33415acc3018790bf70790..fa2018178f44ff4e3b14937c1f508fa8a698e20e 100644 --- 
a/paddle/fluid/operators/math/softmax.cc +++ b/paddle/fluid/operators/math/softmax.cc @@ -19,8 +19,10 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index ce183ed3649055aab31eb6e3f44f2224475957e9..2e9669049e36478549b793e3fa76220825888e21 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -98,9 +98,14 @@ template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor +template class SoftmaxFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor* X, diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index dd9971ba091cc3ece86654f65c335b98087f45ed..7cf98f27251db3cfe5e8e295ed21056f6e5a2963 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -32,10 +32,10 @@ struct ValueClip { } }; -template -void SoftmaxFunctor::operator()(const DeviceContext& context, - const framework::Tensor* X, - framework::Tensor* Y) { +template +void SoftmaxFunctor::operator()( + const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y) { auto logits = EigenMatrix::From(*X); auto softmax = 
EigenMatrix::From(*Y); @@ -65,6 +65,39 @@ void SoftmaxFunctor::operator()(const DeviceContext& context, .broadcast(one_by_class)); } +template +class SoftmaxFunctor { + void operator()(const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y) { + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + + auto shifted_logits = (logits - + logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + + softmax.device(*context.eigen_device()) = shifted_logits.exp(); + softmax.device(*context.eigen_device()) = (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + } +}; + template void SoftmaxGradFunctor::operator()( const DeviceContext& context, const framework::Tensor* y, diff --git a/paddle/fluid/operators/merge_ids_op.h b/paddle/fluid/operators/merge_ids_op.h index fef9e023d02f45e21ec409ad398ba7d9bdd36880..99c57590191d58a12760fb335df76037685d1ced 100644 --- a/paddle/fluid/operators/merge_ids_op.h +++ b/paddle/fluid/operators/merge_ids_op.h @@ -43,11 +43,11 @@ class MergeIdsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(ids.size(), outs.size(), "the number of Ids and Out should be the same"); - int row_ids_size = 0; + size_t row_ids_size = 0; int row_size = 0; int embedding_size = 0; - for (int i = 0; i < x_tensors.size(); ++i) { + for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *x_tensor = x_tensors[i]; const auto *row_id = row_ids[i]; @@ -66,7 +66,7 @@ class MergeIdsOpKernel : public framework::OpKernel { std::unordered_map> selected_rows_idx_map; - for (int i = 0; i < x_tensors.size(); ++i) { + for (size_t i = 
0; i < x_tensors.size(); ++i) { const auto *row_id = row_ids[i]; for (int j = 0; j < row_id->numel(); ++j) { @@ -78,7 +78,7 @@ class MergeIdsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(row_ids_size, selected_rows_idx_map.size(), "the rows and tensor map size should be the same"); - for (int i = 0; i < outs.size(); ++i) { + for (size_t i = 0; i < outs.size(); ++i) { auto *out_ids = ids[i]; auto *out = outs[i]; diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 08f2949d4a3774894912ae5251806b46e6240702..7e434c293c9631025a5a725d62838fa12e845838 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -56,7 +56,8 @@ class MulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(x_mat_dims[1], y_mat_dims[0], "First matrix's width must be equal with second matrix's " - "height. %s, %s"); + "height. %s, %s", + x_mat_dims[1], y_mat_dims[0]); std::vector output_dims; output_dims.reserve( static_cast(x_num_col_dims + y_dims.size() - y_num_col_dims)); diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index e471f04662a1fa3e8e77a2db37f0da4521682018..877c9a0528441a7d5b1306c3f8f8be1a5aea577a 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -69,7 +69,7 @@ class NCEOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - ctx.GetPlace()); + platform::CPUPlace()); } }; @@ -174,7 +174,7 @@ class NCEOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - ctx.GetPlace()); + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 
37646c7b4c50fc7409002aca56e5462bde93cc30..685ebc393794337e03de0b9ce5134d6ce382c8bf 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -74,7 +74,7 @@ PadConstantLikeOp Operator. Pad input(Y) with a pad_value, the number of values padded to the edges of each axis is specified by the difference of the shape of X and Y. -((0, shape_x_0 - shape_y_0), … (0, shape_x_n - shape_y_n)) unique pad widths for +((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) unique pad widths for each axis. The input should be a k-D tensor(k > 0 and k < 7). As an example: diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu index 0d86b3127e42f7ee14ba57b1c762e8128a0f2d54..b21da178f3eeaafa41bde5f64cc4abcf7944b032 100644 --- a/paddle/fluid/operators/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_max_op.cu @@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_max, int, ops::MaxFunctor>, ops::ReduceKernel); -REGISTER_OP_CUDA_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_max_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..6954c8d744faee6f8f0b715d6e4c8e3bcda7fb83 --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu index 59b30244839849d79e3e531953134633503c4090..4408200d2d052c2f68c2dd35619de6ed67f07f6e 100644 --- a/paddle/fluid/operators/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -69,13 +69,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, ops::ReduceMeanKernel, ops::ReduceMeanKernel, ops::ReduceMeanKernel); - -REGISTER_OP_CUDA_KERNEL( - reduce_mean_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_mean_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..4b663bcdca7c20f8802d962a362f429d8eafe9af --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.part.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// .part used to speed up nvcc compile +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_mean_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu index da466f805eff4709dc23471baef03e94052ee6c1..5a04a12b79444dcea30d3c1140d9708a98b55fe3 100644 --- a/paddle/fluid/operators/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_min_op.cu @@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_min, int, ops::MinFunctor>, ops::ReduceKernel); -REGISTER_OP_CUDA_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_min_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..5b8f061b2d03eb76863401905ac87044fd5ea778 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu index d62e677d92cffecf629d1684026b0c7bcfec29e3..d8692afb96e4d5d3206210060684dd12fb4d79a7 100644 --- a/paddle/fluid/operators/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_prod_op.cu @@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod, int, ops::ProdFunctor>, ops::ReduceKernel); -REGISTER_OP_CUDA_KERNEL( - reduce_prod_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_prod_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..486c578c64b9a2d80abc940a7c4266ef5fd23c7f --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_prod_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu index 53cd9e9419dd9aecee730917ae21d7a4ab332ffc..2b031e8df99768c9208146640bddbe51149b2614 100644 --- a/paddle/fluid/operators/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -64,13 +64,3 @@ class ReduceSumKernel : public framework::OpKernel { REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel); - -REGISTER_OP_CUDA_KERNEL( - reduce_sum_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_sum_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..525633f62a95b2d0d677fcbebe551b75cb2a180d --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.part.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/cub_reduce.h" +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_sum_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/ref_by_trainer_id_op.h b/paddle/fluid/operators/ref_by_trainer_id_op.h index 2ce577544ae2437b9297da2190fd09b435d5173c..34192278d84758d720e021215c14a54349ba0c62 100644 --- a/paddle/fluid/operators/ref_by_trainer_id_op.h +++ b/paddle/fluid/operators/ref_by_trainer_id_op.h @@ -38,7 +38,7 @@ class RefByTrainerIdKernel : public framework::OpKernel { } else { trainer_id = *trainer_id_data; } - PADDLE_ENFORCE_LT(trainer_id, in_list.size()); + PADDLE_ENFORCE_LT((size_t)trainer_id, in_list.size()); out->mutable_data(context.GetPlace()); out->ShareDataWith(*(in_list[trainer_id])); } diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 8e29761ec208764e263e357a0b3c9456c932d093..043ea680d1506e7b7e33ba5537a71f37feaf81be 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -122,7 +122,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), " "Argmaxes corresponding to indices in X used " "for gradient computation. 
Only output " - "if arg “is_test” is false.") + "if arg \"is_test\" is false.") .AsIntermediate(); AddAttr("spatial_scale", "(float, default 1.0), " diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index ac7d69bfb549fd98c76fcf834e8d3ad9bec2ef23..b2e79f6c82bb748293f4219845e6798347c8c46e 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -51,7 +51,8 @@ void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { // PADDLE_ENFORCE(platform::is_gpu_place(place)); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); int index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h index 39af717615c01f5c121e32b176b74d05be738531..8bae6606c94620ab4fa8ae34f69236e7e87e9670 100644 --- a/paddle/fluid/operators/scatter.h +++ b/paddle/fluid/operators/scatter.h @@ -37,7 +37,8 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); int index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h index 2e206c963ea009b436bb03433d30683a29fe83aa..b27ef27e298d0f08129e2c0a349c741129acdfe2 100644 --- a/paddle/fluid/operators/sgd_op.h +++ b/paddle/fluid/operators/sgd_op.h @@ -109,8 +109,6 @@ class SGDOpKernel : public framework::OpKernel { const auto *grad_data = grad.value().data(); auto *out_data = param_out->mutable_value()->data(); for (size_t i = 0; i < grad.rows().size(); i++) { - 
PADDLE_ENFORCE(grad.rows()[i] < grad.height(), - "Input rows index should less than height"); int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false); PADDLE_ENFORCE_GE(id_index, static_cast(0), "id should be in the table"); diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index cf1eeb017d666f605a431aa54637d8cbc99c7c46..2fea8a65bc5141b11549ef400f11b54278be35f9 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -35,8 +35,13 @@ class SoftmaxKernel : public framework::OpKernel { Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); - math::SoftmaxFunctor()( +#ifdef ON_INFER + math::SoftmaxFunctor()( context.template device_context(), &X_2d, &Out_2d); +#else + math::SoftmaxFunctor()( + context.template device_context(), &X_2d, &Out_2d); +#endif } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index e9aba3b37b8cc01d4fe5de5200579d4e93f67e56..c0530e3d8bc407ddd6d7bf6e10a715185d0beb1f 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -42,8 +42,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); - math::SoftmaxFunctor()(dev_ctx, logits, - softmax); + math::SoftmaxFunctor()( + dev_ctx, logits, softmax); math::CrossEntropyFunctor()( dev_ctx, loss, softmax, labels, context.Attr("soft_label"), context.Attr("ignore_index")); diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index 6dbada3da8826f0e7cb07a9642d327e5ee38c309..f5d6d85d7d75507f82de212812ecee0a650d3aad 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -64,7 +64,7 @@ class SplitIdsOpKernel : public framework::OpKernel { out_ids.resize(outs.size()); // 
split id by their shard_num. - for (int i = 0; i < all_ids.size(); ++i) { + for (size_t i = 0; i < all_ids.size(); ++i) { T id = all_ids[i]; size_t shard_id = static_cast(id) % shard_num; out_ids[shard_id].push_back(id); diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index 3f4b48bc7391def082c82ed451fc5a752009a2f1..9345b495415d203728238c19621a20f446c40bf5 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -21,8 +21,12 @@ REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel, - ops::StackKernel); + ops::StackKernel, + ops::StackKernel, + ops::StackKernel); REGISTER_OP_CPU_KERNEL(stack_grad, ops::StackGradKernel, - ops::StackGradKernel); + ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index 92c1bde2bcf089e5c715e90e564408e6ad37ba17..bf2a9e5b3d22996e688621727cb280dc9aed7859 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -18,8 +18,12 @@ namespace plat = paddle::platform; namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel, - ops::StackKernel); + ops::StackKernel, + ops::StackKernel, + ops::StackKernel); REGISTER_OP_CUDA_KERNEL(stack_grad, ops::StackGradKernel, - ops::StackGradKernel); + ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 1d441b43b14ea194152095874645f8133c423efd..6d2ccb38f677962d52ff97df25321bf195759dcd 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -57,8 +57,8 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is: $(N, C_{out}, H_{out}, W_{out})$, where $$ 
-H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ -W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] +H_{out} = (H_{in}-1) * strides[0] - 2 * paddings[0] + ksize[0] \\ +W_{out} = (W_{in}-1) * strides[1] - 2 * paddings[1] + ksize[1] $$ Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf )DOC"); diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index aa20553ceffceded09447693c6e92f55fb48702d..9273e9b1e72f0ad7abd6c20d4a34283fbe24378a 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -76,6 +76,10 @@ extern void* mklml_dso_handle; __macro(vdMul); \ __macro(vsExp); \ __macro(vdExp); \ + __macro(vsSqr); \ + __macro(vdSqr); \ + __macro(vsPowx); \ + __macro(vdPowx); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 0f3298f6006ade66ccacff4f33338ba3b6b73ebb..9f7aa556988e8ab0ca87d0e7212fe27a209f6a32 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -115,6 +115,14 @@ void InitDevices(bool init_p2p, const std::vector devices) { } places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); + +// windows has no support for openblas multi-thread +#ifdef _WIN32 + if (FLAGS_paddle_num_threads > 1) { + FLAGS_paddle_num_threads = 1; + } +#endif + #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); #endif @@ -170,7 +178,9 @@ void InitGLOG(const std::string &prog_name) { // glog will not hold the ARGV[0] inside. // Use strdup to alloc a new string. 
google::InitGoogleLogging(strdup(prog_name.c_str())); +#ifndef _WIN32 google::InstallFailureSignalHandler(); +#endif } } // namespace framework diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 40af1f95208905231b933e5184a807b061164799..a6360a884d74f06603f28efeb36e39fbd0257cf6 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef _WIN32 #pragma once #include @@ -149,3 +150,4 @@ struct NCCLContextMap { } // namespace platform } // namespace paddle +#endif diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index cf9f4aa95bc1cb79d95b79331fbc09e11af64194..8823e97b0b696556b32724acd096e8fc79a49f53 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -24,21 +24,16 @@ #include "glog/logging.h" #if !defined(_WIN32) -#define UNUSED __attribute__((unused)) #include // dladdr #include // backtrace #include #include // std::accumulate #else #include // _popen, _pclose +#include #include -#if defined(_WIN32) #include // std::accumulate in msvc -#endif -// windows version of __attribute__((unused)) -#define UNUSED __pragma(warning(suppress : 4100)) - -#ifndef S_ISDIR // windows port for sys/stat.h +#ifndef S_ISDIR // windows port for sys/stat.h #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) #endif // S_ISDIR diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index 86c5f87f34d648d4a784eba2014a4f896d74724b..e9aef621acea44b0dab7a687c13223617d5603c0 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -43,3 +43,11 @@ limitations under the License. 
*/ #include #include #include + +// some platform-independent defintion +#if defined(_WIN32) +#define UNUSED +#define __builtin_expect(EXP, C) (EXP) +#else +#define UNUSED __attribute__((unused)) +#endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index e7f634c4a622b48e97040987836406cf73cb23b6..6afa53cd36dee2004ada707a52995ce33b3650e9 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,9 +2,9 @@ set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) if(NOT WIN32) -list(APPEND PYBIND_DEPS parallel_executor profiler) -list(APPEND PYBIND_SRCS recordio.cc) -endif() + list(APPEND PYBIND_DEPS parallel_executor profiler) + list(APPEND PYBIND_SRCS recordio.cc) +endif(NOT WIN32) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED @@ -21,5 +21,13 @@ if(WITH_PYTHON) endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) endif(WITH_AMD_GPU) + if(WIN32) + if(WITH_GPU AND NOT WITH_DSO) + get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) + target_link_libraries(paddle_pybind ${cuda_modules}) + endif(WITH_GPU AND NOT WITH_DSO) + target_link_libraries(paddle_pybind shlwapi) + endif(WIN32) + cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 806b304be593463690063e9d454083ed7fc88b98..8ff6f6c85ace4bdfb14a2e9c82b1e07d01fc0f4c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -21,6 +21,13 @@ limitations under the License. 
*/ #include #include +#if defined(_WIN32) +#define NOMINMAX +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#define GOOGLE_GLOG_DLL_DECL +#include +#endif + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" @@ -29,7 +36,9 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" +#ifndef _WIN32 #include "paddle/fluid/framework/parallel_executor.h" +#endif #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" @@ -51,7 +60,9 @@ limitations under the License. */ #include "paddle/fluid/string/to_string.h" #ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif #include "paddle/fluid/platform/cuda_profiler.h" #include "paddle/fluid/platform/gpu_info.h" #endif @@ -342,22 +353,25 @@ All parameter, weight, gradient are variables in Paddle. .def("get_lod_tensor_array", [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) .def("get_communicator", [](Variable &self) -> platform::Communicator * { return self.GetMutable(); }, py::return_value_policy::reference) -#endif .def("get_reader", [](Variable &self) -> framework::ReaderHolder * { PADDLE_ENFORCE(self.IsType()); return self.GetMutable(); }, - py::return_value_policy::reference); + py::return_value_policy::reference) +#endif + ; +#if !defined(_WIN32) py::class_(m, "Reader", "") .def("reset", &framework::ReaderHolder::ResetAll); +#endif using LoDTensorBlockingQueue = ::paddle::operators::reader::LoDTensorBlockingQueue; @@ -482,7 +496,7 @@ All parameter, weight, gradient are variables in Paddle. 
#endif });; // clang-format on -#ifdef PADDLE_WITH_CUDA +#if (defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace") @@ -619,11 +633,14 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_CUDA m.def("get_cuda_device_count", platform::GetCUDADeviceCount); +#ifndef _WIN32 m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_stop", platform::CudaProfilerStop); #endif +#endif +#ifndef _WIN32 py::enum_(m, "ProfilerState", py::arithmetic()) .value("kDisabled", platform::ProfilerState::kDisabled) .value("kCPU", platform::ProfilerState::kCPU) @@ -644,6 +661,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("disable_profiler", platform::DisableProfiler); m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); +#endif py::class_> pass(m, "Pass"); pass.def(py::init()) @@ -652,9 +670,9 @@ All parameter, weight, gradient are variables in Paddle. [](ir::Pass &self, const std::string &name, const std::string &attr) { self.Set(name, new std::string(attr)); }) - .def("set_int", [](ir::Pass &self, const std::string &name, int val) { - self.Set(name, new int(val)); - }); + .def("set_int", [](ir::Pass &self, const std::string &name, + int val) { self.Set(name, new int(val)); }) + .def("type", &ir::Pass::Type); py::class_> pb( m, "PassBuilder"); @@ -672,6 +690,7 @@ All parameter, weight, gradient are variables in Paddle. .def("remove_pass", [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); +#ifndef _WIN32 // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( @@ -793,6 +812,7 @@ All parameter, weight, gradient are variables in Paddle. 
"reduce_strategy", [](const BuildStrategy &self) { return self.reduce_; }, [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.reduce_ = strategy; }, R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor, @@ -806,6 +826,7 @@ All parameter, weight, gradient are variables in Paddle. [](const BuildStrategy &self) { return self.gradient_scale_; }, [](BuildStrategy &self, BuildStrategy::GradientScaleStrategy strategy) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.gradient_scale_ = strategy; }, R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in @@ -817,6 +838,7 @@ All parameter, weight, gradient are variables in Paddle. "debug_graphviz_path", [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, [](BuildStrategy &self, const std::string &path) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.debug_graphviz_path_ = path; }, R"DOC(The type is STR, debug_graphviz_path indicate the path that @@ -826,6 +848,7 @@ All parameter, weight, gradient are variables in Paddle. "enable_data_balance", [](const BuildStrategy &self) { return self.enable_data_balance_; }, [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.enable_data_balance_ = b; }) // FIXME(chengudo): enable_data_balance seems not important .def_property( @@ -834,6 +857,7 @@ All parameter, weight, gradient are variables in Paddle. return self.enable_sequential_execution_; }, [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.enable_sequential_execution_ = b; }, R"DOC(The type is BOOL. If set True, the execution order of ops would be the same as what is in the program. Default False.)DOC") @@ -843,6 +867,7 @@ All parameter, weight, gradient are variables in Paddle. 
return self.remove_unnecessary_lock_; }, [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.remove_unnecessary_lock_ = b; }, R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC") @@ -852,15 +877,19 @@ All parameter, weight, gradient are variables in Paddle. return self.fuse_elewise_add_act_ops_; }, [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.fuse_elewise_add_act_ops_ = b; }, R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether to fuse elementwise_add_op and activation_op, it may make the execution faster. Default False)DOC") - .def("_create_passes_from_strategy", + .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { - return self.CreatePassesFromStrategy(); - }); + return self.CreatePassesFromStrategy(true); + }, + R"DOC(Allow user to customized passes. Normally model-specific + optimization passes should be defined in this way. BuildStrategy + cannot be updated after being finalized.)DOC"); pe.def(py::init &, const std::unordered_set &, @@ -889,6 +918,7 @@ All parameter, weight, gradient are variables in Paddle. }); BindRecordIOWriter(&m); +#endif return m.ptr(); } } // namespace pybind diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 47de23377398423dabf3b0ed5b670e564f57cdfb..a2eec6e3c48dd126614bbff0227145537b678ac4 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -72,6 +72,7 @@ #include #include #include +#include #include "tinyformat/tinyformat.h" // https://github.com/c42f/tinyformat @@ -102,5 +103,22 @@ void Printf(const char* fmt, const Args&... 
args) { Fprintf(std::cout, fmt, args...); } +template +std::string HumanReadableSize(T size) { + size_t i = 0; + double f_size = static_cast(size); + double orig = f_size; + const std::vector units( + {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}); + while (f_size > 1024) { + f_size /= 1024; + i++; + } + if (i >= units.size()) { + return Sprintf("%fB", orig); + } + return Sprintf("%f%s", f_size, units[i]); +} + } // namespace string } // namespace paddle diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 595af0c29a5c2c30427a82205d3d56c49492c0c8..32f9bca645d80a11274d128b6615a73ffa224705 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -139,6 +139,7 @@ function cmake_gen() { -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_MKL=${WITH_MKL:-ON} + -DWITH_NGRAPH=${WITH_NGRAPH:-OFF} -DWITH_AVX=${WITH_AVX:-OFF} -DWITH_GOLANG=${WITH_GOLANG:-OFF} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} @@ -155,6 +156,8 @@ function cmake_gen() { -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} + -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF} + -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON} -DPY_VERSION=${PY_VERSION:-2.7} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} ======================================== @@ -171,6 +174,7 @@ EOF -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_MKL=${WITH_MKL:-ON} \ + -DWITH_NGRAPH=${WITH_NGRAPH:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ @@ -186,6 +190,8 @@ EOF -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ + -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\ + 
-DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\ -DPY_VERSION=${PY_VERSION:-2.7} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} @@ -775,6 +781,17 @@ function main() { test_fluid_lib assert_api_spec_approvals ;; + assert_api) + assert_api_not_changed ${PYTHON_ABI:-""} + ;; + test_inference) + gen_capi_package + gen_fluid_lib + test_fluid_lib + ;; + assert_api_approvals) + assert_api_spec_approvals + ;; maccheck) cmake_gen ${PYTHON_ABI:-""} build_mac diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0d29f2ad209296688582924ae16e495930830bd4..139176b0d6c5dff511a97c9ac01f09e72a90306b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -45,23 +45,42 @@ endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) - -set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) -add_custom_command(OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - DEPENDS paddle_pybind) +IF(WIN32) + # Python would use the .pyd by default under Windows series platform + set(FLUID_DST_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/) + get_filename_component(openblas_refpath ${CBLAS_LIBRARIES} DIRECTORY) + set(FLUID_CORE ${FLUID_DST_DIR}/core.pyd) + add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + COMMAND cmake -E copy ${openblas_refpath}/openblas.dll ${FLUID_DST_DIR} + DEPENDS paddle_pybind) +ELSE() + set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so) + add_custom_command(OUTPUT ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE} + DEPENDS paddle_pybind) +ENDIF() add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) - -add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp - COMMAND touch stub.cc - COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python - COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ - COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py 
bdist_wheel - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp - COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python - COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib.* ${PADDLE_PYTHON_BUILD_DIR}/lib-python - DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) +IF(WIN32) + add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/ + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ + COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python + DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) +ELSE(WIN32) + add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND touch stub.cc + COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python + COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/ + COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp + COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python + DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) +ENDIF() set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp 
${MKL_DEPENDS}) if(NOT WITH_FLUID_ONLY) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 78d79317aa68d71059a0603f22e44de45a02314c..b99197492870e9886e8a29f8c46723401a2a5ce1 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +import os # import all class inside framework into fluid module from . import framework from .framework import * @@ -111,12 +112,16 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) read_env_flags = [ - 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', + 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', + 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb', 'allocator_strategy', 'reader_queue_speed_test_mode' ] + if os.name != 'nt': + read_env_flags.append('warpctc_dir') + read_env_flags.append('cpu_deterministic') + if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_server_profile_path') diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py index b8d5f4ffeadca0a7b103682f175d50dc46fa258a..b966ae01d039d7e9510dae73ecadb97b494f68c2 100644 --- a/python/paddle/fluid/contrib/inferencer.py +++ b/python/paddle/fluid/contrib/inferencer.py @@ -15,13 +15,15 @@ from __future__ import print_function import contextlib +import os from .. import core from .. import executor from .. import framework from .. import io -from .. import parallel_executor +if os.name != 'nt': + from .. import parallel_executor from .. 
import unique_name from .trainer import check_and_get_place diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py index 8569e486f91786b5562e84dcdccf6d91da0612cc..096821a5ba690074ecbf023cf87fed7e206d023f 100644 --- a/python/paddle/fluid/contrib/trainer.py +++ b/python/paddle/fluid/contrib/trainer.py @@ -28,7 +28,8 @@ from .. import framework from .. import io # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module from .. import optimizer as opt_module -from .. import parallel_executor +if os.name != 'nt': + from .. import parallel_executor from ..transpiler import distribute_transpiler __all__ = [ diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 4ac94981a7a47530fe6ae4d968212c62dd3e0a93..96b6705e26c0f8d8d223e9020192a8f330c2c727 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -31,6 +31,7 @@ from functools import reduce __all__ = [ 'prior_box', + 'density_prior_box', 'multi_box_head', 'bipartite_match', 'target_assign', @@ -1023,6 +1024,135 @@ def prior_box(input, return box, var +def density_prior_box(input, + image, + densities=None, + fixed_sizes=None, + fixed_ratios=None, + variance=[0.1, 0.1, 0.2, 0.2], + clip=False, + steps=[0.0, 0.0], + offset=0.5, + name=None): + """ + **Density Prior Box Operator** + + Generate density prior boxes for SSD(Single Shot MultiBox Detector) + algorithm. Each position of the input produce N prior boxes, N is + determined by the count of densities, fixed_sizes and fixed_ratios. + Boxes center at grid points around each input position is generated by + this operator, and the grid points is determined by densities and + the count of density prior box is determined by fixed_sizes and fixed_ratios. + Obviously, the number of fixed_sizes is equal to the number of densities. 
+ For densities_i in densities: + N_density_prior_box =sum(N_fixed_ratios * densities_i^2), + + Args: + input(Variable): The Input Variables, the format is NCHW. + image(Variable): The input image data of PriorBoxOp, + the layout is NCHW. + densities(list|tuple|None): the densities of generated density prior + boxes, this attribute should be a list or tuple of integers. + Default: None. + fixed_sizes(list|tuple|None): the fixed sizes of generated density + prior boxes, this attribute should a list or tuple of same + length with :attr:`densities`. Default: None. + fixed_ratios(list|tuple|None): the fixed ratios of generated density + prior boxes, if this attribute is not set and :attr:`densities` + and :attr:`fix_sizes` is set, :attr:`aspect_ratios` will be used + to generate density prior boxes. + variance(list|tuple): the variances to be encoded in density prior boxes. + Default:[0.1, 0.1, 0.2, 0.2]. + clip(bool): Whether to clip out-of-boundary boxes. Default: False. + step(list|turple): Prior boxes step across width and height, If + step[0] == 0.0/step[1] == 0.0, the density prior boxes step across + height/weight of the input will be automatically calculated. + Default: [0., 0.] + offset(float): Prior boxes center offset. Default: 0.5 + name(str): Name of the density prior box op. Default: None. + + Returns: + tuple: A tuple with two Variable (boxes, variances) + + boxes: the output density prior boxes of PriorBox. + The layout is [H, W, num_priors, 4]. + H is the height of input, W is the width of input, + num_priors is the total + box count of each position of input. + + variances: the expanded variances of PriorBox. + The layout is [H, W, num_priors, 4]. + H is the height of input, W is the width of input + num_priors is the total + box count of each position of input + + + Examples: + .. 
code-block:: python + + box, var = fluid.layers.density_prior_box( + input=conv1, + image=images, + min_sizes=[100.], + max_sizes=[200.], + aspect_ratios=[1.0, 1.0 / 2.0, 2.0], + densities=[3, 4], + fixed_sizes=[50., 60.], + fixed_ratios=[1.0, 3.0, 1.0 / 3.0], + flip=True, + clip=True) + """ + helper = LayerHelper("density_prior_box", **locals()) + dtype = helper.input_dtype() + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(densities): + raise TypeError('densities should be a list or a tuple or None.') + if not _is_list_or_tuple_(fixed_sizes): + raise TypeError('fixed_sizes should be a list or a tuple or None.') + if not _is_list_or_tuple_(fixed_ratios): + raise TypeError('fixed_ratios should be a list or a tuple or None.') + if len(densities) != len(fixed_sizes): + raise ValueError('densities and fixed_sizes length should be euqal.') + if not (_is_list_or_tuple_(steps) and len(steps) == 2): + raise ValueError('steps should be a list or tuple ', + 'with length 2, (step_width, step_height).') + + densities = list(map(int, densities)) + fixed_sizes = list(map(float, fixed_sizes)) + fixed_ratios = list(map(float, fixed_ratios)) + steps = list(map(float, steps)) + + attrs = { + 'variances': variance, + 'clip': clip, + 'step_w': steps[0], + 'step_h': steps[1], + 'offset': offset, + } + if densities is not None and len(densities) > 0: + attrs['densities'] = densities + if fixed_sizes is not None and len(fixed_sizes) > 0: + attrs['fixed_sizes'] = fixed_sizes + if fixed_ratios is not None and len(fixed_ratios) > 0: + attrs['fixed_ratios'] = fixed_ratios + + box = helper.create_variable_for_type_inference(dtype) + var = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="density_prior_box", + inputs={"Input": input, + "Image": image}, + outputs={"Boxes": box, + "Variances": var}, + attrs=attrs, ) + box.stop_gradient = True + var.stop_gradient = True + return box, var + + 
def multi_box_head(inputs, image, base_size, diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 1ab48c00548b58f4b3e411d8e46e8cf496d6b891..a9075045a2d5282ecded1681bc9835feb15298ea 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -15,6 +15,7 @@ from __future__ import print_function import contextlib import multiprocessing +import os import six import threading @@ -346,70 +347,72 @@ def _copy_reader_create_op_(block, op): return new_op -@templatedoc(op_type='create_recordio_file_reader') -def open_recordio_file(filename, - shapes, - lod_levels, - dtypes, - pass_num=1, - for_parallel=True): - """ - ${comment} - - Args: - filename(${filename_type}): ${filename_comment}. - shapes(list): List of tuples which declaring data shapes. - lod_levels(${lod_levels_type}): ${lod_levels_comment}. - dtypes(list): List of strs which declaring data type. - pass_num(int): Number of passes to run. - for_parallel(Bool): Set it as True if you are going to run - subsequent operators in parallel. - - Returns: - ${out_comment}. - - Examples: - - >>> import paddle.fluid as fluid - >>> reader = fluid.layers.io.open_recordio_file( - >>> filename='./data.recordio', - >>> shapes=[(3,224,224), (1)], - >>> lod_levels=[0, 0], - >>> dtypes=['float32', 'int64']) - >>> # Via the reader, we can use 'read_file' layer to get data: - >>> image, label = fluid.layers.io.read_file(reader) - """ - dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] - shape_concat = [] - ranks = [] +if os.name != 'nt': + + @templatedoc(op_type='create_recordio_file_reader') + def open_recordio_file(filename, + shapes, + lod_levels, + dtypes, + pass_num=1, + for_parallel=True): + """ + ${comment} + + Args: + filename(${filename_type}): ${filename_comment}. + shapes(list): List of tuples which declaring data shapes. + lod_levels(${lod_levels_type}): ${lod_levels_comment}. + dtypes(list): List of strs which declaring data type. 
+ pass_num(int): Number of passes to run. + for_parallel(Bool): Set it as True if you are going to run + subsequent operators in parallel. + + Returns: + ${out_comment}. + + Examples: + + >>> import paddle.fluid as fluid + >>> reader = fluid.layers.io.open_recordio_file( + >>> filename='./data.recordio', + >>> shapes=[(3,224,224), (1)], + >>> lod_levels=[0, 0], + >>> dtypes=['float32', 'int64']) + >>> # Via the reader, we can use 'read_file' layer to get data: + >>> image, label = fluid.layers.io.read_file(reader) + """ + dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] + shape_concat = [] + ranks = [] - for shape in shapes: - shape_concat.extend(shape) - ranks.append(len(shape)) + for shape in shapes: + shape_concat.extend(shape) + ranks.append(len(shape)) - var_name = unique_name('open_recordio_file') + var_name = unique_name('open_recordio_file') - startup_blk = default_startup_program().current_block() - startup_var = startup_blk.create_var(name=var_name) - startup_blk.append_op( - type='create_recordio_file_reader', - outputs={'Out': [startup_var]}, - attrs={ - 'shape_concat': shape_concat, - 'lod_levels': lod_levels, - 'filename': filename, - 'ranks': ranks - }) + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=var_name) + startup_blk.append_op( + type='create_recordio_file_reader', + outputs={'Out': [startup_var]}, + attrs={ + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'filename': filename, + 'ranks': ranks + }) - startup_var.desc.set_dtypes(dtypes) - startup_var.persistable = True - main_prog_var = _copy_reader_var_(default_main_program().current_block(), - startup_var) + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + main_prog_var = _copy_reader_var_( + default_main_program().current_block(), startup_var) - if pass_num > 1: - main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) + if pass_num > 1: + main_prog_var = 
multi_pass(reader=main_prog_var, pass_num=pass_num) - return monkey_patch_reader_methods(main_prog_var) + return monkey_patch_reader_methods(main_prog_var) def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c1a93e831b6df02c2c4cfa34cd498b7b147a8b1b..1b5009e76126e8b0fbe2805cc85f4f133a70c1ae 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -18,6 +18,7 @@ All layers just related to the neural network. from __future__ import print_function import numpy as np +import os from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable, OpProtoHolder @@ -165,6 +166,7 @@ __all__ = [ 'grid_sampler', 'log_loss', 'add_position_encoding', + 'bilinear_tensor_product', ] @@ -340,126 +342,128 @@ def embedding(input, return tmp -@templatedoc(op_type="lstm") -def dynamic_lstm(input, - size, - h_0=None, - c_0=None, - param_attr=None, - bias_attr=None, - use_peepholes=True, - is_reverse=False, - gate_activation='sigmoid', - cell_activation='tanh', - candidate_activation='tanh', - dtype='float32', - name=None): - """ - ${comment} +if os.name != 'nt': - Args: - input (Variable): ${input_comment} - size (int): 4 * hidden size. - h_0(Variable): The initial hidden state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size and D is the hidden size. - c_0(Variable): The initial cell state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size. `h_0` and `c_0` can be NULL but only at the same time. - param_attr(ParamAttr|None): The parameter attribute for the learnable - hidden-hidden weights. - - - Weights = {:math:`W_{ch}, W_{ih}, \ - W_{fh}, W_{oh}`} - - The shape is (D x 4D), where D is the hidden - size. 
- - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as param_attr. - If the Initializer of the param_attr is not set, the - parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The bias attribute for the learnable bias - weights, which contains two parts, input-hidden - bias weights and peephole connections weights if - setting `use_peepholes` to `True`. - - 1. `use_peepholes = False` - - Biases = {:math:`b_c, b_i, b_f, b_o`}. - - The shape is (1 x 4D). - 2. `use_peepholes = True` - - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ - W_{fc}, W_{oc}`}. - - The shape is (1 x 7D). - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as bias_attr. - If the Initializer of the bias_attr is not set, - the bias is initialized zero. Default: None. - use_peepholes (bool): ${use_peepholes_comment} - is_reverse (bool): ${is_reverse_comment} - gate_activation (str): ${gate_activation_comment} - cell_activation (str): ${cell_activation_comment} - candidate_activation (str): ${candidate_activation_comment} - dtype (str): Data type. Choices = ["float32", "float64"], default "float32". - name (str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - - Returns: - tuple: The hidden state, and cell state of LSTM. The shape of both \ - is (T x D), and lod is the same with the `input`. - - Examples: - .. code-block:: python - - hidden_dim = 512 - forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, - bias_attr=False) - forward, _ = fluid.layers.dynamic_lstm( - input=forward_proj, size=hidden_dim * 4, use_peepholes=False) - """ - assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." 
- helper = LayerHelper('lstm', **locals()) - size = size // 4 - weight = helper.create_parameter( - attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) - bias_size = [1, 7 * size] - if not use_peepholes: - bias_size[1] = 4 * size - bias = helper.create_parameter( - attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + @templatedoc(op_type="lstm") + def dynamic_lstm(input, + size, + h_0=None, + c_0=None, + param_attr=None, + bias_attr=None, + use_peepholes=True, + is_reverse=False, + gate_activation='sigmoid', + cell_activation='tanh', + candidate_activation='tanh', + dtype='float32', + name=None): + """ + ${comment} + + Args: + input (Variable): ${input_comment} + size (int): 4 * hidden size. + h_0(Variable): The initial hidden state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size and D is the hidden size. + c_0(Variable): The initial cell state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size. `h_0` and `c_0` can be NULL but only at the same time. + param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weights. + + - Weights = {:math:`W_{ch}, W_{ih}, \ + W_{fh}, W_{oh}`} + - The shape is (D x 4D), where D is the hidden + size. + + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as param_attr. + If the Initializer of the param_attr is not set, the + parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|None): The bias attribute for the learnable bias + weights, which contains two parts, input-hidden + bias weights and peephole connections weights if + setting `use_peepholes` to `True`. + + 1. `use_peepholes = False` + - Biases = {:math:`b_c, b_i, b_f, b_o`}. + - The shape is (1 x 4D). + 2. `use_peepholes = True` + - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ + W_{fc}, W_{oc}`}. + - The shape is (1 x 7D). 
+ + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as bias_attr. + If the Initializer of the bias_attr is not set, + the bias is initialized zero. Default: None. + use_peepholes (bool): ${use_peepholes_comment} + is_reverse (bool): ${is_reverse_comment} + gate_activation (str): ${gate_activation_comment} + cell_activation (str): ${cell_activation_comment} + candidate_activation (str): ${candidate_activation_comment} + dtype (str): Data type. Choices = ["float32", "float64"], default "float32". + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + tuple: The hidden state, and cell state of LSTM. The shape of both \ + is (T x D), and lod is the same with the `input`. + + Examples: + .. code-block:: python + + hidden_dim = 512 + forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=forward_proj, size=hidden_dim * 4, use_peepholes=False) + """ + assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." 
+ helper = LayerHelper('lstm', **locals()) + size = size // 4 + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) + bias_size = [1, 7 * size] + if not use_peepholes: + bias_size[1] = 4 * size + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) - hidden = helper.create_variable_for_type_inference(dtype) - cell = helper.create_variable_for_type_inference(dtype) - batch_gate = helper.create_variable_for_type_inference(dtype) - batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) - inputs = {'Input': input, 'Weight': weight, 'Bias': bias} - batch_size = input.shape[0] - if h_0: - assert h_0.shape == (batch_size, size), \ - 'The shape of h0 should be (batch_size, %d)' % size - inputs['H0'] = h_0 - if c_0: - assert c_0.shape == (batch_size, size), \ - 'The shape of c0 should be (batch_size, %d)' % size - inputs['C0'] = c_0 + hidden = helper.create_variable_for_type_inference(dtype) + cell = helper.create_variable_for_type_inference(dtype) + batch_gate = helper.create_variable_for_type_inference(dtype) + batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) + inputs = {'Input': input, 'Weight': weight, 'Bias': bias} + batch_size = input.shape[0] + if h_0: + assert h_0.shape == (batch_size, size), \ + 'The shape of h0 should be (batch_size, %d)' % size + inputs['H0'] = h_0 + if c_0: + assert c_0.shape == (batch_size, size), \ + 'The shape of c0 should be (batch_size, %d)' % size + inputs['C0'] = c_0 - helper.append_op( - type='lstm', - inputs=inputs, - outputs={ - 'Hidden': hidden, - 'Cell': cell, - 'BatchGate': batch_gate, - 'BatchCellPreAct': batch_cell_pre_act - }, - attrs={ - 'use_peepholes': use_peepholes, - 'is_reverse': is_reverse, - 'gate_activation': gate_activation, - 'cell_activation': cell_activation, - 'candidate_activation': candidate_activation - }) - return hidden, cell + helper.append_op( + type='lstm', + inputs=inputs, + 
outputs={ + 'Hidden': hidden, + 'Cell': cell, + 'BatchGate': batch_gate, + 'BatchCellPreAct': batch_cell_pre_act + }, + attrs={ + 'use_peepholes': use_peepholes, + 'is_reverse': is_reverse, + 'gate_activation': gate_activation, + 'cell_activation': cell_activation, + 'candidate_activation': candidate_activation + }) + return hidden, cell def dynamic_lstmp(input, @@ -958,39 +962,43 @@ def linear_chain_crf(input, label, param_attr=None): return log_likelihood -@templatedoc() -def crf_decoding(input, param_attr, label=None): - """ - ${comment} +if os.name != 'nt': - Args: - input(${emission_type}): ${emission_comment} + @templatedoc() + def crf_decoding(input, param_attr, label=None): + """ + ${comment} - param_attr(ParamAttr): The parameter attribute for training. + Args: + input(${emission_type}): ${emission_comment} - label(${label_type}): ${label_comment} + param_attr(ParamAttr): The parameter attribute for training. - Returns: - Variable: ${viterbi_path_comment} + label(${label_type}): ${label_comment} - Examples: - .. code-block:: python + Returns: + Variable: ${viterbi_path_comment} - crf_decode = layers.crf_decoding( - input=hidden, param_attr=ParamAttr(name="crfw")) - """ - helper = LayerHelper('crf_decoding', **locals()) - transition = helper.get_parameter(param_attr.name) - viterbi_path = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) - helper.append_op( - type='crf_decoding', - inputs={"Emission": [input], + Examples: + .. 
code-block:: python + + crf_decode = layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + """ + helper = LayerHelper('crf_decoding', **locals()) + transition = helper.get_parameter(param_attr.name) + viterbi_path = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + helper.append_op( + type='crf_decoding', + inputs={ + "Emission": [input], "Transition": transition, - "Label": label}, - outputs={"ViterbiPath": [viterbi_path]}) + "Label": label + }, + outputs={"ViterbiPath": [viterbi_path]}) - return viterbi_path + return viterbi_path @templatedoc() @@ -4066,8 +4074,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None): Examples: .. code-block:: python - x = fluid.layers.data(name='x', shape=[8], dtype='float32') - y = fluid.layers.data(name='y', shape=[7], dtype='float32') + x = fluid.layers.data(name='x', shape=[1], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.edit_distance(input=x,label=y) """ helper = LayerHelper("edit_distance", **locals()) @@ -4741,7 +4749,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False, ignore_index=-100, - numeric_stable_mode=False): + numeric_stable_mode=False, + return_softmax=False): """ **Softmax With Cross Entropy Operator.** @@ -4805,9 +4814,15 @@ def softmax_with_cross_entropy(logits, the algorithm is always numerically stable. Note that the speed may be slower when use stable algorithm. Default: False + return_softmax (bool): A flag indicating whether to return the softmax + along with the cross entropy loss. Default: False Returns: - Variable: The cross entropy loss is a 2-D tensor with shape [N x 1]. + Variable or Tuple of two Variables: Return the cross entropy loss if + `return_softmax` is False, otherwise the tuple + (loss, softmax), where the cross entropy loss is + a 2-D tensor with shape [N x 1], and softmax is a + 2-D tensor with shape [N x K]. Examples: .. 
code-block:: python @@ -4832,6 +4847,10 @@ def softmax_with_cross_entropy(logits, 'ignore_index': ignore_index, 'numeric_stable_mode': numeric_stable_mode }) + + if return_softmax: + return loss, softmax + return loss @@ -5526,42 +5545,48 @@ def label_smooth(label, return smooth_label -@templatedoc() -def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): - """ - ${comment} - - Args: - input (Variable): ${x_comment} - rois (Variable): ROIs (Regions of Interest) to pool over. - pooled_height (integer): ${pooled_height_comment} Default: 1 - pooled_width (integer): ${pooled_width_comment} Default: 1 - spatial_scale (float): ${spatial_scale_comment} Default: 1.0 - - Returns: - Variable: ${out_comment}. - - Examples: - .. code-block:: python - - pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) - """ - helper = LayerHelper('roi_pool', **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - argmaxes = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op( - type="roi_pool", - inputs={"X": input, - "ROIs": rois}, - outputs={"Out": pool_out, - "Argmax": argmaxes}, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale - }) - return pool_out +if os.name != 'nt': + + @templatedoc() + def roi_pool(input, + rois, + pooled_height=1, + pooled_width=1, + spatial_scale=1.0): + """ + ${comment} + + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + + Returns: + Variable: ${out_comment}. + + Examples: + .. 
code-block:: python + + pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) + """ + helper = LayerHelper('roi_pool', **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + argmaxes = helper.create_variable_for_type_inference(dtype='int32') + helper.append_op( + type="roi_pool", + inputs={"X": input, + "ROIs": rois}, + outputs={"Out": pool_out, + "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale + }) + return pool_out @templatedoc() @@ -6810,7 +6835,7 @@ def prelu(x, mode, param_attr=None, name=None): alpha_shape = x.shape dtype = helper.input_dtype(input_param_name='x') alpha = helper.create_parameter( - attr=param_attr, + attr=helper.param_attr, shape=alpha_shape, dtype='float32', is_bias=False, @@ -8289,3 +8314,72 @@ def add_position_encoding(input, alpha, beta, name=None): attrs={"alpha": alpha, "beta": beta}) return out + + +def bilinear_tensor_product(x, + y, + size, + act=None, + name=None, + param_attr=None, + bias_attr=None): + """ + **Add Bilinear Tensor Product Layer** + + This layer performs bilinear tensor product on two inputs. + For example: + + .. math:: + out{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1 + + In this formula: + - :math:`x`: the first input contains M elements, shape is [batch_size, M]. + - :math:`y`: the second input contains N elements, shape is [batch_size, N]. + - :math:`W_{i}`: the i-th learned weight, shape is [M, N] + - :math:`out{i}`: the i-th element of out, shape is [batch_size, size]. + - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`. + + Args: + x (Variable): 2-D input tensor with shape [batch_size, M] + y (Variable): 2-D input tensor with shape [batch_size, N] + size (int): The dimension of this layer. + act (str, default None): Activation to be applied to the output of this layer. + name (str, default None): The name of this layer. 
+ param_attr (ParamAttr, default None): The parameter attribute for the learnable w. + parameters/weights of this layer. + bias_attr (ParamAttr, default None): The parameter attribute for the bias + of this layer. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + + Returns: + Variable: A 2-D Tensor of shape [batch_size, size]. + + Examples: + .. code-block:: python + + tensor = bilinear_tensor_product(x=layer1, y=layer2, size=1000) + """ + helper = LayerHelper('bilinear_tensor_product', **locals()) + dtype = helper.input_dtype('x') + + param_shape = [size, x.shape[1], y.shape[1]] + + w = helper.create_parameter( + attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=False) + + if name is None: + out = helper.create_variable_for_type_inference(dtype=dtype) + else: + out = helper.create_variable(name=name, dtype=dtype, persistable=False) + + inputs = {"X": x, "Y": y, "Weight": w} + if helper.bias_attr: + bias_size = [1, size] + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + inputs["Bias"] = bias + helper.append_op( + type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out}) + + # add activation + return helper.append_activation(out) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 1ff40a26f2f24e2ff06719972489b0c1e5d140c3..66eb1229aa3ec7a956146f12da2889d59b88671a 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +import os from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr from .. 
import core from ..framework import convert_np_dtype_to_dtype_ @@ -99,27 +100,26 @@ Examples: >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3) """ -__all__ += ['cumsum'] - -_cum_sum_ = generate_layer_fn('cumsum') - - -def cumsum(x, axis=None, exclusive=None, reverse=None): - locals_var = locals().keys() - kwargs = dict() - for name in locals_var: - val = locals()[name] - if val is not None: - kwargs[name] = val - return _cum_sum_(**kwargs) - - -cumsum.__doc__ = _cum_sum_.__doc__ + """ -Examples: - - >>> data = fluid.layers.data(name="input", shape=[32, 784]) - >>> result = fluid.layers.cumsum(data, axis=0) -""" +if os.name != 'nt': + __all__ += ['cumsum'] + + _cum_sum_ = generate_layer_fn('cumsum') + + def cumsum(x, axis=None, exclusive=None, reverse=None): + locals_var = locals().keys() + kwargs = dict() + for name in locals_var: + val = locals()[name] + if val is not None: + kwargs[name] = val + return _cum_sum_(**kwargs) + + cumsum.__doc__ = _cum_sum_.__doc__ + """ + Examples: + + >>> data = fluid.layers.data(name="input", shape=[32, 784]) + >>> result = fluid.layers.cumsum(data, axis=0) + """ __all__ += ['thresholded_relu'] diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 57e5d197b618615b32a7f446df0a81e18c25b097..ff32c00104171bf42c00be33f05758a4387228e1 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -235,11 +235,11 @@ def tensor_array_to_tensor(input, axis=1, name=None): output, output_index = fluid.layers.tensor_array_to_tensor(input=tensor_array) """ - helper = LayerHelper('tensor_array_concat', **locals()) + helper = LayerHelper('tensor_array_to_tensor', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) out_index = helper.create_variable_for_type_inference(dtype="int32") helper.append_op( - type='tensor_array_concat', + type='tensor_array_to_tensor', inputs={'X': input}, outputs={'Out': [out], 'OutIndex': 
[out_index]}, diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 28dc7519571d8b5464e92fddf634ba46691ceaa9..982d29180141d052e25ea3dcba6e3e7ce4181c48 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -128,6 +128,24 @@ class TestPriorBox(unittest.TestCase): assert box.shape[3] == 4 +class TestDensityPriorBox(unittest.TestCase): + def test_density_prior_box(self): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + box, var = layers.density_prior_box( + input=conv1, + image=images, + densities=[3, 4], + fixed_sizes=[50., 60.], + fixed_ratios=[1.0], + clip=True) + assert len(box.shape) == 4 + assert box.shape == var.shape + assert box.shape[3] == 4 + + class TestAnchorGenerator(unittest.TestCase): def test_anchor_generator(self): data_shape = [3, 224, 224] diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py index edc60550058f53da456c21de4b41142b907743df..cf62817956c12cd4487eba88bf49ed43331dff03 100644 --- a/python/paddle/fluid/tests/unittests/dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/dist_save_load.py @@ -26,6 +26,7 @@ from multiprocessing import Process from functools import reduce import numpy as np +import pickle import unittest import six @@ -166,7 +167,10 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): io.save_persistables(startup_exe, model_dir, trainer_prog) var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor()) - print(np.ravel(var).tolist()) + if six.PY2: + print(pickle.dumps(np.ravel(var).tolist())) + else: + sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py 
b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py new file mode 100644 index 0000000000000000000000000000000000000000..79d1fd3d7171e06a88a75cf50b6a51ef4da51f07 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py @@ -0,0 +1,142 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +class TestDensityPriorBoxOp(OpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = {'Input': self.input, 'Image': self.image} + + self.attrs = { + 'variances': self.variances, + 'clip': self.clip, + 'step_w': self.step_w, + 'step_h': self.step_h, + 'offset': self.offset, + 'densities': self.densities, + 'fixed_sizes': self.fixed_sizes, + 'fixed_ratios': self.fixed_ratios + } + self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} + + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "density_prior_box" + self.set_data() + + def set_density(self): + self.densities = [] + self.fixed_sizes = [] + self.fixed_ratios = [] + + def init_test_params(self): + self.layer_w = 32 + self.layer_h = 32 + + self.image_w = 40 + self.image_h = 40 + + self.step_w = float(self.image_w) / float(self.layer_w) + self.step_h = float(self.image_h) / 
float(self.layer_h) + + self.input_channels = 2 + self.image_channels = 3 + self.batch_size = 10 + + self.variances = [0.1, 0.1, 0.2, 0.2] + self.variances = np.array(self.variances, dtype=np.float).flatten() + + self.set_density() + + self.clip = True + self.num_priors = 0 + if len(self.fixed_sizes) > 0 and len(self.densities) > 0: + for density in self.densities: + if len(self.fixed_ratios) > 0: + self.num_priors += len(self.fixed_ratios) * (pow(density, + 2)) + self.offset = 0.5 + + def init_test_input(self): + self.image = np.random.random( + (self.batch_size, self.image_channels, self.image_w, + self.image_h)).astype('float32') + + self.input = np.random.random( + (self.batch_size, self.input_channels, self.layer_w, + self.layer_h)).astype('float32') + + def init_test_output(self): + out_dim = (self.layer_h, self.layer_w, self.num_priors, 4) + out_boxes = np.zeros(out_dim).astype('float32') + out_var = np.zeros(out_dim).astype('float32') + + step_average = int((self.step_w + self.step_h) * 0.5) + for h in range(self.layer_h): + for w in range(self.layer_w): + idx = 0 + c_x = (w + self.offset) * self.step_w + c_y = (h + self.offset) * self.step_h + # Generate density prior boxes with fixed size + for density, fixed_size in zip(self.densities, + self.fixed_sizes): + if (len(self.fixed_ratios) > 0): + for ar in self.fixed_ratios: + shift = int(step_average / density) + box_width_ratio = fixed_size * math.sqrt(ar) + box_height_ratio = fixed_size / math.sqrt(ar) + for di in range(density): + for dj in range(density): + c_x_temp = c_x - step_average / 2.0 + shift / 2.0 + dj * shift + c_y_temp = c_y - step_average / 2.0 + shift / 2.0 + di * shift + out_boxes[h, w, idx, :] = [ + max((c_x_temp - box_width_ratio / 2.0) / + self.image_w, 0), + max((c_y_temp - box_height_ratio / 2.0) + / self.image_h, 0), + min((c_x_temp + box_width_ratio / 2.0) / + self.image_w, 1), + min((c_y_temp + box_height_ratio / 2.0) + / self.image_h, 1) + ] + idx += 1 + if self.clip: + out_boxes 
= np.clip(out_boxes, 0.0, 1.0) + out_var = np.tile(self.variances, + (self.layer_h, self.layer_w, self.num_priors, 1)) + self.out_boxes = out_boxes.astype('float32') + self.out_var = out_var.astype('float32') + + +class TestDensityPriorBox(TestDensityPriorBoxOp): + def set_density(self): + self.densities = [3, 4] + self.fixed_sizes = [1.0, 2.0] + self.fixed_ratios = [1.0] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 4b8a215190a90c974a9ecc8658d044c59b80c989..97e7ee6229f081ff67ca3e2aedcad0a2e3d9cabf 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -105,7 +105,7 @@ class TestDistRunnerBase(object): build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce if args.batch_merge_repeat > 1: - pass_builder = build_stra._create_passes_from_strategy() + pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass( len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") mypass.set_int("num_repeats", args.batch_merge_repeat) diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py index 03066fee48b703f8b55bd4ae6a9c4bb8deecab1e..ea2b554dac83988955e3a7e8919e57a4ed7a8215 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py @@ -65,14 +65,14 @@ class TestDistSaveLoadDense2x2(TestDistBase): shutil.rmtree(model_dir) - local_np = np.array(eval(local_var[0])) - train0_np = np.array(eval(tr0_var[0])) - train1_np = np.array(eval(tr1_var[0])) + local_np = np.array(local_var) + train0_np = np.array(tr0_var) + train1_np = np.array(tr1_var) + self.assertAlmostEqual(local_np.all(), train0_np.all(), delta=delta) self.assertAlmostEqual(local_np.all(), train1_np.all(), 
delta=delta) self.assertAlmostEqual(train0_np.all(), train1_np.all(), delta=delta) - @unittest.skip(reason="CI fail") def test_dist(self): need_envs = { "IS_DISTRIBUTED": '0', diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 692f2375f2119c3def1751d92087613cd965bf25..a8fa5436c43d2f05f632b920f67d43d837d28da9 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -369,6 +369,10 @@ class TestBook(unittest.TestCase): with program_guard(program): x = layers.data(name='x', shape=[16], dtype='float32') y = layers.data(name='label', shape=[1], dtype='int64') + loss, softmax = layers.softmax_with_cross_entropy( + x, y, return_softmax=True) + self.assertIsNotNone(loss) + self.assertIsNotNone(softmax) loss = layers.softmax_with_cross_entropy(x, y) self.assertIsNotNone(loss) print(str(program)) @@ -911,6 +915,16 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(data_1) print(str(program)) + def test_bilinear_tensor_product_layer(self): + program = Program() + with program_guard(program): + data = layers.data(name='data', shape=[4], dtype="float32") + + theta = layers.data(name="theta", shape=[5], dtype="float32") + out = layers.bilinear_tensor_product(data, theta, 6) + + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py index 11e5d8b536fb65b66c954991bf815241774702ec..c7f4f3e913bfd66cbbb703c0e73336f9a3563507 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py @@ -80,6 +80,33 @@ class TestLookupSpraseTable(OpTest): assert (result_array2[3] == w_array[6]).all() assert (result_array2[4] == w_array[7]).all() + # create and run lookup_table operator + test_lookup_table = 
Operator( + "lookup_sparse_table", + W='W', + Ids='Ids', + Out='Out', + min=-5.0, + max=10.0, + seed=10, + is_test=True) + + ids = scope.var("Ids").get_tensor() + unknown_id = [44, 22, 33] + ids_array2 = np.array([4, 2, 3, 7, 100000] + unknown_id).astype("int64") + ids.set(ids_array2, place) + test_lookup_table.run(scope, place) + + result_array2 = np.array(out_tensor) + assert (result_array2[0] == w_array[5]).all() + assert (result_array2[1] == w_array[1]).all() + assert (result_array2[2] == w_array[2]).all() + assert (result_array2[3] == w_array[6]).all() + assert (result_array2[4] == w_array[7]).all() + + for i in [5, 6, 7]: + assert np.all(result_array2[i] == 0) + def test_w_is_selected_rows(self): places = [core.CPUPlace()] # currently only support CPU diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index c93740669f40aee3a6c143d153cfd0f5bb72dbd9..18d95c94ad36316b7149eb5412260b40a57ac002 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -21,8 +21,8 @@ import six class TestBase(unittest.TestCase): def main(self, network_func, - iter=100, - iter_per_pe=100, + iter=10, + iter_per_pe=10, use_gpu=True, use_experimental_executor=False): if use_gpu and not fluid.core.is_compiled_with_cuda(): @@ -45,7 +45,7 @@ class TestBase(unittest.TestCase): exe_strategy._dry_run = True exe_strategy.use_experimental_executor = use_experimental_executor pe = fluid.ParallelExecutor( - use_cuda=True, + use_cuda=use_gpu, loss_name=loss.name, main_program=main_prog, exec_strategy=exe_strategy) diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py index 288c5f6a1f6b1760ca40c0c653e4c0726b799519..5a3ec8ff0180281babeaa006133b3ff9dc6d8083 100644 --- 
a/python/paddle/fluid/tests/unittests/test_pass_builder.py +++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py @@ -94,7 +94,12 @@ class TestPassBuilder(unittest.TestCase): def test_parallel_testing_with_new_strategy(self): build_strategy = fluid.BuildStrategy() - pass_builder = build_strategy._create_passes_from_strategy() + self.assertFalse(build_strategy.fuse_elewise_add_act_ops) + build_strategy.fuse_elewise_add_act_ops = True + pass_builder = build_strategy._finalize_strategy_and_create_passes() + self.assertTrue("fuse_elewise_add_act_pass" in + [p.type() for p in pass_builder.all_passes()]) + origin_len = len(pass_builder.all_passes()) viz_pass = pass_builder.append_pass("graph_viz_pass") diff --git a/python/setup.py.in b/python/setup.py.in index b1ff9f3a5c3d877edb6bc6a12efce053a44b4c9c..26f3c5aeaa17f60ccdac849e589c779a73138088 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -9,7 +9,7 @@ class BinaryDistribution(Distribution): RC = 0 - +ext_name = '.dll' if os.name == 'nt' else '.so' def git_commit(): try: @@ -136,10 +136,13 @@ if '${WITH_FLUID_ONLY}'== 'OFF': '${PADDLE_BINARY_DIR}/paddle/legacy/pserver/paddle_pserver_main', '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] -package_data={'paddle.fluid': ['core.so']} +package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]} +if os.name == 'nt': + package_data['paddle.fluid'] += ['openblas' + ext_name] + if '${WITH_FLUID_ONLY}'== 'OFF': - package_data['paddle.v2.master']=['libpaddle_master.so'] - package_data['py_paddle']=['*.py','_swig_paddle.so'] + package_data['paddle.v2.master']=['libpaddle_master' + ext_name] + package_data['py_paddle']=['*.py','_swig_paddle' + + ext_name] package_dir={ '': '${PADDLE_BINARY_DIR}/python', @@ -153,13 +156,15 @@ if '${WITH_FLUID_ONLY}'== 'OFF': package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle' # put all thirdparty libraries in paddle.libs -package_data['paddle.libs']=['libwarpctc.so'] 
libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' -shutil.copy('${WARPCTC_LIBRARIES}', libs_path) +if os.name != 'nt': + package_data['paddle.libs']= [] + package_data['paddle.libs']=['libwarpctc' + ext_name] + shutil.copy('${WARPCTC_LIBRARIES}', libs_path) if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path) - package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so'] + package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name] if '${CMAKE_BUILD_TYPE}' == 'Release': # only change rpath in Release mode. if '${WITH_MKLDNN}' == 'ON': @@ -174,37 +179,60 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': raise Exception("patch libmkldnn.so failed, command: %s" % command) package_data['paddle.libs']+=['libmkldnn.so.0'] shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) +if '${WITH_NGRAPH}' == 'ON': + if '${CMAKE_BUILD_TYPE}' == 'Release': + # only change rpath in Release mode. + command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command) + shutil.copy('${NGRAPH_SHARED_LIB}', libs_path) + shutil.copy('${NGRAPH_CPU_LIB}', libs_path) + shutil.copy('${NGRAPH_TBB_LIB}', libs_path) + package_data['paddle.libs']+=['${NGRAPH_SHARED_LIB_NAME}', + '${NGRAPH_CPU_LIB_NAME}', + '${NGRAPH_TBB_LIB_NAME}'] # remove unused paddle/libs/__init__.py -os.remove(libs_path+'/__init__.py') +if os.path.isfile(libs_path+'/__init__.py'): + os.remove(libs_path+'/__init__.py') package_dir['paddle.libs']=libs_path -# change rpath of core.so, add $ORIGIN/../libs/ to it. -# The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and -# core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. +# change rpath of core.ext, add $ORIGIN/../libs/ to it. 
+# The reason is that libwarpctc.ext, libiomp5.ext etc are in paddle.libs, and +# core.ext is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213 if '${CMAKE_BUILD_TYPE}' == 'Release': - # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed. - if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" - else: - command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" - if os.system(command) != 0: - raise Exception("patch core.so failed, command: %s" % command) - if '${WITH_FLUID_ONLY}'== 'OFF': - # change rpath of _swig_paddle.so. + if os.name != 'nt': + # only change rpath in Release mode, since in Debug mode, core.xx is too large to be changed. if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name else: - command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name if os.system(command) != 0: - raise Exception("patch _swig_paddle.so failed, command: %s" % command) + raise Exception("patch core.%s failed, command: %s" % (ext_name, command)) + if '${WITH_FLUID_ONLY}'== 'OFF': + # change rpath of _swig_paddle.xx. 
+ if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name + else: + command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name + if os.system(command) != 0: + raise Exception("patch _swig_paddle.%s failed, command: %s" % (ext_name, command)) + +ext_modules = [Extension('_foo', ['stub.cc'])] +if os.name == 'nt': + # fix the path separator under windows + fix_package_dir = {} + for k, v in package_dir.items(): + fix_package_dir[k] = v.replace('/', '\\') + package_dir = fix_package_dir + ext_modules = [] setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', description='Parallel Distributed Deep Learning', install_requires=setup_requires, packages=packages, - ext_modules=[Extension('_foo', ['stub.cc'])], + ext_modules=ext_modules, package_data=package_data, package_dir=package_dir, scripts=paddle_bins