提交 9dead9a2 编写于 作者: Z Zhang, Guoming

Merge branch 'develop' into prv-calibration

...@@ -4,6 +4,7 @@ paddle/operators/tensor.save ...@@ -4,6 +4,7 @@ paddle/operators/tensor.save
python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/ python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/ python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/ python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
paddle/fluid/operators/distributed/send_recv.proto
*.DS_Store *.DS_Store
*.vs *.vs
build/ build/
...@@ -28,4 +29,5 @@ third_party/ ...@@ -28,4 +29,5 @@ third_party/
build_* build_*
# clion workspace. # clion workspace.
cmake-build-* cmake-build-*
paddle/fluid/operators/distributed/send_recv.proto
model_test model_test
...@@ -43,6 +43,7 @@ ...@@ -43,6 +43,7 @@
| qingqing01 | Qing-Qing Dang | | qingqing01 | Qing-Qing Dang |
| reyoung | Yang Yu | | reyoung | Yang Yu |
| Superjom | Chun-Wei Yan | | Superjom | Chun-Wei Yan |
| tensor-tang | Jian Tang |
| tianbingsz | Tian-Bing Xu | | tianbingsz | Tian-Bing Xu |
| tpatejko | Tomasz Patejko | | tpatejko | Tomasz Patejko |
| typhoonzero | Yi Wu | | typhoonzero | Yi Wu |
......
...@@ -26,6 +26,11 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " ...@@ -26,6 +26,11 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
if(WIN32) if(WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX lib) set(CMAKE_STATIC_LIBRARY_PREFIX lib)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
endif(WIN32) endif(WIN32)
if(NOT CMAKE_CROSSCOMPILING) if(NOT CMAKE_CROSSCOMPILING)
...@@ -41,6 +46,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F ...@@ -41,6 +46,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF) option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
...@@ -65,6 +71,8 @@ option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) ...@@ -65,6 +71,8 @@ option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(ON_INFER "Turn on inference optimization." OFF) option(ON_INFER "Turn on inference optimization." OFF)
...@@ -103,6 +111,8 @@ if(ANDROID OR IOS) ...@@ -103,6 +111,8 @@ if(ANDROID OR IOS)
"Disable RDMA when cross-compiling for Android and iOS" FORCE) "Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKL OFF CACHE STRING set(WITH_MKL OFF CACHE STRING
"Disable MKL when cross-compiling for Android and iOS" FORCE) "Disable MKL when cross-compiling for Android and iOS" FORCE)
set(WITH_NGRAPH OFF CACHE STRING
"Disable nGraph when cross-compiling for Android and iOS" FORCE)
set(WITH_GOLANG OFF CACHE STRING set(WITH_GOLANG OFF CACHE STRING
"Disable golang when cross-compiling for Android and iOS" FORCE) "Disable golang when cross-compiling for Android and iOS" FORCE)
...@@ -171,6 +181,7 @@ include(external/protobuf) # download, build, install protobuf ...@@ -171,6 +181,7 @@ include(external/protobuf) # download, build, install protobuf
include(external/python) # download, build, install python include(external/python) # download, build, install python
include(external/openblas) # download, build, install openblas include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn include(external/mkldnn) # download, build, install mkldnn
include(external/ngraph) # download, build, install nGraph
include(external/swig) # download, build, install swig include(external/swig) # download, build, install swig
include(external/boost) # download boost include(external/boost) # download boost
include(external/any) # download libn::any include(external/any) # download libn::any
...@@ -304,7 +315,6 @@ endif() ...@@ -304,7 +315,6 @@ endif()
if (ON_INFER) if (ON_INFER)
message(STATUS "On inference mode, will take place some specific optimization.") message(STATUS "On inference mode, will take place some specific optimization.")
add_definitions(-DPADDLE_ON_INFERENCE)
else() else()
#TODO(luotao), combine this warning with `make inference_lib_dist` command. #TODO(luotao), combine this warning with `make inference_lib_dist` command.
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
......
...@@ -218,3 +218,7 @@ endif(WITH_GRPC) ...@@ -218,3 +218,7 @@ endif(WITH_GRPC)
if(WITH_BRPC_RDMA) if(WITH_BRPC_RDMA)
add_definitions(-DPADDLE_WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA)
endif(WITH_BRPC_RDMA) endif(WITH_BRPC_RDMA)
if(ON_INFER)
add_definitions(-DPADDLE_ON_INFERENCE)
endif(ON_INFER)
...@@ -157,6 +157,9 @@ list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) ...@@ -157,6 +157,9 @@ list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO) if(NOT WITH_DSO)
# TODO(panyx0718): CUPTI only allows DSO? # TODO(panyx0718): CUPTI only allows DSO?
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
if(WIN32)
set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
endif(WIN32)
endif(NOT WITH_DSO) endif(NOT WITH_DSO)
# setting nvcc arch flags # setting nvcc arch flags
...@@ -196,10 +199,12 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") ...@@ -196,10 +199,12 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif() endif()
else(NOT WIN32) else(NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Release") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS "-g -G")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else() else()
message(FATAL "Windows only support Release build now. Please set visual studio build type to Release, x64 build.") message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
endif() endif()
endif(NOT WIN32) endif(NOT WIN32)
......
...@@ -2,7 +2,12 @@ if(NOT WITH_GPU) ...@@ -2,7 +2,12 @@ if(NOT WITH_GPU)
return() return()
endif() endif()
set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") if(WIN32)
set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
else(WIN32)
set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
endif(WIN32)
find_path(CUDNN_INCLUDE_DIR cudnn.h find_path(CUDNN_INCLUDE_DIR cudnn.h
PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
$ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
......
...@@ -58,19 +58,21 @@ ExternalProject_Add( ...@@ -58,19 +58,21 @@ ExternalProject_Add(
-DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
-DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER} -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
-DBUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN}
-DBUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM}
${EXTERNAL_OPTIONAL_ARGS} ${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
) )
message(STATUS "Anakin for inference is enabled") message(STATUS "Anakin for inference is enabled")
message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
add_dependencies(extern_anakin protobuf mklml)
add_library(anakin_shared SHARED IMPORTED GLOBAL) add_library(anakin_shared SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB}) set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB})
add_dependencies(anakin_shared extern_anakin protobuf mklml) add_dependencies(anakin_shared extern_anakin)
add_library(anakin_saber SHARED IMPORTED GLOBAL) add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB}) set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin protobuf mklml) add_dependencies(anakin_saber extern_anakin)
list(APPEND external_project_dependencies anakin_shared anakin_saber) list(APPEND external_project_dependencies anakin_shared anakin_saber)
...@@ -28,34 +28,28 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL)) ...@@ -28,34 +28,28 @@ if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
endif() endif()
IF (WIN32)
MESSAGE(WARNING, "In windows, boost can not be downloaded automaticlly, please build it manually and put it at " ${THIRD_PARTY_PATH}install/boost) MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
else()
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
ENDIF(WIN32)
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE)
set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
include_directories(${BOOST_INCLUDE_DIR}) include_directories(${BOOST_INCLUDE_DIR})
if (NOT WIN32)
ExternalProject_Add( ExternalProject_Add(
${BOOST_PROJECT} ${BOOST_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz URL ${BOOST_URL}
&& tar zxf ${BOOST_TAR}.tar.gz
DOWNLOAD_NO_PROGRESS 1 DOWNLOAD_NO_PROGRESS 1
PREFIX ${BOOST_SOURCES_DIR} PREFIX ${BOOST_SOURCES_DIR}
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
INSTALL_COMMAND "" INSTALL_COMMAND ""
UPDATE_COMMAND "" UPDATE_COMMAND ""
) )
endif(NOT WIN32)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
......
...@@ -35,7 +35,12 @@ ExternalProject_Add( ...@@ -35,7 +35,12 @@ ExternalProject_Add(
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DBUILD_STATIC_LIBS=ON
-DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF -DBUILD_TESTING=OFF
...@@ -48,8 +53,8 @@ ExternalProject_Add( ...@@ -48,8 +53,8 @@ ExternalProject_Add(
IF(WIN32) IF(WIN32)
IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib") IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib")
add_custom_command(TARGET extern_gflags POST_BUILD add_custom_command(TARGET extern_gflags POST_BUILD
COMMAND cmake -E rename ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib COMMAND cmake -E copy ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib
) )
ENDIF() ENDIF()
ENDIF(WIN32) ENDIF(WIN32)
ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
......
...@@ -46,7 +46,11 @@ ExternalProject_Add( ...@@ -46,7 +46,11 @@ ExternalProject_Add(
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
...@@ -63,7 +67,7 @@ ExternalProject_Add( ...@@ -63,7 +67,7 @@ ExternalProject_Add(
IF(WIN32) IF(WIN32)
IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib") IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib")
add_custom_command(TARGET extern_glog POST_BUILD add_custom_command(TARGET extern_glog POST_BUILD
COMMAND cmake -E rename ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib COMMAND cmake -E copy ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib
) )
ENDIF() ENDIF()
ENDIF(WIN32) ENDIF(WIN32)
......
...@@ -37,7 +37,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) ...@@ -37,7 +37,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers. INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h
IF(${CBLAS_PROVIDER} STREQUAL "MKLML") IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
add_library(ngraph INTERFACE)
IF(WIN32 OR APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with nGraph in Paddle yet."
"Force WITH_NGRAPH=OFF")
SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph in Windows and MacOS" FORCE)
ENDIF()
IF(${WITH_NGRAPH} AND NOT ${WITH_MKLDNN})
MESSAGE(WARNING
"nGraph needs mkl-dnn to be enabled."
"Force WITH_NGRAPH=OFF")
SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph if mkl-dnn is disabled" FORCE)
ENDIF()
IF(NOT ${WITH_NGRAPH})
return()
ENDIF()
INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph")
SET(NGRAPH_VERSION "0.9")
SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so)
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git")
ExternalProject_Add(
${NGRAPH_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT}
GIT_REPOSITORY ${NGRAPH_GIT_REPO}
GIT_TAG ${NGRAPH_GIT_TAG}
PREFIX ${NGRAPH_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR}
CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
)
if(UNIX AND NOT APPLE)
include(GNUInstallDirs)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
else()
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib)
endif()
MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
# Workaround for nGraph expecting mklml to be in mkldnn install directory.
ExternalProject_Add_Step(
${NGRAPH_PROJECT}
PrepareMKL
COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so
COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so
DEPENDEES download
DEPENDERS configure
)
add_dependencies(ngraph ${NGRAPH_PROJECT})
target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH)
target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR})
target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB})
LIST(APPEND external_project_dependencies ngraph)
...@@ -17,12 +17,8 @@ IF(USE_EIGEN_FOR_BLAS) ...@@ -17,12 +17,8 @@ IF(USE_EIGEN_FOR_BLAS)
ENDIF(USE_EIGEN_FOR_BLAS) ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas) INCLUDE(cblas)
# IF(WIN32 AND NOT ${CBLAS_FOUND})
IF(NOT ${CBLAS_FOUND}) IF(NOT ${CBLAS_FOUND})
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas) SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
...@@ -34,6 +30,7 @@ IF(NOT ${CBLAS_FOUND}) ...@@ -34,6 +30,7 @@ IF(NOT ${CBLAS_FOUND})
CACHE FILEPATH "openblas library." FORCE) CACHE FILEPATH "openblas library." FORCE)
ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS) ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
IF (WIN32) IF (WIN32)
SET(CBLAS_FOUND true) SET(CBLAS_FOUND true)
MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR}) MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR})
......
...@@ -30,66 +30,61 @@ UNSET_VAR(PROTOBUF_LITE_LIBRARY) ...@@ -30,66 +30,61 @@ UNSET_VAR(PROTOBUF_LITE_LIBRARY)
UNSET_VAR(PROTOBUF_LIBRARY) UNSET_VAR(PROTOBUF_LIBRARY)
UNSET_VAR(PROTOBUF_INCLUDE_DIR) UNSET_VAR(PROTOBUF_INCLUDE_DIR)
UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
function(protobuf_generate_python SRCS)
# shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
if(NOT ARGN)
message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
return()
endif()
if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
function(protobuf_generate_python SRCS) # Create an include path for each file specified
# shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
if(NOT ARGN)
message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
return()
endif()
if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
# Create an include path for each file specified
foreach(FIL ${ARGN})
get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
get_filename_component(ABS_PATH ${ABS_FIL} PATH)
list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
if(${_contains_already} EQUAL -1)
list(APPEND _protobuf_include_path -I ${ABS_PATH})
endif()
endforeach()
else()
set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
endif()
if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
endif()
if(DEFINED Protobuf_IMPORT_DIRS)
foreach(DIR ${Protobuf_IMPORT_DIRS})
get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
if(${_contains_already} EQUAL -1)
list(APPEND _protobuf_include_path -I ${ABS_PATH})
endif()
endforeach()
endif()
set(${SRCS})
foreach(FIL ${ARGN}) foreach(FIL ${ARGN})
get_filename_component(ABS_FIL ${FIL} ABSOLUTE) get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
get_filename_component(FIL_WE ${FIL} NAME_WE) get_filename_component(ABS_PATH ${ABS_FIL} PATH)
if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
get_filename_component(FIL_DIR ${FIL} DIRECTORY) if(${_contains_already} EQUAL -1)
if(FIL_DIR) list(APPEND _protobuf_include_path -I ${ABS_PATH})
set(FIL_WE "${FIL_DIR}/${FIL_WE}")
endif()
endif() endif()
endforeach()
else()
set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
endif()
if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
endif()
list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") if(DEFINED Protobuf_IMPORT_DIRS)
add_custom_command( foreach(DIR ${Protobuf_IMPORT_DIRS})
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} if(${_contains_already} EQUAL -1)
COMMENT "Running Python protocol buffer compiler on ${FIL}" list(APPEND _protobuf_include_path -I ${ABS_PATH})
VERBATIM ) endif()
endforeach() endforeach()
endif()
set(${SRCS} ${${SRCS}} PARENT_SCOPE) set(${SRCS})
endfunction() foreach(FIL ${ARGN})
endif() get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
get_filename_component(FIL_WE ${FIL} NAME_WE)
if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
get_filename_component(FIL_DIR ${FIL} DIRECTORY)
if(FIL_DIR)
set(FIL_WE "${FIL_DIR}/${FIL_WE}")
endif()
endif()
list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE}
COMMENT "Running Python protocol buffer compiler on ${FIL}"
VERBATIM )
endforeach()
set(${SRCS} ${${SRCS}} PARENT_SCOPE)
endfunction()
# Print and set the protobuf library information, # Print and set the protobuf library information,
# finish this cmake process and exit from this file. # finish this cmake process and exit from this file.
...@@ -126,6 +121,7 @@ macro(PROMPT_PROTOBUF_LIB) ...@@ -126,6 +121,7 @@ macro(PROMPT_PROTOBUF_LIB)
# FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
# make `protobuf_generate_cpp` happy. # make `protobuf_generate_cpp` happy.
SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
FOREACH(dep ${protobuf_DEPS}) FOREACH(dep ${protobuf_DEPS})
ADD_DEPENDENCIES(protobuf ${dep}) ADD_DEPENDENCIES(protobuf ${dep})
ADD_DEPENDENCIES(protobuf_lite ${dep}) ADD_DEPENDENCIES(protobuf_lite ${dep})
...@@ -144,7 +140,6 @@ endmacro() ...@@ -144,7 +140,6 @@ endmacro()
set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
IF (WIN32) IF (WIN32)
SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf) SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
MESSAGE(WARNING, "In windows, protobuf only support msvc build, please build it manually and put it at " ${PROTOBUF_ROOT})
ENDIF(WIN32) ENDIF(WIN32)
if (NOT "${PROTOBUF_ROOT}" STREQUAL "") if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
...@@ -192,13 +187,20 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ...@@ -192,13 +187,20 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
SET(OPTIONAL_ARGS SET(OPTIONAL_ARGS
"-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
"-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
"-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
"-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}"
"-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}"
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
"-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}"
"-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}"
"-Dprotobuf_WITH_ZLIB=ON" "-Dprotobuf_WITH_ZLIB=ON"
"-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}"
${EXTERNAL_OPTIONAL_ARGS}) ${EXTERNAL_OPTIONAL_ARGS})
SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
ENDIF() ENDIF()
IF(WIN32)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
ENDIF()
SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
......
...@@ -21,6 +21,48 @@ INCLUDE(python_module) ...@@ -21,6 +21,48 @@ INCLUDE(python_module)
FIND_PACKAGE(PythonInterp ${PY_VERSION}) FIND_PACKAGE(PythonInterp ${PY_VERSION})
FIND_PACKAGE(PythonLibs ${PY_VERSION}) FIND_PACKAGE(PythonLibs ${PY_VERSION})
if(WIN32)
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"from distutils import sysconfig as s;import sys;import struct;
print(sys.prefix);
print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE _PYTHON_VALUES
ERROR_VARIABLE _PYTHON_ERROR_VALUE)
if(NOT _PYTHON_SUCCESS MATCHES 0)
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# Convert the process output into a list
string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
list(GET _PYTHON_VALUES 0 PYTHON_PREFIX)
list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX)
# Make sure all directory separators are '/'
string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX})
set(PYTHON_LIBRARY
"${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
# when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the
# original python installation. They may be found relative to PYTHON_INCLUDE_DIR.
if(NOT EXISTS "${PYTHON_LIBRARY}")
get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
set(PYTHON_LIBRARY
"${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
endif()
# raise an error if the python libs are still not found.
if(NOT EXISTS "${PYTHON_LIBRARY}")
message(FATAL_ERROR "Python libraries not found")
endif()
SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
endif(WIN32)
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY(python SHARED IMPORTED GLOBAL) ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
......
...@@ -14,23 +14,52 @@ ELSE() ...@@ -14,23 +14,52 @@ ELSE()
ENDIF(APPLE) ENDIF(APPLE)
ENDIF() ENDIF()
ExternalProject_Add( if(WIN32)
extern_xxhash ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS} extern_xxhash
GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_TAG "v0.6.5" GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
PREFIX ${XXHASH_SOURCE_DIR} GIT_TAG "v0.6.5"
DOWNLOAD_NAME "xxhash" PREFIX ${XXHASH_SOURCE_DIR}
UPDATE_COMMAND "" DOWNLOAD_NAME "xxhash"
CONFIGURE_COMMAND "" UPDATE_COMMAND ""
BUILD_IN_SOURCE 1 BUILD_IN_SOURCE 1
PATCH_COMMAND PATCH_COMMAND
BUILD_COMMAND ${BUILD_CMD} CONFIGURE_COMMAND
INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/src/extern_xxhash/cmake_unofficial
TEST_COMMAND "" -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR}
) -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DBUILD_XXHSUM=OFF
-DCMAKE_GENERATOR_PLATFORM=x64
-DBUILD_SHARED_LIBS=OFF
${OPTIONAL_CACHE_ARGS}
TEST_COMMAND ""
)
else()
ExternalProject_Add(
extern_xxhash
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
GIT_TAG "v0.6.5"
PREFIX ${XXHASH_SOURCE_DIR}
DOWNLOAD_NAME "xxhash"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
PATCH_COMMAND
BUILD_COMMAND ${BUILD_CMD}
INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install
TEST_COMMAND ""
)
endif()
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") if (WIN32)
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib")
else()
set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
endif ()
INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
add_library(xxhash STATIC IMPORTED GLOBAL) add_library(xxhash STATIC IMPORTED GLOBAL)
......
...@@ -266,7 +266,11 @@ function(cc_library TARGET_NAME) ...@@ -266,7 +266,11 @@ function(cc_library TARGET_NAME)
if("${cc_library_DEPS};" MATCHES "python;") if("${cc_library_DEPS};" MATCHES "python;")
list(REMOVE_ITEM cc_library_DEPS python) list(REMOVE_ITEM cc_library_DEPS python)
add_dependencies(${TARGET_NAME} python) add_dependencies(${TARGET_NAME} python)
target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") if(WIN32)
target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
else()
target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup")
endif(WIN32)
endif() endif()
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
...@@ -288,6 +292,45 @@ function(cc_library TARGET_NAME) ...@@ -288,6 +292,45 @@ function(cc_library TARGET_NAME)
endif(cc_library_SRCS) endif(cc_library_SRCS)
endfunction(cc_library) endfunction(cc_library)
# The link operation under windows may exceeds the maximum characters limit, simply break the link command
# into multiple link opeartion can fix that, say
# original:
# lib /out:target.lib a.lib b.lib c.lib d.lib
# after:
# 1. lib /out:dummy_lib_1.lib a.lib b.lib
# 2. lib /out:dummy_lib_2.lib c.lib d.lib
# 1. lib /out:target.lib dummy_lib_1.lib dummy_lib_2.lib
function(sep_library TARGET_NAME)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(dummy_index 1)
set(dummy_offset 1)
# the dummy target would be consisted of limit size libraries
set(dummy_limit 50)
list(LENGTH sep_library_DEPS sep_all_len)
foreach(v ${sep_library_DEPS})
list(APPEND dummy_list ${v})
list(LENGTH dummy_list listlen )
if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len}))
message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}")
cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} STATIC DEPS ${dummy_list})
foreach(i ${dummy_list})
list(REMOVE_AT dummy_list 0)
endforeach()
list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_lib_${dummy_index})
MATH(EXPR dummy_index "${dummy_index}+1")
endif()
MATH(EXPR dummy_offset "${dummy_offset}+1")
endforeach()
if(${sep_library_SHARED})
cc_library(${TARGET_NAME} SHARED SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
else(${sep_library_SHARED})
cc_library(${TARGET_NAME} STATIC SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
endif(${sep_library_SHARED})
endfunction(sep_library)
function(cc_binary TARGET_NAME) function(cc_binary TARGET_NAME)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
......
...@@ -22,175 +22,196 @@ function(copy TARGET) ...@@ -22,175 +22,196 @@ function(copy TARGET)
list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) if (NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers") message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
endif() endif ()
math(EXPR len "${copy_lib_SRCS_len} - 1") math(EXPR len "${copy_lib_SRCS_len} - 1")
add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS}) add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
foreach(index RANGE ${len}) foreach (index RANGE ${len})
list(GET copy_lib_SRCS ${index} src) list(GET copy_lib_SRCS ${index} src)
list(GET copy_lib_DSTS ${index} dst) list(GET copy_lib_DSTS ${index} dst)
add_custom_command(TARGET ${TARGET} PRE_BUILD if (WIN32)
COMMAND mkdir -p "${dst}" # windows cmd shell will not expand wildcard automatically.
COMMAND cp -r "${src}" "${dst}" # below expand the files,libs and copy them by rules.
COMMENT "copying ${src} -> ${dst}") file(GLOB header_files ${src} "*.h")
endforeach() file(GLOB static_lib_files ${src} "*.lib")
file(GLOB dll_lib_files ${src} "*.dll")
set(src_files ${header_files} ${static_lib_files} ${dll_lib_files})
if (NOT "${src_files}" STREQUAL "")
list(REMOVE_DUPLICATES src_files)
endif ()
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
)
foreach (src_file ${src_files})
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
COMMENT "copying ${src_file} -> ${dst}")
endforeach ()
else (WIN32) # not windows
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND mkdir -p "${dst}"
COMMAND cp -r "${src}" "${dst}"
COMMENT "copying ${src} -> ${dst}")
endif (WIN32) # not windows
endforeach ()
endfunction() endfunction()
# third party # third party
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
copy(eigen3_lib copy(eigen3_lib
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
DEPS eigen3 DEPS eigen3
) )
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags")
copy(gflags_lib copy(gflags_lib
SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS gflags DEPS gflags
) )
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog")
copy(glog_lib copy(glog_lib
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS glog DEPS glog
) )
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/")
copy(boost_lib copy(boost_lib
SRCS ${BOOST_INCLUDE_DIR}/boost SRCS ${BOOST_INCLUDE_DIR}/boost
DSTS ${dst_dir} DSTS ${dst_dir}
DEPS boost DEPS boost
) )
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
copy(xxhash_lib copy(xxhash_lib
SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS xxhash DEPS xxhash
) )
if(NOT PROTOBUF_FOUND) if (NOT PROTOBUF_FOUND)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
copy(protobuf_lib copy(protobuf_lib
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS extern_protobuf DEPS extern_protobuf
) )
endif() endif ()
if(NOT CBLAS_FOUND) if (NOT CBLAS_FOUND)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
copy(openblas_lib copy(openblas_lib
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir} DSTS ${dst_dir} ${dst_dir}
DEPS extern_openblas DEPS extern_openblas
) )
elseif (WITH_MKLML) elseif (WITH_MKLML)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
copy(mklml_lib copy(mklml_lib
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
DEPS mklml DEPS mklml
) )
endif() endif ()
if(WITH_MKLDNN) if (WITH_MKLDNN)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
copy(mkldnn_lib copy(mkldnn_lib
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS mkldnn DEPS mkldnn
) )
endif() endif ()
if (NOT WIN32) if (NOT WIN32)
if(NOT MOBILE_INFERENCE AND NOT RPI) if (NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
copy(snappy_lib copy(snappy_lib
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappy) DEPS snappy)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
copy(snappystream_lib copy(snappystream_lib
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappystream) DEPS snappystream)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
DEPS zlib) DEPS zlib)
endif() endif ()
endif(NOT WIN32) endif (NOT WIN32)
# paddle fluid module # paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework") set(module "framework")
if (NOT WIN32) if (NOT WIN32)
set(framework_lib_deps framework_py_proto) set(framework_lib_deps framework_py_proto)
endif(NOT WIN32) endif (NOT WIN32)
copy(framework_lib DEPS ${framework_lib_deps} copy(framework_lib DEPS ${framework_lib_deps}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
${src_dir}/${module}/ir/*.h ${src_dir}/${module}/ir/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
) )
set(module "memory") set(module "memory")
copy(memory_lib copy(memory_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
) )
set(inference_deps paddle_fluid_shared paddle_fluid) set(inference_deps paddle_fluid_shared paddle_fluid)
set(module "inference/api") set(module "inference/api")
if (WITH_ANAKIN AND WITH_MKL) if (WITH_ANAKIN AND WITH_MKL)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${ANAKIN_INSTALL_DIR} # anakin release ${ANAKIN_INSTALL_DIR} # anakin release
DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin) DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
list(APPEND inference_deps anakin_inference_lib) list(APPEND inference_deps anakin_inference_lib)
endif() endif ()
set(module "inference") set(module "inference")
copy(inference_lib DEPS ${inference_deps} copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/paddle_*.h
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
) )
set(module "platform") set(module "platform")
copy(platform_lib DEPS profiler_py_proto copy(platform_lib DEPS profiler_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
) )
set(module "string") set(module "string")
copy(string_lib copy(string_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
) )
set(module "pybind") set(module "pybind")
copy(pybind_lib copy(pybind_lib
SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
DSTS ${dst_dir}/${module} DSTS ${dst_dir}/${module}
) )
# CMakeCache Info # CMakeCache Info
copy(cmake_cache copy(cmake_cache
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INSTALL_DIR}) DSTS ${FLUID_INSTALL_DIR})
# This command generates a complete fluid library for both train and inference # This command generates a complete fluid library for both train and inference
add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
...@@ -198,14 +219,14 @@ add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) ...@@ -198,14 +219,14 @@ add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
# Following commands generate a inference-only fluid library # Following commands generate a inference-only fluid library
# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR} # third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
copy(third_party DEPS fluid_lib_dist copy(third_party DEPS fluid_lib_dist
SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR} DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
) )
# only need libpaddle_fluid.so/a and paddle_inference_api.h for inference-only library # only need libpaddle_fluid.so/a and paddle_*.h for inference-only library
copy(inference_api_lib DEPS fluid_lib_dist copy(inference_api_lib DEPS fluid_lib_dist
SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.* SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_inference_api.h ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
) )
...@@ -213,20 +234,20 @@ add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib) ...@@ -213,20 +234,20 @@ add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)
# paddle fluid version # paddle fluid version
function(version version_file) function(version version_file)
execute_process( execute_process(
COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_GIT_COMMIT) OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
file(WRITE ${version_file} file(WRITE ${version_file}
"GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
"WITH_MKL: ${WITH_MKL}\n" "WITH_MKL: ${WITH_MKL}\n"
"WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_MKLDNN: ${WITH_MKLDNN}\n"
"WITH_GPU: ${WITH_GPU}\n") "WITH_GPU: ${WITH_GPU}\n")
if(WITH_GPU) if (WITH_GPU)
file(APPEND ${version_file} file(APPEND ${version_file}
"CUDA version: ${CUDA_VERSION}\n" "CUDA version: ${CUDA_VERSION}\n"
"CUDNN version: v${CUDNN_MAJOR_VERSION}\n") "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
endif() endif ()
endfunction() endfunction()
version(${FLUID_INSTALL_DIR}/version.txt) version(${FLUID_INSTALL_DIR}/version.txt)
version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt) version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
set(PART_CUDA_KERNEL_FILES)
function(op_library TARGET)
# op_library is a function to create op library. The interface is same as
# cc_library. But it handle split GPU/CPU code and link some common library
# for ops.
set(cc_srcs)
set(cu_srcs)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(pybind_flag 0)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
list(LENGTH op_library_SRCS op_library_SRCS_len)
if (${op_library_SRCS_len} EQUAL 0)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND cc_srcs ${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
endif()
string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if(WITH_AMD_GPU)
string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
endif()
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.hip.cu$")
list(APPEND hip_cu_srcs ${src})
elseif (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
list(APPEND miopen_hip_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
endif()
endforeach()
endif()
list(LENGTH cc_srcs cc_srcs_len)
if (${cc_srcs_len} EQUAL 0)
message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
endif()
if (WIN32)
# remove windows unsupported op, because windows has no nccl, no warpctc such ops.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return()
endif()
endforeach()
endif(WIN32)
set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs")
list(LENGTH op_library_DEPS op_library_DEPS_len)
if (${op_library_DEPS_len} GREATER 0)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
endforeach()
# The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to just adding one operator to pybind in a *_op.cc file.
# And for detail pybind information, please see generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
endif()
# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
string(REPLACE "_op" "" TARGET "${TARGET}")
if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
endif()
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
# NOTE(*): activation use macro to regist the kernels, set use_op manually.
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "fake_quantize")
file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
endif()
endfunction()
function(register_operators)
set(options "")
set(oneValueArgs "")
set(multiValueArgs EXCLUDES DEPS)
cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)
foreach(src ${OPS})
list(FIND register_operators_EXCLUDES ${src} _index)
if (${_index} EQUAL -1)
if (${register_operators_DEPS_len} GREATER 0)
op_library(${src} DEPS ${register_operators_DEPS})
else()
op_library(${src})
endif()
endif()
endforeach()
endfunction()
...@@ -34,4 +34,5 @@ if(TENSORRT_FOUND) ...@@ -34,4 +34,5 @@ if(TENSORRT_FOUND)
"Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
include_directories(${TENSORRT_INCLUDE_DIR}) include_directories(${TENSORRT_INCLUDE_DIR})
list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY}) list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY})
add_definitions(-DPADDLE_WITH_TENSORRT)
endif() endif()
../../../CONTRIBUTING.md ../../../CONTRIBUTING.md
\ No newline at end of file
...@@ -93,17 +93,17 @@ paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', ...@@ -93,17 +93,17 @@ paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized',
paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)) paddle.fluid.layers.l2_normalize ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None))
paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)) paddle.fluid.layers.matmul ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None))
paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.topk ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times'], varargs=None, keywords=None, defaults=(0, False)) paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False))
paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None)) paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False)) paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False))
paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
...@@ -128,6 +128,7 @@ paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', ...@@ -128,6 +128,7 @@ paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates',
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.selu ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
...@@ -184,6 +185,7 @@ paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'] ...@@ -184,6 +185,7 @@ paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name']
paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
...@@ -273,6 +275,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k ...@@ -273,6 +275,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k
paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)) paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False))
paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None))
paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)) paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False))
paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
......
...@@ -4,11 +4,12 @@ add_subdirectory(framework) ...@@ -4,11 +4,12 @@ add_subdirectory(framework)
add_subdirectory(operators) add_subdirectory(operators)
add_subdirectory(string) add_subdirectory(string)
if (NOT WIN32)
add_subdirectory(pybind) add_subdirectory(pybind)
if (NOT WIN32)
add_subdirectory(recordio) add_subdirectory(recordio)
endif(NOT WIN32) endif(NOT WIN32)
# NOTE: please add subdirectory inference at last. # NOTE: please add subdirectory inference at last.
add_subdirectory(inference) add_subdirectory(inference)
add_subdirectory(train) add_subdirectory(train)
...@@ -136,20 +136,32 @@ cc_library(version SRCS version.cc) ...@@ -136,20 +136,32 @@ cc_library(version SRCS version.cc)
cc_test(version_test SRCS version_test.cc DEPS version) cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
if(NOT WIN32)
cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
endif(NOT WIN32)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
if (NOT WIN32)
py_proto_compile(framework_py_proto SRCS framework.proto) py_proto_compile(framework_py_proto SRCS framework.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module. # Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init) add_dependencies(framework_py_proto framework_py_proto_init)
add_custom_command(TARGET framework_py_proto POST_BUILD if (NOT WIN32)
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
COMMENT "Copy generated python proto into directory paddle/fluid/proto." COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
else(NOT WIN32)
string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
COMMAND copy /Y *.py ${proto_dstpath}
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif(NOT WIN32) endif(NOT WIN32)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
...@@ -163,10 +175,14 @@ if(WITH_DISTRIBUTE) ...@@ -163,10 +175,14 @@ if(WITH_DISTRIBUTE)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else() else()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) if(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
else(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
endif(NOT WIN32)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif() endif()
if (NOT WIN32) if (NOT WIN32)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
......
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
......
...@@ -79,9 +79,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -79,9 +79,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
BuildStrategy strategy_; BuildStrategy strategy_;
}; };
std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy() std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
const { bool finalize_strategy) const {
if (is_finalized_) {
return pass_builder_;
}
pass_builder_.reset(new ParallelExecutorPassBuilder(*this)); pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
if (finalize_strategy) {
is_finalized_ = true;
}
return pass_builder_; return pass_builder_;
} }
...@@ -95,10 +101,8 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -95,10 +101,8 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
#else #else
const bool use_cuda) const { const bool use_cuda) const {
#endif #endif
// Create a default one if not initialized by user. // Create a default one if not finalized by user.
if (!pass_builder_) { CreatePassesFromStrategy(false);
CreatePassesFromStrategy();
}
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program)); std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
......
...@@ -75,12 +75,20 @@ struct BuildStrategy { ...@@ -75,12 +75,20 @@ struct BuildStrategy {
bool remove_unnecessary_lock_{false}; bool remove_unnecessary_lock_{false};
// NOTE:
// Before you add new options, think if it's a general strategy that works
// with other strategy. If not, the strategy should be created through
// CreatePassesFromStrategy and the pass can be managed separately.
// User normally doesn't need to call this API. // User normally doesn't need to call this API.
// The PassBuilder allows for more customized insert, remove of passes // The PassBuilder allows for more customized insert, remove of passes
// from python side. // from python side.
// A new PassBuilder is created based on configs defined above and // A new PassBuilder is created based on configs defined above and
// passes are owned by the PassBuilder. // passes are owned by the PassBuilder.
std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const; std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy(
bool finalize_strategy) const;
bool IsFinalized() const { return is_finalized_; }
// Apply the passes built by the pass_builder_. The passes will be // Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph. // applied to the Program and output an ir::Graph.
...@@ -97,6 +105,7 @@ struct BuildStrategy { ...@@ -97,6 +105,7 @@ struct BuildStrategy {
#endif #endif
private: private:
mutable bool is_finalized_ = false;
mutable std::shared_ptr<ir::PassBuilder> pass_builder_; mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
}; };
......
...@@ -30,6 +30,8 @@ class ExceptionHolder { ...@@ -30,6 +30,8 @@ class ExceptionHolder {
Catch(exp); Catch(exp);
} catch (platform::EnforceNotMet exp) { } catch (platform::EnforceNotMet exp) {
Catch(exp); Catch(exp);
} catch (std::exception& ex) {
LOG(FATAL) << "std::exception caught, " << ex.what();
} catch (...) { } catch (...) {
LOG(FATAL) << "Unknown exception caught"; LOG(FATAL) << "Unknown exception caught";
} }
......
...@@ -30,8 +30,8 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( ...@@ -30,8 +30,8 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
local_scopes_(local_scopes), local_scopes_(local_scopes),
places_(places), places_(places),
graph_(std::move(graph)), graph_(std::move(graph)),
pool_(strategy.num_threads_ + pool_(strategy.num_threads_),
1), // add one more thread for generate op_deps prepare_pool_(1), // add one more thread for generate op_deps
fetch_ctxs_(places) { fetch_ctxs_(places) {
for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) { for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
int dep = static_cast<int>(op->NotReadyInputSize()); int dep = static_cast<int>(op->NotReadyInputSize());
...@@ -160,7 +160,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( ...@@ -160,7 +160,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
}); });
} }
void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
atomic_op_deps_ = pool_.enqueue([&] { atomic_op_deps_ = prepare_pool_.enqueue([&] {
auto *op_deps = new std::unordered_map<OpHandleBase *, std::atomic<int>>; auto *op_deps = new std::unordered_map<OpHandleBase *, std::atomic<int>>;
for (auto &pair : op_deps_) { for (auto &pair : op_deps_) {
(*op_deps)[pair.first] = pair.second; (*op_deps)[pair.first] = pair.second;
......
...@@ -46,6 +46,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -46,6 +46,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
std::vector<OpHandleBase *> bootstrap_ops_; std::vector<OpHandleBase *> bootstrap_ops_;
::ThreadPool pool_; ::ThreadPool pool_;
::ThreadPool prepare_pool_;
platform::DeviceContextPool fetch_ctxs_; platform::DeviceContextPool fetch_ctxs_;
std::atomic<int> remaining_; std::atomic<int> remaining_;
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
...@@ -25,6 +26,7 @@ limitations under the License. */ ...@@ -25,6 +26,7 @@ limitations under the License. */
DECLARE_bool(benchmark); DECLARE_bool(benchmark);
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -81,6 +83,24 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, ...@@ -81,6 +83,24 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
} }
} }
static void EnableFusedOp(ExecutorPrepareContext* ctx) {
#ifdef PADDLE_WITH_NGRAPH
VLOG(3) << "use_ngraph=True";
auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_);
for (auto& interval : intervals) {
auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_,
interval.at(0), interval.at(1));
*interval[0] = std::unique_ptr<OperatorBase>(fused_op);
}
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
ctx->ops_.erase(it->at(0) + 1, it->at(1));
}
#else
LOG(WARNING)
<< "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
#endif
}
Executor::Executor(const platform::Place& place) : place_(place) {} Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() { void Executor::Close() {
...@@ -338,6 +358,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare( ...@@ -338,6 +358,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
} }
if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
return ctx; return ctx;
} }
...@@ -359,6 +380,7 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare( ...@@ -359,6 +380,7 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars, bool create_local_scope, bool create_vars,
bool keep_kids) { bool keep_kids) {
PADDLE_ENFORCE_NOT_NULL(scope);
Scope* local_scope = scope; Scope* local_scope = scope;
if (create_vars) { if (create_vars) {
if (create_local_scope) { if (create_local_scope) {
...@@ -396,11 +418,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -396,11 +418,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
DeleteUnusedTensors(*local_scope, op.get(), gc.get(), DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
&(ctx->cur_ref_cnts_)); &(ctx->cur_ref_cnts_));
} }
if (FLAGS_benchmark) {
VLOG(20) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
} }
if (gc != nullptr) { if (gc != nullptr) {
...@@ -422,13 +439,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, ...@@ -422,13 +439,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
scope->DropKids(); scope->DropKids();
} }
} }
if (FLAGS_benchmark) {
VLOG(20) << "-------------------------------------------------------";
VLOG(20) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_);
VLOG(20) << "-------------------------------------------------------";
}
} }
void Executor::RunPreparedContext( void Executor::RunPreparedContext(
...@@ -485,6 +495,5 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) { ...@@ -485,6 +495,5 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
<< "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
#endif #endif
} }
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -29,7 +29,7 @@ template <typename T> ...@@ -29,7 +29,7 @@ template <typename T>
class GarbageCollector { class GarbageCollector {
public: public:
GarbageCollector(const platform::Place &place, size_t max_memory_size) GarbageCollector(const platform::Place &place, size_t max_memory_size)
: max_memory_size_(std::max(max_memory_size, static_cast<size_t>(1))) { : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
garbages_.reset(new std::deque<T *>()); garbages_.reset(new std::deque<T *>());
dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
} }
......
...@@ -5,6 +5,7 @@ file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") ...@@ -5,6 +5,7 @@ file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
# Usage: pass_library(target inference) will append to paddle_inference_pass.h # Usage: pass_library(target inference) will append to paddle_inference_pass.h
unset(INFER_IR_PASSES CACHE) # clear the global variable
function(pass_library TARGET DEST) function(pass_library TARGET DEST)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
...@@ -15,10 +16,11 @@ function(pass_library TARGET DEST) ...@@ -15,10 +16,11 @@ function(pass_library TARGET DEST)
if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference") if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
message(STATUS "add pass ${TARGET} ${DEST}") message(STATUS "add pass ${TARGET} ${DEST}")
file(APPEND ${pass_file} "USE_PASS(${TARGET});\n") file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE) set(INFER_IR_PASSES ${INFER_IR_PASSES} ${TARGET} CACHE INTERNAL "")
endif() endif()
endfunction() endfunction()
cc_library(node SRCS node.cc DEPS proto_desc) cc_library(node SRCS node.cc DEPS proto_desc)
cc_library(graph SRCS graph.cc DEPS node pretty_log) cc_library(graph SRCS graph.cc DEPS node pretty_log)
cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
...@@ -39,6 +41,7 @@ pass_library(seq_concat_fc_fuse_pass inference) ...@@ -39,6 +41,7 @@ pass_library(seq_concat_fc_fuse_pass inference)
pass_library(multi_batch_merge_pass base) pass_library(multi_batch_merge_pass base)
pass_library(conv_bn_fuse_pass inference) pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library(is_test_pass base)
if(WITH_MKLDNN) if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base) pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base) pass_library(depthwise_conv_mkldnn_pass base)
...@@ -63,6 +66,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r ...@@ -63,6 +66,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN) if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
......
...@@ -91,10 +91,10 @@ void FindWhileOp(Graph* graph) { ...@@ -91,10 +91,10 @@ void FindWhileOp(Graph* graph) {
#undef OP_SET_IN #undef OP_SET_IN
#undef OP_SET_OUT #undef OP_SET_OUT
auto* X = graph->RetriveNode(34); auto* X = graph->RetrieveNode(34);
auto* LSTMOUT = graph->RetriveNode(81); auto* LSTMOUT = graph->RetrieveNode(81);
auto* cell_init = graph->RetriveNode(6); auto* cell_init = graph->RetrieveNode(6);
auto* hidden_init = graph->RetriveNode(8); auto* hidden_init = graph->RetrieveNode(8);
auto* lstm_op = graph->CreateOpNode(&op_desc); auto* lstm_op = graph->CreateOpNode(&op_desc);
PrepareParameters(graph, param); PrepareParameters(graph, param);
...@@ -211,12 +211,12 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, ...@@ -211,12 +211,12 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
VLOG(30) << "LSTMWeight resized to " << out->dims(); VLOG(30) << "LSTMWeight resized to " << out->dims();
float* out_data = out->mutable_data<float>(platform::CPUPlace()); float* out_data = out->mutable_data<float>(platform::CPUPlace());
std::array<const float*, 4> tensors( std::array<const float*, 4> tensors{
{{W_forget_w0.data<float>(), W_input_w0.data<float>(), W_forget_w0.data<float>(), W_input_w0.data<float>(),
W_output_w0.data<float>(), W_cell_w0.data<float>()}}); W_output_w0.data<float>(), W_cell_w0.data<float>()};
std::array<const float*, 4> tensors1( std::array<const float*, 4> tensors1{
{{W_forget_w1.data<float>(), W_input_w1.data<float>(), W_forget_w1.data<float>(), W_input_w1.data<float>(),
W_output_w1.data<float>(), W_cell_w1.data<float>()}}); W_output_w1.data<float>(), W_cell_w1.data<float>()};
for (int row = 0; row < D; row++) { for (int row = 0; row < D; row++) {
for (int col = 0; col < 4; col++) { for (int col = 0; col < 4; col++) {
...@@ -238,9 +238,9 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, ...@@ -238,9 +238,9 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
const LoDTensor& B_output, const LoDTensor& B_cell, const LoDTensor& B_output, const LoDTensor& B_cell,
LoDTensor* out) { LoDTensor* out) {
std::array<const float*, 4> tensors( std::array<const float*, 4> tensors{
{{B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(), B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
B_cell.data<float>()}}); B_cell.data<float>()};
PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
int D = B_forget.dims()[0]; int D = B_forget.dims()[0];
......
...@@ -14,14 +14,15 @@ ...@@ -14,14 +14,15 @@
#include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.h"
#include <functional> #include <functional>
#include <utility> #include <list>
#include <map>
#include <tuple>
#include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_traits.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
namespace {
// The function keeps the graph consistent by replacing // The function keeps the graph consistent by replacing
// a node 'from' in the set of inputs nodes // a node 'from' in the set of inputs nodes
...@@ -52,97 +53,382 @@ void CorrectGraphEdges(Graph* graph, Node* from, Node* to) { ...@@ -52,97 +53,382 @@ void CorrectGraphEdges(Graph* graph, Node* from, Node* to) {
} }
} }
} }
} // namespace
using graph_ptr = std::unique_ptr<ir::Graph>;
graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { bool IsReachable(ir::Graph* graph, Node* from, Node* to) {
FusePassBase::Init(name_scope_, graph.get()); auto find_node = [](ir::Graph* graph, const Node* node) -> Node* {
for (auto n : graph->Nodes()) {
if (n == node) {
return n;
}
}
GraphPatternDetector gpd; return nullptr;
auto pattern = gpd.mutable_pattern(); };
patterns::Conv conv_pattern{pattern, name_scope_}; if (from == to) {
auto conv_output = conv_pattern(); return true;
}
std::map<Node*, bool> visited;
patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_}; for (auto& node : GraphTraits::DFS(*graph)) {
visited[&node] = false;
}
elementwise_add_pattern(conv_output); visited[from] = true;
conv_output->AsIntermediate(); std::list<Node*> queue;
queue.push_back(from);
while (!queue.empty()) {
auto cur = find_node(graph, queue.front());
queue.pop_front();
if (!cur) return false;
auto conv_op_has_bias = [](const Node& conv_op) -> std::pair<bool, Node*> { for (auto n : cur->outputs) {
auto bias_input_names = conv_op.Op()->Inputs(); if (n == to) {
auto bias_it = bias_input_names.find("Bias"); return true;
if (bias_it != std::end(bias_input_names)) {
bool has_bias = !bias_it->second.empty();
if (has_bias) {
auto conv_bias_names = bias_it->second;
auto conv_bias_names_it =
std::find_if(std::begin(conv_op.inputs), std::end(conv_op.inputs),
[&conv_bias_names](Node* n) -> bool {
return n->Name() == conv_bias_names[0];
});
return std::make_pair(has_bias, *conv_bias_names_it);
} }
if (!visited[n]) {
visited[n] = true;
queue.push_back(n);
}
}
}
return false;
}
boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name) {
auto bias_input_names = op.Op()->Inputs();
auto bias_it = bias_input_names.find(bias_name);
if (bias_it != std::end(bias_input_names)) {
bool has_bias = !bias_it->second.empty();
if (has_bias) {
auto bias_names = bias_it->second;
auto bias_names_it =
std::find_if(std::begin(op.inputs), std::end(op.inputs),
[&bias_names](Node* n) -> bool {
return n->Name() == bias_names[0];
});
return *bias_names_it;
} }
}
return std::make_pair(false, nullptr); return boost::none;
}; }
ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle(
const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func,
const ResidualConnectionMKLDNNFusePass::IdentityConvFunc&
get_node_from_conv_op,
const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc&
get_node_from_elementwise_add_op)
: fusion_stats{std::make_shared<int>(0)},
can_fuse_func{can_fuse_func},
get_node_from_conv_op{get_node_from_conv_op},
get_node_from_elementwise_add_op{get_node_from_elementwise_add_op} {}
void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()(
const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
Node* conv_op;
Node* conv_input;
Node* conv_filter;
Node* conv_output;
Node* elementwise_add_op;
Node* elementwise_add_identity;
Node* elementwise_add_out;
std::tie(conv_op, conv_input, conv_filter, conv_output) =
get_node_from_conv_op(subgraph);
std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) =
get_node_from_elementwise_add_op(subgraph);
if (!can_fuse_func(conv_op, elementwise_add_op)) return;
if (!IsReachable(graph, elementwise_add_identity, conv_output)) return;
OpDesc op_desc;
op_desc.SetType("conv2d");
op_desc.SetInput("Input", {conv_input->Name()});
op_desc.SetInput("Filter", {conv_filter->Name()});
op_desc.SetInput("ResidualData", {elementwise_add_identity->Name()});
op_desc.SetOutput("Output", {conv_output->Name()});
auto conv_bias = HasBias(*conv_op, "Bias");
if (conv_bias) {
op_desc.SetInput("Bias", {(*conv_bias)->Name()});
}
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, for (const auto& attr : conv_op->Op()->GetAttrMap()) {
Graph* g) { op_desc.SetAttr(attr.first, attr.second);
GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); }
GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; op_desc.SetAttr("fuse_residual_connection", true);
OpDesc op_desc; auto fused_conv_op = graph->CreateOpNode(&op_desc);
op_desc.SetType("conv2d");
op_desc.SetInput("Input", {conv_input->Name()}); IR_NODE_LINK_TO(conv_input, fused_conv_op);
op_desc.SetInput("Filter", {conv_filter->Name()}); IR_NODE_LINK_TO(conv_filter, fused_conv_op);
op_desc.SetInput("ResidualData", {elementwise_add_x->Name()}); IR_NODE_LINK_TO(elementwise_add_identity, fused_conv_op);
op_desc.SetOutput("Output", {conv_output->Name()}); IR_NODE_LINK_TO(fused_conv_op, conv_output);
bool has_bias; if (conv_bias) {
Node* conv_bias; IR_NODE_LINK_TO((*conv_bias), fused_conv_op);
}
std::tie(has_bias, conv_bias) = conv_op_has_bias(*conv_op); CorrectGraphEdges(graph, elementwise_add_out, conv_output);
GraphSafeRemoveNodes(graph,
{elementwise_add_out, conv_op, elementwise_add_op});
(*fusion_stats)++;
}
if (has_bias) { ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle(
op_desc.SetInput("Bias", {conv_bias->Name()}); const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func,
} const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc&
get_node_from_conv_x_op,
const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc&
get_node_from_conv_y_op,
const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc&
get_node_from_elementwise_add_op)
: fusion_stats{std::make_shared<int>(0)},
can_fuse_func{can_fuse_func},
get_node_from_conv_x_op{get_node_from_conv_x_op},
get_node_from_conv_y_op{get_node_from_conv_y_op},
get_node_from_elementwise_add_op{get_node_from_elementwise_add_op} {}
void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()(
const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
Node* conv_x_op;
Node* conv_x_input;
Node* conv_x_filter;
Node* conv_x_output;
Node* conv_y_op;
Node* conv_y_input;
Node* conv_y_filter;
Node* conv_y_output;
Node* elementwise_add_op;
Node* elementwise_add_out;
std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) =
get_node_from_conv_x_op(subgraph);
std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) =
get_node_from_conv_y_op(subgraph);
std::tie(elementwise_add_op, elementwise_add_out) =
get_node_from_elementwise_add_op(subgraph);
if (!can_fuse_func(conv_x_op, elementwise_add_op)) return;
if (!can_fuse_func(conv_y_op, elementwise_add_op)) return;
Node* projection_node;
Node* residual_conv_op;
Node* residual_conv_input;
Node* residual_conv_filter;
Node* residual_conv_output;
if (IsReachable(graph, conv_x_input, conv_y_output)) {
projection_node = conv_x_output;
residual_conv_op = conv_y_op;
residual_conv_input = conv_y_input;
residual_conv_filter = conv_y_filter;
residual_conv_output = conv_y_output;
} else if (IsReachable(graph, conv_y_input, conv_x_output)) {
projection_node = conv_y_output;
residual_conv_op = conv_x_op;
residual_conv_input = conv_x_input;
residual_conv_filter = conv_x_filter;
residual_conv_output = conv_x_output;
} else {
return;
}
for (const auto& attr : conv_op->Op()->GetAttrMap()) { OpDesc op_desc;
op_desc.SetAttr(attr.first, attr.second); op_desc.SetType("conv2d");
}
op_desc.SetAttr("fuse_residual_connection", true); op_desc.SetInput("Input", {residual_conv_input->Name()});
op_desc.SetInput("Filter", {residual_conv_filter->Name()});
op_desc.SetInput("ResidualData", {projection_node->Name()});
op_desc.SetOutput("Output", {residual_conv_output->Name()});
auto fused_conv_op = g->CreateOpNode(&op_desc); auto residual_conv_bias = HasBias(*residual_conv_op, "Bias");
IR_NODE_LINK_TO(conv_input, fused_conv_op); if (residual_conv_bias) {
IR_NODE_LINK_TO(conv_filter, fused_conv_op); op_desc.SetInput("Bias", {(*residual_conv_bias)->Name()});
IR_NODE_LINK_TO(elementwise_add_x, fused_conv_op); }
IR_NODE_LINK_TO(fused_conv_op, conv_output);
CorrectGraphEdges(g, elementwise_add_out, conv_output); for (const auto& attr : residual_conv_op->Op()->GetAttrMap()) {
GraphSafeRemoveNodes(g, {elementwise_add_out, conv_op, elementwise_add_op}); op_desc.SetAttr(attr.first, attr.second);
}; }
op_desc.SetAttr("fuse_residual_connection", true);
auto fused_conv_op = graph->CreateOpNode(&op_desc);
IR_NODE_LINK_TO(residual_conv_input, fused_conv_op);
IR_NODE_LINK_TO(residual_conv_filter, fused_conv_op);
IR_NODE_LINK_TO(projection_node, fused_conv_op);
IR_NODE_LINK_TO(fused_conv_op, residual_conv_output);
if (residual_conv_bias) {
IR_NODE_LINK_TO((*residual_conv_bias), fused_conv_op);
}
CorrectGraphEdges(graph, elementwise_add_out, residual_conv_output);
GraphSafeRemoveNodes(
graph, {elementwise_add_out, residual_conv_op, elementwise_add_op});
(*fusion_stats)++;
}
std::tuple<Node*, Node*, Node*, Node*>
ResidualConnectionMKLDNNFusePass::GetNodesFromConv(
const patterns::Conv& conv_pattern,
const GraphPatternDetector::subgraph_t& subgraph) const {
GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
return std::make_tuple(conv_op, conv_input, conv_filter, conv_output);
}
GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX(
const std::string& name_scope,
const GraphWithStats& graph_with_stats) const {
ir::Graph* graph;
int stats;
std::tie(graph, stats) = graph_with_stats;
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::Conv conv_pattern{pattern, name_scope};
auto conv_output = conv_pattern();
gpd(graph.get(), handler); patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
elementwise_add_pattern(
conv_output,
pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr()));
conv_output->AsIntermediate();
auto get_node_from_elementwise_add = [&elementwise_add_pattern](
const GraphPatternDetector::subgraph_t& subgraph)
-> std::tuple<Node*, Node*, Node*> {
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
return std::make_tuple(elementwise_add_op, elementwise_add_y,
elementwise_add_out);
};
return ExecuteHandleOnGraph<IdentityFuseHandle>(
&gpd, graph_with_stats,
[this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
return GetNodesFromConv(conv_pattern, subgraph);
},
get_node_from_elementwise_add);
}
GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY(
const std::string& name_scope,
const GraphWithStats& graph_with_stats) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::Conv conv_pattern{pattern, name_scope};
auto conv_output = conv_pattern();
patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
elementwise_add_pattern(
pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()),
conv_output);
conv_output->AsIntermediate();
auto get_node_from_elementwise_add = [&elementwise_add_pattern](
const GraphPatternDetector::subgraph_t& subgraph)
-> std::tuple<Node*, Node*, Node*> {
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
return std::make_tuple(elementwise_add_op, elementwise_add_x,
elementwise_add_out);
};
return ExecuteHandleOnGraph<IdentityFuseHandle>(
&gpd, graph_with_stats,
[this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
return GetNodesFromConv(conv_pattern, subgraph);
},
get_node_from_elementwise_add);
}
GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv(
const std::string& name_scope,
const GraphWithStats& graph_with_stats) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::Conv conv_x_pattern{pattern, name_scope};
auto conv_x_output = conv_x_pattern();
patterns::Conv conv_y_pattern{pattern, name_scope};
auto conv_y_output = conv_y_pattern();
patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope};
elementwise_add_pattern(conv_x_output, conv_y_output);
conv_x_output->AsIntermediate();
conv_y_output->AsIntermediate();
auto get_node_from_elementwise_add = [&elementwise_add_pattern](
const GraphPatternDetector::subgraph_t& subgraph)
-> std::tuple<Node*, Node*> {
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
elementwise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
elementwise_add_pattern);
return std::make_tuple(elementwise_add_op, elementwise_add_out);
};
return ExecuteHandleOnGraph<ProjectionFuseHandle>(
&gpd, graph_with_stats,
[this,
&conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
return GetNodesFromConv(conv_x_pattern, subgraph);
},
[this,
&conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) {
return GetNodesFromConv(conv_y_pattern, subgraph);
},
get_node_from_elementwise_add);
}
graph_ptr ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
FusePassBase::Init(name_scope_, graph.get());
auto fused_graph_with_stats = FuseConvAsY(
name_scope_,
FuseConvAsX(
name_scope_,
FuseProjectionConv(name_scope_, std::make_pair(graph.get(), 0))));
std::cout << "Fused graph " << fused_graph_with_stats.second << std::endl;
AddStatis(fused_graph_with_stats.second);
return graph; return graph;
} }
} // namespace ir } // namespace ir
...@@ -150,4 +436,4 @@ graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { ...@@ -150,4 +436,4 @@ graph_ptr ConvElementwiseAddMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
} // namespace paddle } // namespace paddle
REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass, REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass,
paddle::framework::ir::ConvElementwiseAddMKLDNNFusePass); paddle::framework::ir::ResidualConnectionMKLDNNFusePass);
...@@ -15,24 +15,119 @@ ...@@ -15,24 +15,119 @@
#pragma once #pragma once
#include <string> #include <string>
#include <tuple>
#include <utility>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include <boost/optional.hpp>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
class ConvElementwiseAddMKLDNNFusePass : public FusePassBase { using graph_ptr = std::unique_ptr<ir::Graph>;
using GraphWithStats = std::pair<ir::Graph*, int>;
void CorrectGraphEdges(Graph* graph, Node* from, Node* to);
bool IsReachable(ir::Graph* graph, Node* from, Node* to);
boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name);
class ResidualConnectionMKLDNNFusePass : public FusePassBase {
private:
GraphWithStats FuseConvAsX(const std::string& name_scope,
const GraphWithStats& graph_with_stats) const;
GraphWithStats FuseConvAsY(const std::string& name_scope,
const GraphWithStats& graph_with_stats) const;
GraphWithStats FuseProjectionConv(
const std::string& name_scope,
const GraphWithStats& graph_with_stats) const;
template <typename RetType>
using GetNodeFunc =
std::function<RetType(const GraphPatternDetector::subgraph_t& subgraph)>;
using IdentityConvFunc = GetNodeFunc<std::tuple<Node*, Node*, Node*, Node*>>;
using IdentityElementwiseAddFunc =
GetNodeFunc<std::tuple<Node*, Node*, Node*>>;
using ProjectionConvFunc = IdentityConvFunc;
using ProjectionElementwiseAddFunc = GetNodeFunc<std::tuple<Node*, Node*>>;
using CanFuseFunc = std::function<bool(Node*, Node*)>;
std::tuple<Node*, Node*, Node*, Node*> GetNodesFromConv(
const patterns::Conv& conv_pattern,
const GraphPatternDetector::subgraph_t& subgraph) const;
std::tuple<Node*, Node*, Node*, Node*> GetNodesFromProjectionConv(
const patterns::Conv& conv_pattern,
const GraphPatternDetector::subgraph_t& subgraph) const;
template <typename HandleType, typename... OpFuncs>
GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd,
const GraphWithStats& graph_with_stats,
OpFuncs&&... op_funcs) const {
ir::Graph* graph;
int stats;
std::tie(graph, stats) = graph_with_stats;
auto can_fuse = [this](Node* op1, Node* op2) -> bool {
return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN;
};
auto fuse_handle = HandleType{can_fuse, std::forward<OpFuncs>(op_funcs)...};
(*gpd)(graph, fuse_handle);
return std::make_pair(graph, stats + fuse_handle.get_stats());
}
struct IdentityFuseHandle {
IdentityFuseHandle(
const CanFuseFunc& can_fuse_func,
const IdentityConvFunc& get_node_from_conv_op,
const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op);
void operator()(const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph);
int get_stats() const { return *fusion_stats; }
private:
std::shared_ptr<int> fusion_stats;
CanFuseFunc can_fuse_func;
IdentityConvFunc get_node_from_conv_op;
IdentityElementwiseAddFunc get_node_from_elementwise_add_op;
};
struct ProjectionFuseHandle {
ProjectionFuseHandle(
const CanFuseFunc& can_fuse_func,
const ProjectionConvFunc& get_node_from_conv_x_op,
const ProjectionConvFunc& get_node_from_conv_y_op,
const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op);
void operator()(const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph);
int get_stats() const { return *fusion_stats; }
private:
std::shared_ptr<int> fusion_stats;
CanFuseFunc can_fuse_func;
ProjectionConvFunc get_node_from_conv_x_op;
ProjectionConvFunc get_node_from_conv_y_op;
ProjectionElementwiseAddFunc get_node_from_elementwise_add_op;
};
public: public:
virtual ~ConvElementwiseAddMKLDNNFusePass() {} virtual ~ResidualConnectionMKLDNNFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(graph_ptr graph) const;
const std::string name_scope_{"residual_connections_fuse_pass"}; const std::string name_scope_{"residual_connection_fuse_pass"};
}; };
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -41,7 +41,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, ...@@ -41,7 +41,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
} }
struct IsReachable { struct TestIsReachable {
using func = std::function<bool(const std::string&, const std::string&)>; using func = std::function<bool(const std::string&, const std::string&)>;
auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func { auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
...@@ -90,7 +90,9 @@ struct IsReachable { ...@@ -90,7 +90,9 @@ struct IsReachable {
} }
}; };
void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) { void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph,
int expected_conv_count,
int expected_elementwise_add_count = 0) {
int conv_count = 0; int conv_count = 0;
int elementwise_add_count = 0; int elementwise_add_count = 0;
...@@ -102,8 +104,8 @@ void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) { ...@@ -102,8 +104,8 @@ void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
++elementwise_add_count; ++elementwise_add_count;
} }
} }
EXPECT_EQ(conv_count, 1); EXPECT_EQ(conv_count, expected_conv_count);
EXPECT_EQ(elementwise_add_count, 0); EXPECT_EQ(elementwise_add_count, expected_elementwise_add_count);
} }
ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars, ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
...@@ -128,23 +130,13 @@ ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars, ...@@ -128,23 +130,13 @@ ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
return prog; return prog;
} }
} // namespace
TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
auto prog =
BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
SetOp(&prog, "conv2d",
{{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "b"});
SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
IsReachable is_reachable; void RunPassAndAssert(ProgramDesc* prog, const std::string& from,
const std::string& to, int expected_conv_num) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(*prog));
EXPECT_TRUE(is_reachable(graph)("a", "relu")); TestIsReachable is_reachable;
EXPECT_TRUE(is_reachable(graph)(from, to));
auto pass = auto pass =
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
...@@ -152,82 +144,87 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) { ...@@ -152,82 +144,87 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
graph = pass->Apply(std::move(graph)); graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size(); int current_nodes_num = graph->Nodes().size();
EXPECT_TRUE(is_reachable(graph)("a", "relu")); EXPECT_TRUE(is_reachable(graph)(from, to));
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
current_nodes_num); current_nodes_num);
AssertOpsCount(graph); AssertOpsCount(graph, expected_conv_num);
} }
} // namespace
TEST(ConvElementwiseAddMKLDNNFusePass, TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) {
ConvolutionWithElementwiseAddReluNoBias) { auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
SetOp(&prog, "conv2d", {{"Input", "a"}, {"Filter", "weights"}},
{"Output", "b"});
SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
IsReachable is_reachable; SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d",
{{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "c"});
EXPECT_TRUE(is_reachable(graph)("a", "relu")); SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
auto pass = RunPassAndAssert(&prog, "a", "relu", 1);
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); }
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_TRUE(is_reachable(graph)("a", "relu")); TEST(ConvElementwiseAddMKLDNNFusePass,
ConvolutionAsYWithElementwiseAddReluNoBias) {
auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
current_nodes_num); SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
{"Output", "c"});
SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
AssertOpsCount(graph); RunPassAndAssert(&prog, "a", "relu", 1);
} }
TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionElementwiseAdd) { TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsXWithElementwiseAddRelu) {
auto prog = BuildProgramDesc({"a", "b", "c", "d"}, {"bias", "weights"}); auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d", SetOp(&prog, "conv2d",
{{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}}, {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "b"}); {"Output", "c"});
SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog)); SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
IsReachable is_reachable; RunPassAndAssert(&prog, "a", "relu", 1);
EXPECT_TRUE(is_reachable(graph)("a", "d")); }
auto pass = TEST(ConvElementwiseAddMKLDNNFusePass,
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); ConvolutionAsXWithElementwiseAddReluNoBias) {
int original_nodes_num = graph->Nodes().size(); auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
EXPECT_FALSE(is_reachable(graph)("a", "d")); SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
{"Output", "c"});
SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, RunPassAndAssert(&prog, "a", "relu", 1);
current_nodes_num);
AssertOpsCount(graph);
} }
TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) { TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) {
auto prog = auto prog =
BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"}); BuildProgramDesc({"a", "b", "c", "d", "e", "f", "g"}, {"weights"});
SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"}); SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
SetOp(&prog, "conv2d", SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
{{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
{"Output", "c"}); {"Output", "c"});
SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "d"}}, {"Out", "e"});
SetOp(&prog, "relu", {{"X", "e"}}, {"Out", "f"});
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog)); SetOp(&prog, "conv2d", {{"Input", "d"}, {"Filter", "weights"}},
{"Output", "e"});
IsReachable is_reachable; SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "e"}}, {"Out", "f"});
SetOp(&prog, "relu", {{"X", "f"}}, {"Out", "g"});
EXPECT_TRUE(is_reachable(graph)("a", "f")); std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
TestIsReachable is_reachable;
EXPECT_TRUE(is_reachable(graph)("a", "g"));
auto pass = auto pass =
PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
...@@ -235,11 +232,10 @@ TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) { ...@@ -235,11 +232,10 @@ TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
graph = pass->Apply(std::move(graph)); graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size(); int current_nodes_num = graph->Nodes().size();
EXPECT_TRUE(is_reachable(graph)("a", "f")); EXPECT_TRUE(is_reachable(graph)("a", "g"));
EXPECT_EQ(original_nodes_num, current_nodes_num);
EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, AssertOpsCount(graph, 2, 1);
current_nodes_num);
AssertOpsCount(graph);
} }
} // namespace ir } // namespace ir
......
...@@ -57,6 +57,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( ...@@ -57,6 +57,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
desc.SetInput("W", std::vector<std::string>({fc_Y_in})); desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
desc.SetInput("Bias", std::vector<std::string>({fc_bias_in})); desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
desc.SetOutput("Out", std::vector<std::string>({fc_out_out})); desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims"));
desc.SetType("fc"); desc.SetType("fc");
auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out}); GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});
......
...@@ -29,6 +29,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, ...@@ -29,6 +29,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
if (type == "mul") { if (type == "mul") {
op->SetInput("X", {inputs[0]}); op->SetInput("X", {inputs[0]});
op->SetInput("Y", {inputs[1]}); op->SetInput("Y", {inputs[1]});
op->SetAttr("x_num_col_dims", {1});
} else if (type == "elementwise_add") { } else if (type == "elementwise_add") {
op->SetInput("X", inputs); op->SetInput("X", inputs);
} }
......
...@@ -84,8 +84,6 @@ void CheckProgram(const ProgramDesc &program) { ...@@ -84,8 +84,6 @@ void CheckProgram(const ProgramDesc &program) {
Graph::Graph(const ProgramDesc &program) : program_(program) { Graph::Graph(const ProgramDesc &program) : program_(program) {
CheckProgram(program_); CheckProgram(program_);
// Make the nodes id start from 0.
Node::ResetId();
auto var_nodes = InitFromProgram(program_); auto var_nodes = InitFromProgram(program_);
ResolveHazard(var_nodes); ResolveHazard(var_nodes);
} }
......
...@@ -116,13 +116,17 @@ class Graph { ...@@ -116,13 +116,17 @@ class Graph {
// Create a normal variable with non-null VarDesc. // Create a normal variable with non-null VarDesc.
ir::Node *CreateVarNode(VarDesc *var_desc) { ir::Node *CreateVarNode(VarDesc *var_desc) {
PADDLE_ENFORCE(var_desc); PADDLE_ENFORCE(var_desc);
return AddNode(new ir::Node(var_desc)); auto *x = AddNode(new ir::Node(var_desc));
x->SetId(num_node_created_++);
return x;
} }
// Create a normal runnable operator with OpDesc. // Create a normal runnable operator with OpDesc.
ir::Node *CreateOpNode(OpDesc *op_desc) { ir::Node *CreateOpNode(OpDesc *op_desc) {
PADDLE_ENFORCE(op_desc); PADDLE_ENFORCE(op_desc);
return AddNode(new ir::Node(op_desc)); auto *x = AddNode(new ir::Node(op_desc));
x->SetId(num_node_created_++);
return x;
} }
// Create a control dependency var that connects 2 operations. The // Create a control dependency var that connects 2 operations. The
...@@ -132,13 +136,17 @@ class Graph { ...@@ -132,13 +136,17 @@ class Graph {
// TODO(panyx0718): control var name should be really unique. // TODO(panyx0718): control var name should be really unique.
const std::string name = string::Sprintf( const std::string name = string::Sprintf(
"%s@%llu", ir::Node::kControlDepVarName, node_set_.size()); "%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
return AddNode(new ir::Node(name, ir::Node::Type::kVariable)); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable));
x->SetId(num_node_created_++);
return x;
} }
// A more free style way of creating a graph node. Mostly use for test // A more free style way of creating a graph node. Mostly use for test
// or "copy" from another node. Avoid using it if possible. // or "copy" from another node. Avoid using it if possible.
ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) { ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) {
return AddNode(new ir::Node(name, type)); auto *x = AddNode(new ir::Node(name, type));
x->SetId(num_node_created_++);
return x;
} }
// Clear all node information of the graph and return the ownership of the // Clear all node information of the graph and return the ownership of the
...@@ -160,7 +168,7 @@ class Graph { ...@@ -160,7 +168,7 @@ class Graph {
} }
// NOTE low performance, but simple and secure. // NOTE low performance, but simple and secure.
Node *RetriveNode(int id) { Node *RetrieveNode(int id) {
for (auto &node : nodes_) { for (auto &node : nodes_) {
if (node.second->id() == id) { if (node.second->id() == id) {
return node.second.get(); return node.second.get();
...@@ -169,6 +177,7 @@ class Graph { ...@@ -169,6 +177,7 @@ class Graph {
return nullptr; return nullptr;
} }
const ProgramDesc &program() const { return program_; }
std::map<std::string, std::vector<ir::Node *>> InitFromProgram( std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program); const ProgramDesc &program);
...@@ -190,6 +199,7 @@ class Graph { ...@@ -190,6 +199,7 @@ class Graph {
std::map<std::string, std::function<void(void)>> attr_dels_; std::map<std::string, std::function<void(void)>> attr_dels_;
std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_; std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
std::unordered_set<ir::Node *> node_set_; std::unordered_set<ir::Node *> node_set_;
size_t num_node_created_{0}; // help to generate a unique node id.
}; };
bool IsControlDepVar(const ir::Node &var); bool IsControlDepVar(const ir::Node &var);
......
...@@ -15,8 +15,15 @@ limitations under the License. */ ...@@ -15,8 +15,15 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
#include <algorithm> #include <algorithm>
#include <deque> #include <deque>
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <unordered_set> #include <unordered_set>
DEFINE_string(print_sub_graph_dir, "",
"FLAGS_print_sub_graph_dir is used "
"to print the nodes of sub_graphs.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
...@@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) { ...@@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) {
graph_nodes.emplace_back(g_nodes); graph_nodes.emplace_back(g_nodes);
} }
if (VLOG_IS_ON(100)) { if (FLAGS_print_sub_graph_dir.size()) {
VLOG(100) << "graph_num: " << graph_nodes.size(); if (graph_nodes.size() > 1) {
for (auto &g_n : graph_nodes) { std::stringstream out;
VLOG(100) << "graph_nodes: " << g_n.size(); for (auto &g_n : graph_nodes) {
if (g_n.size() < 10) { out << "graph_nodes: " << g_n.size() << "\n";
std::stringstream out; }
out << "\n\n";
for (auto &g_n : graph_nodes) {
out << "graph_nodes: " << g_n.size();
for (auto &node : g_n) { for (auto &node : g_n) {
out << "\nNode: " << node->Name() << " in ["; out << "\nNode: " << node->Name() << " in [";
for (auto &n : node->inputs) { for (auto &n : node->inputs) {
...@@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) { ...@@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) {
} }
out << "]"; out << "]";
} }
VLOG(100) << out.str(); out << "\n\n\n";
} }
std::unique_ptr<std::ostream> fout(
new std::ofstream(FLAGS_print_sub_graph_dir));
PADDLE_ENFORCE(fout->good());
*fout << out.str();
} }
} }
......
...@@ -167,10 +167,12 @@ struct HitGroup { ...@@ -167,10 +167,12 @@ struct HitGroup {
bool Match(Node *node, PDNode *pat) { bool Match(Node *node, PDNode *pat) {
if (nodes_.count(node)) { if (nodes_.count(node)) {
if (!roles.count(pat)) return false; if (roles.count(pat) && roles[pat] == node) return true;
return roles[pat] == node; return false;
} else {
if (roles.count(pat) && roles[pat] != node) return false;
return true;
} }
return !roles.count(pat) || roles.at(pat) == node;
} }
void Register(Node *node, PDNode *pat) { void Register(Node *node, PDNode *pat) {
...@@ -198,7 +200,6 @@ GraphPatternDetector::DetectPatterns() { ...@@ -198,7 +200,6 @@ GraphPatternDetector::DetectPatterns() {
std::vector<GraphPatternDetector::subgraph_t> result; std::vector<GraphPatternDetector::subgraph_t> result;
std::vector<HitGroup> init_groups; std::vector<HitGroup> init_groups;
std::array<std::vector<HitGroup>, 2> bi_records; std::array<std::vector<HitGroup>, 2> bi_records;
// PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed");
auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get() auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
: pattern_.edges().front().first; : pattern_.edges().front().first;
if (!pdnodes2nodes_.count(first_pnode)) return result; if (!pdnodes2nodes_.count(first_pnode)) return result;
...@@ -228,11 +229,12 @@ GraphPatternDetector::DetectPatterns() { ...@@ -228,11 +229,12 @@ GraphPatternDetector::DetectPatterns() {
VLOG(80) << "check " << source->id() << " -- " << target->id(); VLOG(80) << "check " << source->id() << " -- " << target->id();
// TODO(Superjomn) add some prune strategies. // TODO(Superjomn) add some prune strategies.
for (const auto &group : pre_groups) { for (const auto &group : pre_groups) {
HitGroup new_group = group; if (IsNodesLink(source, target)) {
if (IsNodesLink(source, target) && HitGroup new_group = group;
new_group.Match(source, edge.first)) { bool flag = new_group.Match(source, edge.first) &&
new_group.Register(source, edge.first); new_group.Match(target, edge.second);
if (new_group.Match(target, edge.second)) { if (flag) {
new_group.Register(source, edge.first);
new_group.Register(target, edge.second); new_group.Register(target, edge.second);
cur_groups.push_back(new_group); cur_groups.push_back(new_group);
// TODO(Superjomn) need to unique // TODO(Superjomn) need to unique
...@@ -261,14 +263,16 @@ GraphPatternDetector::DetectPatterns() { ...@@ -261,14 +263,16 @@ GraphPatternDetector::DetectPatterns() {
return result; return result;
} }
bool GraphItemCMP(const std::pair<PDNode *, Node *> &a, struct GraphItemLessThan {
bool operator()(const std::pair<PDNode *, Node *> &a,
const std::pair<PDNode *, Node *> &b) { const std::pair<PDNode *, Node *> &b) {
if (a.first != b.first) { if (a.first != b.first) {
return a.first < b.first; return a.first < b.first;
} else { } else {
return a.second < b.second; return a.second < b.second;
}
} }
} };
// TODO(Superjomn) enhance the function as it marks unique unique as duplicates // TODO(Superjomn) enhance the function as it marks unique unique as duplicates
// see https://github.com/PaddlePaddle/Paddle/issues/13550 // see https://github.com/PaddlePaddle/Paddle/issues/13550
...@@ -282,7 +286,7 @@ void GraphPatternDetector::UniquePatterns( ...@@ -282,7 +286,7 @@ void GraphPatternDetector::UniquePatterns(
for (auto &g : *subgraphs) { for (auto &g : *subgraphs) {
// Sort the items in the sub-graph, and transform to a string key. // Sort the items in the sub-graph, and transform to a string key.
std::vector<std::pair<PDNode *, Node *>> sorted_keys(g.begin(), g.end()); std::vector<std::pair<PDNode *, Node *>> sorted_keys(g.begin(), g.end());
std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP); std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan());
std::stringstream ss; std::stringstream ss;
for (auto &item : sorted_keys) { for (auto &item : sorted_keys) {
ss << item.first << ":" << item.second; ss << item.first << ":" << item.second;
...@@ -1080,16 +1084,12 @@ PDNode *patterns::Conv::operator()() { ...@@ -1080,16 +1084,12 @@ PDNode *patterns::Conv::operator()() {
return output_var; return output_var;
} }
PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var) { PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
->assert_is_op("elementwise_add"); ->assert_is_op("elementwise_add");
x_var->assert_is_op_input("elementwise_add", "X"); x_var->AsInput()->assert_is_op_input("elementwise_add", "X");
y_var->AsInput()->assert_is_op_input("elementwise_add", "Y");
auto y_var = pattern->NewNode(elementwise_add_x_repr())
->AsInput()
->assert_is_op_input("elementwise_add", "Y");
auto out_var = pattern->NewNode(elementwise_add_out_repr()) auto out_var = pattern->NewNode(elementwise_add_out_repr())
->AsOutput() ->AsOutput()
->assert_is_op_output("elementwise_add", "Out"); ->assert_is_op_output("elementwise_add", "Out");
......
...@@ -310,8 +310,8 @@ void GraphSafeRemoveNodes(Graph* graph, ...@@ -310,8 +310,8 @@ void GraphSafeRemoveNodes(Graph* graph,
const std::unordered_set<const Node*>& nodes); const std::unordered_set<const Node*>& nodes);
// Some pre-defined patterns those can be reused in multiple passes. // Some pre-defined patterns those can be reused in multiple passes.
// The related Fluid Layer or Op should be one pattern here for better reusage // The related Fluid Layer or Op should be one pattern here for better re-usage
// accross different fusion. // across different fusion.
namespace patterns { namespace patterns {
struct KeyCounter { struct KeyCounter {
...@@ -664,7 +664,7 @@ struct ElementwiseAdd : public PatternBase { ...@@ -664,7 +664,7 @@ struct ElementwiseAdd : public PatternBase {
ElementwiseAdd(PDPattern* pattern, const std::string& name_scope) ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "elementwise_add") {} : PatternBase(pattern, name_scope, "elementwise_add") {}
PDNode* operator()(PDNode* x_var); PDNode* operator()(PDNode* x_var, PDNode* y_var);
PATTERN_DECL_NODE(elementwise_add_op); PATTERN_DECL_NODE(elementwise_add_op);
PATTERN_DECL_NODE(elementwise_add_x); PATTERN_DECL_NODE(elementwise_add_x);
......
...@@ -35,10 +35,11 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl( ...@@ -35,10 +35,11 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
new proto::ProgramDesc(*program.Proto())); new proto::ProgramDesc(*program.Proto()));
auto block = program_pb->mutable_blocks(kRootBlockIndex); auto block = program_pb->mutable_blocks(kRootBlockIndex);
block->set_idx(kRootBlockIndex);
block->clear_vars(); block->clear_vars();
std::unordered_set<std::string> visited_vars; std::unordered_set<std::string> visited_vars;
for (ir::Node* n : graph->Nodes()) { for (ir::Node* n : graph->Nodes()) {
if (n->NodeType() == ir::Node::Type::kVariable) { if (n->IsVar()) {
if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) { if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) {
visited_vars.insert(n->Var()->Name()); visited_vars.insert(n->Var()->Name());
block->add_vars()->MergeFrom(*n->Var()->Proto()); block->add_vars()->MergeFrom(*n->Var()->Proto());
......
...@@ -66,6 +66,76 @@ NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) { ...@@ -66,6 +66,76 @@ NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) {
} }
Node *NodesDFSIterator::operator->() { return stack_.top(); } Node *NodesDFSIterator::operator->() { return stack_.top(); }
inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
return node.inputs.size() == n;
}
NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
PADDLE_ENFORCE(!source.empty(),
"Start points of topological sorting should not be empty!");
// CHECK all the inputs' in-degree is 0
for (auto *node : source) {
PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
}
std::unordered_set<Node *> visited;
std::unordered_set<Node *> to_visit{source.begin(), source.end()};
std::vector<Node *> inlink_visited;
while (!to_visit.empty()) {
std::vector<Node *> queue(to_visit.begin(), to_visit.end());
for (auto *p : queue) {
inlink_visited.clear();
std::copy_if(p->inputs.begin(), p->inputs.end(),
std::back_inserter(inlink_visited),
[&](Node *x) -> bool { return visited.count(x) != 0; });
if (inlink_visited.size() == p->inputs.size()) {
sorted_.push_back(p);
for (auto *_ : p->outputs) {
if (!visited.count(_)) {
to_visit.insert(_);
}
}
to_visit.erase(p);
visited.insert(p);
}
}
}
}
NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
: sorted_(other.sorted_), cursor_(other.cursor_) {}
Node &NodesTSIterator::operator*() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return *sorted_[cursor_];
}
NodesTSIterator &NodesTSIterator::operator++() {
if (++cursor_ >= sorted_.size()) {
sorted_.clear();
cursor_ = 0;
}
return *this;
}
NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
cursor_ = other.cursor_;
sorted_ = other.sorted_;
return *this;
}
bool NodesTSIterator::operator==(const NodesTSIterator &other) {
return sorted_ == other.sorted_ && cursor_ == other.cursor_;
}
Node *NodesTSIterator::operator->() {
PADDLE_ENFORCE_LT(cursor_, sorted_.size());
return sorted_[cursor_];
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -62,6 +62,32 @@ struct NodesDFSIterator ...@@ -62,6 +62,32 @@ struct NodesDFSIterator
std::unordered_set<Node *> visited_; std::unordered_set<Node *> visited_;
}; };
// Topological sorting iterator on nodes.
struct NodesTSIterator
: public std::iterator<std::forward_iterator_tag, Node *> {
NodesTSIterator() = default;
NodesTSIterator(const std::vector<Node *> &source);
NodesTSIterator(NodesTSIterator &&other)
: sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
other.cursor_ = 0;
}
NodesTSIterator(const NodesTSIterator &other);
Node &operator*();
NodesTSIterator &operator++();
// TODO(Superjomn) current implementation just compare the first
// element, need to compare the graph and all the elements in the queue and
// set.
NodesTSIterator &operator=(const NodesTSIterator &other);
bool operator==(const NodesTSIterator &other);
bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
Node *operator->();
private:
std::vector<Node *> sorted_;
size_t cursor_{0};
};
/* /*
* GraphTraits contains some graph traversal algorithms. * GraphTraits contains some graph traversal algorithms.
* *
...@@ -76,6 +102,14 @@ struct GraphTraits { ...@@ -76,6 +102,14 @@ struct GraphTraits {
NodesDFSIterator()); NodesDFSIterator());
} }
static iterator_range<NodesTSIterator> TS(const Graph &g) {
auto start_points = ExtractStartPoints(g);
PADDLE_ENFORCE(!start_points.empty());
NodesTSIterator x(start_points);
return iterator_range<NodesTSIterator>(NodesTSIterator(start_points),
NodesTSIterator());
}
private: private:
// The nodes those have no input will be treated as start points. // The nodes those have no input will be treated as start points.
static std::vector<Node *> ExtractStartPoints(const Graph &g) { static std::vector<Node *> ExtractStartPoints(const Graph &g) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include <string>
#include <utility>
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it "
"for activations and pooling.";
auto op_list = {"pool2d", "sigmoid", "logsigmoid",
"softshrink", "exp", "brelu",
"pow", "leaky_relu", "stanh",
"relu", "tanh", "tanh_shrink",
"sqrt", "abs", "ceil",
"elu", "floor", "cos",
"sin", "round", "reciprocal",
"hard_shrink", "hard_sigmoid", "relu6",
"soft_relu", "swish", "thresholded_relu",
"log", "square", "softplus",
"softsign"};
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
auto* op = n->Op();
if (op->HasAttr("is_test")) {
op->SetAttr("is_test", true);
} else if (std::find(begin(op_list), end(op_list), op->Type()) !=
end(op_list)) {
op->MutableAttrMap()->insert(
std::pair<std::string, Attribute>("is_test", true));
}
}
}
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(is_test_pass, paddle::framework::ir::IsTestPass);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/inference/analysis/graph_traits.h" #pragma once
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class IsTestPass : public Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/is_test_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
enum class ISTEST_STATE { FALSE, TRUE, UNSET };
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, bool use_mkldnn = false,
ISTEST_STATE is_test = ISTEST_STATE::UNSET) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
op->SetAttr("name", name);
op->SetInput("X", inputs);
op->SetOutput("Out", outputs);
op->SetAttr("use_mkldnn", use_mkldnn);
if (is_test == ISTEST_STATE::UNSET)
op->MutableAttrMap()->erase("is_test");
else if (is_test == ISTEST_STATE::FALSE)
op->SetAttr("is_test", false);
else
op->SetAttr("is_test", true);
}
// a->pool2d->b
// b->relu->c
// c,weights1)->conv2d->d
//
// d->pool2d->e
// e->hard_sigmoid->f
// (f,weights2)->conv2d->g
//
// g->pool2d->h
// h->tanh->i
// (i,weights3)->conv2d->j
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v :
std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h", "i",
"j", "weights1", "weights2", "weights3"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "weights1" || v == "weights2" || v == "weights3") {
var->SetPersistable(true);
}
}
SetOp(&prog, "pool2d", "pooling1", std::vector<std::string>({"a"}),
std::vector<std::string>({"b"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "relu", "activation1", std::vector<std::string>({"b"}),
std::vector<std::string>({"c"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "conv2d", "conv1", std::vector<std::string>({"c", "weights1"}),
std::vector<std::string>({"d"}), true, ISTEST_STATE::TRUE);
SetOp(&prog, "pool2d", "pooling2", std::vector<std::string>({"d"}),
std::vector<std::string>({"e"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "hard_sigmoid", "activation2", std::vector<std::string>({"e"}),
std::vector<std::string>({"f"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "conv2d", "conv2", std::vector<std::string>({"f", "weights2"}),
std::vector<std::string>({"g"}), false, ISTEST_STATE::FALSE);
SetOp(&prog, "pool2d", "pooling3", std::vector<std::string>({"g"}),
std::vector<std::string>({"h"}), false, ISTEST_STATE::UNSET);
SetOp(&prog, "tanh", "activation3", std::vector<std::string>({"h"}),
std::vector<std::string>({"i"}), true, ISTEST_STATE::UNSET);
SetOp(&prog, "conv2d", "conv3", std::vector<std::string>({"i", "weights3"}),
std::vector<std::string>({"j"}), false, ISTEST_STATE::UNSET);
return prog;
}
TEST(IsTestPass, basic) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("is_test_pass");
graph = pass->Apply(std::move(graph));
for (auto* node : graph->Nodes()) {
if (node->IsOp()) {
auto* op = node->Op();
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "conv3") {
ASSERT_FALSE(op->HasAttr("is_test"));
} else {
ASSERT_TRUE(op->HasAttr("is_test"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
}
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(is_test_pass);
...@@ -17,8 +17,12 @@ limitations under the License. */ ...@@ -17,8 +17,12 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
// msvc15 don't support constexpr in correct way.
#if !defined(_WIN32)
constexpr char Node::kControlDepVarName[]; constexpr char Node::kControlDepVarName[];
int Node::count_ = 0; #else
const char Node::kControlDepVarName[] = "__control_var";
#endif
std::unique_ptr<Node> CreateNodeForTest(const std::string& name, std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
Node::Type type) { Node::Type type) {
......
...@@ -55,7 +55,11 @@ class Node { ...@@ -55,7 +55,11 @@ class Node {
} }
enum class Type { kOperation, kVariable }; enum class Type { kOperation, kVariable };
#if !defined(_WIN32) // msvc not support constexpr correctly.
static constexpr char kControlDepVarName[] = "__control_var"; static constexpr char kControlDepVarName[] = "__control_var";
#else
static const char kControlDepVarName[];
#endif
Type NodeType() const { return type_; } Type NodeType() const { return type_; }
...@@ -115,37 +119,30 @@ class Node { ...@@ -115,37 +119,30 @@ class Node {
int id_; int id_;
private: private:
// ID can only set by a Graph.
void SetId(int id) { id_ = id; }
friend class Graph; friend class Graph;
friend std::unique_ptr<Node> CreateNodeForTest(const std::string& name, friend std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
Node::Type type); Node::Type type);
explicit Node(const std::string& name, Type type) explicit Node(const std::string& name, Type type)
: name_(name), : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}
var_desc_(nullptr),
op_desc_(nullptr),
type_(type),
id_(count_++) {}
explicit Node(VarDesc* var_desc) explicit Node(VarDesc* var_desc)
: name_(var_desc->Name()), : name_(var_desc->Name()),
var_desc_(new VarDesc(*var_desc)), var_desc_(new VarDesc(*var_desc)),
op_desc_(nullptr), op_desc_(nullptr),
type_(Type::kVariable), type_(Type::kVariable) {}
id_(count_++) {}
explicit Node(OpDesc* op_desc) explicit Node(OpDesc* op_desc)
: name_(op_desc->Type()), : name_(op_desc->Type()),
var_desc_(nullptr), var_desc_(nullptr),
op_desc_(new OpDesc(*op_desc, op_desc->Block())), op_desc_(new OpDesc(*op_desc, op_desc->Block())),
type_(Type::kOperation), type_(Type::kOperation) {}
id_(count_++) {}
Node() = delete; Node() = delete;
static int count_;
// Please don't use this API or make this public.
static void ResetId() { count_ = 0; }
boost::any wrapper_; boost::any wrapper_;
std::function<void(void)> wrapper_deleter_; std::function<void(void)> wrapper_deleter_;
std::type_index wrapper_type_ = std::type_index(typeid(void)); std::type_index wrapper_type_ = std::type_index(typeid(void));
......
...@@ -93,6 +93,7 @@ class Pass { ...@@ -93,6 +93,7 @@ class Pass {
protected: protected:
virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const { virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
LOG(FATAL) << "Calling virtual Pass not implemented."; LOG(FATAL) << "Calling virtual Pass not implemented.";
return graph;
} }
private: private:
...@@ -196,26 +197,26 @@ struct PassRegistrar : public Registrar { ...@@ -196,26 +197,26 @@ struct PassRegistrar : public Registrar {
msg) msg)
// Register a new pass that can be applied on the IR. // Register a new pass that can be applied on the IR.
#define REGISTER_PASS(pass_type, pass_class) \ #define REGISTER_PASS(pass_type, pass_class) \
STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \
__reg_pass__##pass_type, \ __reg_pass__##pass_type, \
"REGISTER_PASS must be called in global namespace"); \ "REGISTER_PASS must be called in global namespace"); \
static ::paddle::framework::ir::PassRegistrar<pass_class> \ static ::paddle::framework::ir::PassRegistrar<pass_class> \
__pass_registrar_##pass_type##__(#pass_type); \ __pass_registrar_##pass_type##__(#pass_type); \
int TouchPassRegistrar_##pass_type() { \ int TouchPassRegistrar_##pass_type() { \
__pass_registrar_##pass_type##__.Touch(); \ __pass_registrar_##pass_type##__.Touch(); \
return 0; \ return 0; \
} \ } \
static ::paddle::framework::ir::PassRegistrar<pass_class> \ static ::paddle::framework::ir::PassRegistrar<pass_class> \
&__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \ &__pass_tmp_registrar_##pass_type##__ UNUSED = \
__pass_registrar_##pass_type##__ __pass_registrar_##pass_type##__
#define USE_PASS(pass_type) \ #define USE_PASS(pass_type) \
STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \
__use_pass_itself_##pass_type, \ __use_pass_itself_##pass_type, \
"USE_PASS must be called in global namespace"); \ "USE_PASS must be called in global namespace"); \
extern int TouchPassRegistrar_##pass_type(); \ extern int TouchPassRegistrar_##pass_type(); \
static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \ static int use_pass_itself_##pass_type##_ UNUSED = \
TouchPassRegistrar_##pass_type() TouchPassRegistrar_##pass_type()
} // namespace ir } // namespace ir
......
...@@ -111,9 +111,6 @@ class LoDTensor : public Tensor { ...@@ -111,9 +111,6 @@ class LoDTensor : public Tensor {
public: public:
LoDTensor() : Tensor() {} LoDTensor() : Tensor() {}
/* Constructor with place should only be used in pybind */
explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
explicit LoDTensor(const LoD& lod) : lod_(lod) {} explicit LoDTensor(const LoD& lod) : lod_(lod) {}
void set_lod(const LoD& lod) { lod_ = lod; } void set_lod(const LoD& lod) { lod_ = lod; }
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "paddle/fluid/framework/details/cow_ptr.h" #include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "glog/logging.h" #include "glog/logging.h"
...@@ -31,46 +32,6 @@ namespace paddle { ...@@ -31,46 +32,6 @@ namespace paddle {
namespace framework { namespace framework {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
namespace details {
struct CUDABuffer {
void *data_{nullptr};
size_t size_{0};
platform::CUDAPlace place_;
CUDABuffer() {}
CUDABuffer(platform::Place place, size_t size)
: size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
data_ = memory::Alloc(place_, size);
}
~CUDABuffer() { ClearMemory(); }
CUDABuffer(const CUDABuffer &o) = delete;
CUDABuffer &operator=(const CUDABuffer &o) = delete;
void Resize(platform::Place place, size_t size) {
ClearMemory();
place_ = boost::get<platform::CUDAPlace>(place);
data_ = memory::Alloc(place_, size);
PADDLE_ENFORCE_NOT_NULL(data_);
size_ = size;
}
void Swap(CUDABuffer &o) {
std::swap(data_, o.data_);
std::swap(place_, o.place_);
std::swap(size_, o.size_);
}
private:
void ClearMemory() const {
if (data_ != nullptr) {
memory::Free(place_, data_);
}
}
};
} // namespace details
// Vector<T> implements the std::vector interface, and can get Data or // Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside. // MutableData from any place. The data will be synced implicitly inside.
template <typename T> template <typename T>
...@@ -103,8 +64,6 @@ class Vector { ...@@ -103,8 +64,6 @@ class Vector {
o.ImmutableCPU(); o.ImmutableCPU();
cpu_ = o.cpu_; cpu_ = o.cpu_;
flag_ = kDataInCPU; flag_ = kDataInCPU;
details::CUDABuffer null;
gpu_.Swap(null);
return *this; return *this;
} }
...@@ -199,7 +158,7 @@ class Vector { ...@@ -199,7 +158,7 @@ class Vector {
PADDLE_ENFORCE(platform::is_gpu_place(place), PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place"); "CUDA Data must on CUDA place");
ImmutableCUDA(place); ImmutableCUDA(place);
return reinterpret_cast<T *>(gpu_.data_); return reinterpret_cast<T *>(gpu_->ptr());
} }
// get cuda ptr. mutable // get cuda ptr. mutable
...@@ -234,13 +193,11 @@ class Vector { ...@@ -234,13 +193,11 @@ class Vector {
std::mutex &Mutex() const { return mtx_; } std::mutex &Mutex() const { return mtx_; }
std::unique_ptr<platform::CUDAPlace> CUDAPlace() const { boost::optional<platform::CUDAPlace> CUDAPlace() const {
if (gpu_.data_ == nullptr) { return gpu_ == nullptr
return nullptr; ? boost::none
} else { : boost::optional<platform::CUDAPlace>(
return std::unique_ptr<platform::CUDAPlace>( boost::get<platform::CUDAPlace>(gpu_->place()));
new platform::CUDAPlace(gpu_.place_));
}
} }
private: private:
...@@ -254,13 +211,12 @@ class Vector { ...@@ -254,13 +211,12 @@ class Vector {
void CopyToCPU() const { void CopyToCPU() const {
// COPY GPU Data To CPU // COPY GPU Data To CPU
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(gpu_->place()));
platform::Place(gpu_.place_)));
auto stream = dev_ctx->stream(); auto stream = dev_ctx->stream();
void *src = gpu_.data_; void *src = gpu_->ptr();
void *dst = cpu_.data(); void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_, memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
stream); gpu_->size(), stream);
dev_ctx->Wait(); dev_ctx->Wait();
} }
...@@ -277,8 +233,7 @@ class Vector { ...@@ -277,8 +233,7 @@ class Vector {
CopyCPUDataToCUDA(place); CopyCPUDataToCUDA(place);
UnsetFlag(kDirty); UnsetFlag(kDirty);
SetFlag(kDataInCUDA); SetFlag(kDataInCUDA);
} else if (IsInCUDA() && } else if (IsInCUDA() && !(place == gpu_->place())) {
!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
PADDLE_THROW("This situation should not happen"); PADDLE_THROW("This situation should not happen");
// Still dirty // Still dirty
} else { } else {
...@@ -290,7 +245,7 @@ class Vector { ...@@ -290,7 +245,7 @@ class Vector {
// Even data is not dirty. However, data is not in CUDA. Copy data. // Even data is not dirty. However, data is not in CUDA. Copy data.
CopyCPUDataToCUDA(place); CopyCPUDataToCUDA(place);
SetFlag(kDataInCUDA); SetFlag(kDataInCUDA);
} else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) { } else if (!(place == gpu_->place())) {
PADDLE_THROW("This situation should not happen."); PADDLE_THROW("This situation should not happen.");
} else { } else {
// Not Dirty && DataInCUDA && Device is same // Not Dirty && DataInCUDA && Device is same
...@@ -301,13 +256,13 @@ class Vector { ...@@ -301,13 +256,13 @@ class Vector {
void CopyCPUDataToCUDA(const platform::Place &place) const { void CopyCPUDataToCUDA(const platform::Place &place) const {
void *src = cpu_.data(); void *src = cpu_.data();
gpu_.Resize(place, cpu_.size() * sizeof(T)); gpu_ = memory::Alloc(place, cpu_.size() * sizeof(T));
void *dst = gpu_.data_; void *dst = gpu_->ptr();
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream(); auto stream = dev_ctx->stream();
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_, memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
stream); gpu_->size(), stream);
} }
void ImmutableCPU() const { void ImmutableCPU() const {
...@@ -329,7 +284,7 @@ class Vector { ...@@ -329,7 +284,7 @@ class Vector {
bool IsInCPU() const { return flag_ & kDataInCPU; } bool IsInCPU() const { return flag_ & kDataInCPU; }
mutable std::vector<T> cpu_; mutable std::vector<T> cpu_;
mutable details::CUDABuffer gpu_; mutable memory::AllocationPtr gpu_;
mutable int flag_; mutable int flag_;
mutable std::mutex mtx_; mutable std::mutex mtx_;
...@@ -428,8 +383,8 @@ class Vector { ...@@ -428,8 +383,8 @@ class Vector {
auto &mtx = m_.Data().Mutex(); auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx); std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace(); auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr || if (cuda_place == boost::none ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) { cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.Data().CUDAData(place); return m_.Data().CUDAData(place);
} }
} }
...@@ -444,8 +399,8 @@ class Vector { ...@@ -444,8 +399,8 @@ class Vector {
auto &mtx = m_.Data().Mutex(); auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx); std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace(); auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr || if (cuda_place == boost::none ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) { cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.MutableData()->CUDAMutableData(place); return m_.MutableData()->CUDAMutableData(place);
} }
} }
......
...@@ -57,60 +57,68 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { ...@@ -57,60 +57,68 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
} }
} }
void NaiveExecutor::Prepare(Scope *parent_scope, void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
const ProgramDesc &program_desc, int block_id, int block_id, bool with_feed_fetch_ops) {
bool with_feed_fetch_ops) { if (!scope) {
if (!parent_scope) {
scope_ = new framework::Scope; scope_ = new framework::Scope;
} else { } else {
scope_ = &parent_scope->NewScope(); scope_ = scope;
} }
CreateVariables(program_desc, scope_, block_id);
VLOG(3) << "NaiveExecutor init with scope " << scope;
CreateOps(program_desc, block_id, with_feed_fetch_ops); CreateOps(program_desc, block_id, with_feed_fetch_ops);
} }
void NaiveExecutor::Run() { void NaiveExecutor::Run() {
#ifndef PADDLE_ON_INFERENCE
LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the "
"cmake flag ON_INFER is not set.";
LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and "
"variables will be reused to save the allocation "
"overhead.";
LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by "
"setting the cmake flag ON_INFER=ON if you are "
"running Paddle Inference";
#endif // PADDLE_ON_INFERENCE
for (auto &op : ops_) { for (auto &op : ops_) {
VLOG(40) << "run " << op->Type(); VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
<< " on scope " << scope_;
op->Run(*scope_, place_); op->Run(*scope_, place_);
} }
} }
void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope, void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
int block_id) { bool persistable, Scope *scope) {
PADDLE_ENFORCE(scope); PADDLE_ENFORCE_NOT_NULL(scope);
auto &global_block = desc.Block(block_id); auto &global_block = desc.Block(block_id);
const Scope *ancestor_scope = scope; const auto *anc = scope;
while (ancestor_scope->parent()) { PADDLE_ENFORCE(anc->parent() != anc);
ancestor_scope = ancestor_scope->parent(); while (anc->parent()) {
anc = anc->parent();
} }
if (ancestor_scope != scope) { for (auto &var : global_block.AllVars()) {
for (auto &var : global_block.AllVars()) { if (var->Name() == framework::kEmptyVarName) {
if (var->Name() == framework::kEmptyVarName) { continue;
continue; }
}
// Create persistable vars in ancestor scope. if (persistable == var->Persistable()) {
if (var->Persistable()) { if (persistable) {
auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name()); if (!anc->FindVar(var->Name())) {
InitializeVariable(ptr, var->GetType()); auto *ptr = const_cast<Scope *>(anc)->Var(var->Name());
VLOG(30) << "Create Variable " << var->Name() VLOG(3) << scope << " Create persistable variable " << var->Name()
<< " global, which pointer is " << ptr; << ", which pointer is " << ptr;
} else { // Create temporary variables in local scope. InitializeVariable(ptr, var->GetType());
auto *ptr = scope->Var(var->Name()); }
} else {
auto *ptr = const_cast<Scope *>(scope)->Var(var->Name());
VLOG(3) << scope << " Create variable " << var->Name()
<< ", which pointer is " << ptr;
InitializeVariable(ptr, var->GetType()); InitializeVariable(ptr, var->GetType());
VLOG(30) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
} }
} }
} else {
for (auto &var : global_block.AllVars()) {
auto *ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(30) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
} }
} }
......
...@@ -35,8 +35,14 @@ class NaiveExecutor { ...@@ -35,8 +35,14 @@ class NaiveExecutor {
// Create child scope. // Create child scope.
// Create variables. // Create variables.
// @with_feed_fetch_ops: whether to work with the feed and fetch operators. // @with_feed_fetch_ops: whether to work with the feed and fetch operators.
void Prepare(Scope* parent_scope, const ProgramDesc& program_desc, void Prepare(Scope* scope, const ProgramDesc& program_desc, int block_id,
int block_id, bool with_feed_fetch_ops); bool with_feed_fetch_ops);
// Create variables before head.
// Create parameters if persistable is ture, or create the temporary variables
// instead.
void CreateVariables(const ProgramDesc& desc, int block_id, bool persistable,
Scope* scope);
// Run all the operators. // Run all the operators.
void Run(); void Run();
...@@ -49,8 +55,6 @@ class NaiveExecutor { ...@@ -49,8 +55,6 @@ class NaiveExecutor {
void CleanFeedFetchOps(); void CleanFeedFetchOps();
protected: protected:
void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);
void CreateOps(const ProgramDesc& desc, int block_id, void CreateOps(const ProgramDesc& desc, int block_id,
bool with_feed_fetch_ops); bool with_feed_fetch_ops);
......
...@@ -39,7 +39,7 @@ TEST(NaiveExecutor, Basic) { ...@@ -39,7 +39,7 @@ TEST(NaiveExecutor, Basic) {
auto place = platform::CPUPlace(); auto place = platform::CPUPlace();
NaiveExecutor exe(place); NaiveExecutor exe(place);
exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/); exe.Prepare(nullptr, program, 0, false);
auto* a_tensor = exe.FindTensor("a"); auto* a_tensor = exe.FindTensor("a");
auto* b_tensor = exe.FindTensor("b"); auto* b_tensor = exe.FindTensor("b");
auto* c_tensor = exe.FindTensor("c"); auto* c_tensor = exe.FindTensor("c");
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <functional>
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
std::map<std::string,
std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {};
void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
auto& op_type = op->Type();
NG_NODE_MAP[op_type](op, ngb_node_map);
}
} // namespace framework
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
class NgraphBridge {
public:
static std::map<
std::string,
std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NG_NODE_MAP;
explicit NgraphBridge(
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map)
: ngb_node_map(var_node_map) {}
void build_graph(const std::shared_ptr<OperatorBase>& op);
private:
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map;
};
} // namespace framework
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <glog/logging.h>
#include <algorithm>
#include <map>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h"
namespace paddle {
namespace framework {
static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
{proto::VarType::FP32, ngraph::element::f32},
{proto::VarType::FP64, ngraph::element::f64},
{proto::VarType::INT32, ngraph::element::i32},
{proto::VarType::INT64, ngraph::element::i64},
{proto::VarType::BOOL, ngraph::element::boolean},
};
typedef enum { /* nGraph support state on ops */
FULL_TRAIN, /* Support full ops for train */
PARTIAL_TRAIN, /* Support partial ops for train */
FULL_TEST, /* Support full list of ops for test */
PARTIAL_TEST /* Support partial list of ops for test */
} op_state;
class NgraphOperator {
public:
explicit NgraphOperator(const Scope& scope, const platform::Place& place,
const std::vector<std::shared_ptr<OperatorBase>>& ops,
const std::unordered_map<
std::string, ngraph::element::Type>& var_type_map,
const std::unordered_set<std::string>& persist,
const std::unordered_set<std::string>& fetches,
const std::unordered_set<std::string>& post_op_inputs,
op_state ng_op_state)
: scope_(scope),
place_(place),
fused_ops_(ops),
var_type_map_(var_type_map),
persistables_(persist),
fetches_(fetches),
post_op_inputs_(post_op_inputs),
ng_op_state_(ng_op_state) {}
void Run(const Scope& scope, const platform::Place& place) const;
private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache;
const Scope& scope_;
const platform::Place& place_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
op_state ng_op_state_;
};
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
FusedOperator::FusedOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) {
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
intervals;
if (ops->empty()) {
return intervals;
}
size_t size = ops->size();
size_t left = 0;
while (left < size && ops.at(left)->Type() != kFeedOpType) {
++left;
}
if (left == size) {
return intervals;
}
while (left < size && ops->at(left)->Type() == kFeedOpType) {
++left;
}
size_t right = left;
while (right < size && ops->at(right)->Type() != kFetchOpType) {
++right;
}
if (right == size) {
return intervals;
}
if (left >= right) return intervals;
// (left, right - 1) represents indices between feed and fetch
size_t pivot = left;
while (pivot < right) {
auto op_type = ops->at(pivot)->Type();
if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) ==
paddle::framework::NgraphBridge::NG_NODE_MAP.end()) {
++pivot;
} else {
size_t start = pivot, end = start;
while (pivot < right &&
(paddle::framework::NgraphBridge::NG_NODE_MAP.find(
ops.at(pivot)->Type()) !=
paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
++pivot;
++end;
}
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>
interval = {ops->begin() + start, ops->begin() + end};
intervals.push_back(interval);
}
} // end while
return intervals;
}
FusedOperator::FusedOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) {
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
it != end; ++it) {
fused_ops_.push_back(std::move(*it));
}
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = end;
(*it)->Type() != kFetchOpType; ++it) {
for (auto& var_name_item : (*it)->Inputs()) {
for (auto& var_name : var_name_item.second) {
post_op_inputs_.insert(var_name);
}
}
}
if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
is_complete = true;
}
Process();
}
void FusedOperator::Process() {
auto& bdesc = pdesc_.Block(block_);
for (auto& var : bdesc.AllVars()) {
if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
var->GetType() == proto::VarType::LOD_TENSOR ||
var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) {
continue;
}
auto var_name = var->Name();
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var_name != "fetch" && var_name != "feed") {
auto pd_type = var->GetDataType();
if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) {
PADDLE_THROW("Data type of var %s not found in pd2ng_type_map",
var_name);
}
var_type_map_[var_name] = pd2ng_type_map[pd_type];
}
if (var->Persistable()) {
persistables_.insert(var->Name());
}
}
for (auto* op : bdesc.AllOps()) {
if (op->Type() == kFetchOpType) {
std::string fetch_target_name = op->Input("X")[0];
fetches_.insert(fetch_target_name);
}
}
}
void FusedOperator::RunImpl(const Scope& scope,
const platform::Place& place) const {
op_state ng_op_state = PARTIAL_TEST;
auto& bdesc = pdesc_.Block(block_);
for (auto* op : bdesc.AllOps()) {
if (op->Type().find("_grad") != std::string::npos) {
ng_op_state = PARTIAL_TRAIN;
break;
}
}
if (is_full) {
ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
}
NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_,
persistables_, fetches_, post_op_inputs_,
ng_op_state);
ngraph_op.Run(scope, place);
}
} // namespace framework
} // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <atomic>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/variant.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
class FusedOperator : public OperatorBase {
public:
static std::vector<
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
FusedOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
explicit FusedOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type = "fused_op", const VariableNameMap& inputs = {},
const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
void RunImpl(const Scope& scope, const platform::Place& place) const final;
private:
const ProgramDesc pdesc_;
size_t block_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
bool is_full_ = false;
void Process();
};
} // namespace framework
} // namespace paddle
#endif
...@@ -63,6 +63,8 @@ struct OpKernelType { ...@@ -63,6 +63,8 @@ struct OpKernelType {
place_(dev_ctx.GetPlace()), place_(dev_ctx.GetPlace()),
library_type_(library_type) {} library_type_(library_type) {}
size_t hash_key() const { return Hash()(*this); }
bool operator==(const OpKernelType& o) const { bool operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) && return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
......
...@@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false, ...@@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// Combine two hash values to a single hash.
inline size_t CombineHash(size_t seed, size_t a) {
return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = { std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
...@@ -150,14 +155,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -150,14 +155,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#endif #endif
} }
// The profile has a process-wide mutex, results in serious performance issue // The profile has a process-wide mutex, results in serious performance issue
// in concurrency scenerio. Here use an `if` to fix this issue. // in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern. // Please not remove the `if`, ask @Superjomn if there are any concern.
#ifndef _WIN32
if (platform::IsProfileEnabled()) { if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place)); platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place); RunImpl(scope, place);
} else { } else
#endif
{
RunImpl(scope, place); RunImpl(scope, place);
} }
VLOG(30) << place << " " << DebugStringEx(&scope); VLOG(30) << place << " " << DebugStringEx(&scope);
...@@ -791,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack( ...@@ -791,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack(
Scope* OperatorWithKernel::TryTransferData( Scope* OperatorWithKernel::TryTransferData(
const Scope& scope, const OpKernelType& expected_kernel_key, const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars) const { std::vector<std::string>* transfered_inplace_vars) const {
// In the inference scenerio, the scopes will be reused across the batches, so
// the `new_scope` here will result in GPU memroy explosion over the running of
// operators.
// We use a thread_local cache to fix that issue, the key in the cache is the
// combination of the `scope` argument, from_kernel_type, target_kernel_type.
// Have a discussion with @Superjomn or the inference developers if some changes
// on this logic for this macro might not tested on the other scenerios.
#ifdef PADDLE_ON_INFERENCE
thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
#endif
Scope* new_scope = nullptr; Scope* new_scope = nullptr;
for (auto& var_name_item : Inputs()) { for (auto& var_name_item : Inputs()) {
for (auto& var_name : var_name_item.second) { for (auto& var_name : var_name_item.second) {
...@@ -821,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData( ...@@ -821,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData(
VLOG(30) << "Transform Variable " << var_name << " from " VLOG(30) << "Transform Variable " << var_name << " from "
<< kernel_type_for_var << " to " << expected_kernel_key; << kernel_type_for_var << " to " << expected_kernel_key;
#ifdef PADDLE_ON_INFERENCE
size_t infer_cache_key =
CombineHash(OpKernelType::Hash()(kernel_type_for_var),
OpKernelType::Hash()(expected_kernel_key));
infer_cache_key =
CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
auto it = infer_transfer_scope_cache.find(infer_cache_key);
if (it != infer_transfer_scope_cache.end()) {
new_scope = infer_transfer_scope_cache[infer_cache_key];
} else {
new_scope = &scope.NewScope();
infer_transfer_scope_cache[infer_cache_key] = new_scope;
}
#endif
if (new_scope == nullptr) { if (new_scope == nullptr) {
new_scope = &scope.NewScope(); new_scope = &scope.NewScope();
} }
auto* trans_var = new_scope->Var(var_name); auto* trans_var = new_scope->Var(var_name);
Tensor out; Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
SetTensorToVariable(*var, out, trans_var); SetTensorToVariable(*var, out, trans_var);
......
...@@ -171,8 +171,17 @@ ParallelExecutor::ParallelExecutor( ...@@ -171,8 +171,17 @@ ParallelExecutor::ParallelExecutor(
} }
// If the loss_var_name is given, the number of graph should be only one. // If the loss_var_name is given, the number of graph should be only one.
if (loss_var_name.size()) { if (loss_var_name.size()) {
PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, size_t graph_num = ir::GraphNum(*graph);
"The number of graph should be only one"); if (graph_num > 1) {
LOG(WARNING)
<< "The number of graph should be only one, "
"but the current graph has "
<< ir::GraphNum(*graph)
<< " sub_graphs. If you want to see the nodes of the "
"sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
"to specify the output dir. NOTES: if you not do training, "
"please don't pass loss_var_name.";
}
} }
if (exec_strategy.type_ == ExecutionStrategy::kDefault) { if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
......
...@@ -15,7 +15,9 @@ limitations under the License. */ ...@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include <memory> // for unique_ptr #include <memory> // for unique_ptr
#include <queue>
#include <set> #include <set>
#include <unordered_set>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -36,6 +38,16 @@ DEFINE_double( ...@@ -36,6 +38,16 @@ DEFINE_double(
"Memory size threshold (GB) when the garbage collector clear tensors." "Memory size threshold (GB) when the garbage collector clear tensors."
"Disabled when this value is less than 0"); "Disabled when this value is less than 0");
// When in inference scenario, the scopes will not be written by two threads in
// a mean time, but a scope may be read by multiple threads concurrently, and
// the mutex will cause serious performance issue.
// So the mutex is disabled when `ON_INFER`.
#ifdef PADDLE_ON_INFERENCE
#define SCOPE_LOCK_GUARD
#else
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -49,18 +61,18 @@ int64_t GetEagerDeletionThreshold() { ...@@ -49,18 +61,18 @@ int64_t GetEagerDeletionThreshold() {
Scope::~Scope() { DropKids(); } Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const { Scope& Scope::NewScope() const {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
kids_.push_back(new Scope(this)); kids_.push_back(new Scope(this));
return *kids_.back(); return *kids_.back();
} }
Variable* Scope::Var(const std::string& name) { Variable* Scope::Var(const std::string& name) {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
return VarInternal(name); return VarInternal(name);
} }
Variable* Scope::Var(std::string* name) { Variable* Scope::Var(std::string* name) {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
if (name != nullptr) { if (name != nullptr) {
*name = new_name; *name = new_name;
...@@ -69,34 +81,34 @@ Variable* Scope::Var(std::string* name) { ...@@ -69,34 +81,34 @@ Variable* Scope::Var(std::string* name) {
} }
Variable* Scope::FindVar(const std::string& name) const { Variable* Scope::FindVar(const std::string& name) const {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
return FindVarInternal(name); return FindVarInternal(name);
} }
Variable* Scope::FindLocalVar(const std::string& name) const { Variable* Scope::FindLocalVar(const std::string& name) const {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
return FindVarLocally(name); return FindVarLocally(name);
} }
const Scope* Scope::FindScope(const Variable* var) const { const Scope* Scope::FindScope(const Variable* var) const {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
return FindScopeInternal(var); return FindScopeInternal(var);
} }
void Scope::DropKids() { void Scope::DropKids() {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
for (Scope* s : kids_) delete s; for (Scope* s : kids_) delete s;
kids_.clear(); kids_.clear();
} }
bool Scope::HasKid(const Scope* scope) const { bool Scope::HasKid(const Scope* scope) const {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end(); return it != this->kids_.end();
} }
std::vector<std::string> Scope::LocalVarNames() const { std::vector<std::string> Scope::LocalVarNames() const {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
std::vector<std::string> known_vars; std::vector<std::string> known_vars;
known_vars.reserve(this->vars_.size()); known_vars.reserve(this->vars_.size());
for (auto& p : vars_) { for (auto& p : vars_) {
...@@ -106,9 +118,10 @@ std::vector<std::string> Scope::LocalVarNames() const { ...@@ -106,9 +118,10 @@ std::vector<std::string> Scope::LocalVarNames() const {
} }
void Scope::DeleteScope(Scope* scope) const { void Scope::DeleteScope(Scope* scope) const {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
this, scope);
this->kids_.erase(it); this->kids_.erase(it);
// When making memory benchmark on Fluid, we have to delete scope sync. // When making memory benchmark on Fluid, we have to delete scope sync.
if (FLAGS_benchmark || FLAGS_eager_delete_scope) { if (FLAGS_benchmark || FLAGS_eager_delete_scope) {
...@@ -119,7 +132,7 @@ void Scope::DeleteScope(Scope* scope) const { ...@@ -119,7 +132,7 @@ void Scope::DeleteScope(Scope* scope) const {
} }
void Scope::EraseVars(const std::vector<std::string>& var_names) { void Scope::EraseVars(const std::vector<std::string>& var_names) {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
std::set<std::string> var_set(var_names.begin(), var_names.end()); std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) { for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) { if (var_set.find(it->first) != var_set.end()) {
...@@ -132,12 +145,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) { ...@@ -132,12 +145,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name, void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const { const std::string& new_name) const {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
} }
std::string Scope::Rename(const std::string& origin_name) const { std::string Scope::Rename(const std::string& origin_name) const {
std::lock_guard<std::mutex> lock(mutex_); SCOPE_LOCK_GUARD
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); auto new_name = string::Sprintf("%p.%d", this, vars_.size());
RenameInternal(origin_name, new_name); RenameInternal(origin_name, new_name);
return new_name; return new_name;
...@@ -189,5 +202,46 @@ Variable* Scope::FindVarLocally(const std::string& name) const { ...@@ -189,5 +202,46 @@ Variable* Scope::FindVarLocally(const std::string& name) const {
return nullptr; return nullptr;
} }
std::string GenScopeTreeDebugInfo(Scope* root) {
std::stringstream os;
if (!root) return "";
// level traversal
std::queue<Scope*> queue;
queue.push(root);
std::vector<Scope*> scopes;
while (!queue.empty()) {
auto* end = queue.back();
Scope* q = nullptr;
while (q != end) {
q = queue.front();
queue.pop();
os << q << " ";
scopes.push_back(q);
for (auto* c : q->kids()) {
queue.push(c);
}
}
// end of a level
os << "\n------------------------------------------\n";
}
os << "\nDetails:\n\n";
for (Scope* q : scopes) {
os << "====\n";
os << q << ":\n";
for (auto& var : q->LocalVarNames()) {
os << " - " << var << "\n";
}
}
return os.str();
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -78,11 +78,11 @@ class Scope { ...@@ -78,11 +78,11 @@ class Scope {
/// Drop all kids scopes belonged to this scope. /// Drop all kids scopes belonged to this scope.
void DropKids(); void DropKids();
std::list<Scope*>& kids() const { return kids_; }
/// Find if a scope exists in the kid scopes /// Find if a scope exists in the kid scopes
bool HasKid(const Scope* scope) const; bool HasKid(const Scope* scope) const;
const std::list<Scope*>& kids() const { return kids_; }
// enumerate all the variables current contains. // enumerate all the variables current contains.
std::vector<std::string> LocalVarNames() const; std::vector<std::string> LocalVarNames() const;
...@@ -118,12 +118,17 @@ class Scope { ...@@ -118,12 +118,17 @@ class Scope {
// Scope in `kids_` are owned by this class. // Scope in `kids_` are owned by this class.
mutable std::list<Scope*> kids_; mutable std::list<Scope*> kids_;
Scope const* parent_{nullptr}; const Scope* parent_{nullptr};
DISABLE_COPY_AND_ASSIGN(Scope); DISABLE_COPY_AND_ASSIGN(Scope);
private: private:
mutable std::mutex mutex_; mutable std::mutex mutex_;
}; };
// Generate some debug string about the inherience structure of scope, quite
// naive.
std::string GenScopeTreeDebugInfo(Scope*);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -63,6 +63,26 @@ struct TensorCopyVisitor { ...@@ -63,6 +63,26 @@ struct TensorCopyVisitor {
int64_t size_; int64_t size_;
}; };
struct TensorFillVisitor {
TensorFillVisitor(framework::Tensor* dst, int64_t dst_offset, int64_t size,
float value)
: dst_(dst), dst_offset_(dst_offset), size_(size) {}
template <typename T>
void apply() const {
// TODO(qiao): support other place
platform::CPUPlace cpu;
auto* tensor_data = dst_->mutable_data<T>(cpu);
auto* start = tensor_data + dst_offset_;
auto* end = start + size_;
std::fill(start, end, static_cast<T>(0.0));
}
framework::Tensor* dst_;
int64_t dst_offset_;
int64_t size_;
};
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx) { const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version { // the 1st field, uint32_t version
...@@ -120,7 +140,17 @@ bool SelectedRows::HasKey(int64_t key) const { ...@@ -120,7 +140,17 @@ bool SelectedRows::HasKey(int64_t key) const {
: true; : true;
} }
int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) { int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown,
bool is_test) {
if (is_test) {
auto iter = id_to_index_.find(key);
if (iter == id_to_index_.end()) {
return -1;
} else {
return iter->second;
}
}
rwlock_->RDLock(); rwlock_->RDLock();
auto iter = id_to_index_.find(key); auto iter = id_to_index_.find(key);
if (iter == id_to_index_.end()) { if (iter == id_to_index_.end()) {
...@@ -172,7 +202,7 @@ void SelectedRows::SyncIndex() { ...@@ -172,7 +202,7 @@ void SelectedRows::SyncIndex() {
} }
void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
bool auto_grown) { bool auto_grown, bool is_test) {
PADDLE_ENFORCE(value->IsInitialized(), PADDLE_ENFORCE(value->IsInitialized(),
"The value tensor should be initialized."); "The value tensor should be initialized.");
if (ids.numel() == 0) { if (ids.numel() == 0) {
...@@ -183,11 +213,19 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, ...@@ -183,11 +213,19 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
"output tensor should have the same shape with table " "output tensor should have the same shape with table "
"except the dims[0]."); "except the dims[0].");
for (int i = 0; i < ids.numel(); ++i) { for (int i = 0; i < ids.numel(); ++i) {
int64_t index = AutoGrownIndex(ids.data<int64_t>()[i], auto_grown); auto id = ids.data<int64_t>()[i];
framework::VisitDataType( int64_t index = AutoGrownIndex(id, auto_grown, is_test);
framework::ToDataType(value_->type()), if (index < 0) {
TensorCopyVisitor(value, i * value_width, *value_.get(), VLOG(5) << "id " << id << " not in the table, return 0";
index * value_width, value_width)); framework::VisitDataType(
framework::ToDataType(value_->type()),
TensorFillVisitor(value, i * value_width, value_width, 0.0));
} else {
framework::VisitDataType(
framework::ToDataType(value_->type()),
TensorCopyVisitor(value, i * value_width, *value_.get(),
index * value_width, value_width));
}
} }
} }
} }
......
...@@ -105,7 +105,7 @@ class SelectedRows { ...@@ -105,7 +105,7 @@ class SelectedRows {
* the value * the value
*/ */
void Get(const framework::Tensor& ids, framework::Tensor* value, void Get(const framework::Tensor& ids, framework::Tensor* value,
bool auto_grown = false); bool auto_grown = false, bool is_test = false);
/* /*
* @brief Get the index of the key from id_to_index_ map. If the key not * @brief Get the index of the key from id_to_index_ map. If the key not
...@@ -118,7 +118,7 @@ class SelectedRows { ...@@ -118,7 +118,7 @@ class SelectedRows {
* *
* @return index of the key. * @return index of the key.
*/ */
int64_t AutoGrownIndex(int64_t key, bool auto_grown); int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false);
void SyncIndex(); void SyncIndex();
......
...@@ -84,10 +84,14 @@ TEST(SelectedRows, SparseTable) { ...@@ -84,10 +84,14 @@ TEST(SelectedRows, SparseTable) {
data[i * embedding_width + j] = static_cast<float>(i); data[i * embedding_width + j] = static_cast<float>(i);
} }
} }
ASSERT_EQ(table.AutoGrownIndex(10, true), 0); ASSERT_EQ(table.AutoGrownIndex(10, true, false), 0);
ASSERT_EQ(table.AutoGrownIndex(8, true), 1); ASSERT_EQ(table.AutoGrownIndex(8, true, false), 1);
ASSERT_EQ(table.AutoGrownIndex(8, true), 1); ASSERT_EQ(table.AutoGrownIndex(8, true, false), 1);
ASSERT_EQ(table.AutoGrownIndex(6, true), 2); ASSERT_EQ(table.AutoGrownIndex(6, true, false), 2);
for (int64_t i = 11; i < 20; i++) {
ASSERT_EQ(table.AutoGrownIndex(i, true, true), -1);
ASSERT_TRUE(!table.HasKey(i));
}
ASSERT_TRUE(table.HasKey(10)); ASSERT_TRUE(table.HasKey(10));
ASSERT_TRUE(table.HasKey(8)); ASSERT_TRUE(table.HasKey(8));
ASSERT_TRUE(table.HasKey(6)); ASSERT_TRUE(table.HasKey(6));
......
...@@ -32,10 +32,9 @@ size_t Tensor::memory_size() const { ...@@ -32,10 +32,9 @@ size_t Tensor::memory_size() const {
} }
void* Tensor::mutable_data(platform::Place place, std::type_index type, void* Tensor::mutable_data(platform::Place place, std::type_index type,
memory::Allocator::Attr attr,
size_t requested_size) { size_t requested_size) {
if (holder_ != nullptr) { type_ = type;
holder_->set_type(type);
}
PADDLE_ENFORCE_GE(numel(), 0, PADDLE_ENFORCE_GE(numel(), 0,
"When calling this method, the Tensor's numel must be " "When calling this method, the Tensor's numel must be "
"equal or larger than zero. " "equal or larger than zero. "
...@@ -48,35 +47,18 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type, ...@@ -48,35 +47,18 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
/* some versions of boost::variant don't have operator!= */ /* some versions of boost::variant don't have operator!= */
if (holder_ == nullptr || !(holder_->place() == place) || if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) { holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) { holder_ = memory::AllocShared(place, size, attr);
holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW(
"CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
}
#else
if (platform::is_gpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type));
} else if (platform::is_cuda_pinned_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
boost::get<platform::CUDAPinnedPlace>(place), size, type));
}
}
#endif
offset_ = 0; offset_ = 0;
} }
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) + return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_); offset_);
} }
void* Tensor::mutable_data(platform::Place place, size_t requested_size) { void* Tensor::mutable_data(platform::Place place, memory::Allocator::Attr attr,
size_t requested_size) {
PADDLE_ENFORCE(this->holder_ != nullptr, PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing."); "Cannot invoke mutable data if current hold nothing.");
return mutable_data(place, holder_->type(), requested_size); return mutable_data(place, type_, attr, requested_size);
} }
Tensor& Tensor::ShareDataWith(const Tensor& src) { Tensor& Tensor::ShareDataWith(const Tensor& src) {
...@@ -101,6 +83,7 @@ Tensor Tensor::Slice(int begin_idx, int end_idx) const { ...@@ -101,6 +83,7 @@ Tensor Tensor::Slice(int begin_idx, int end_idx) const {
Tensor dst; Tensor dst;
dst.holder_ = holder_; dst.holder_ = holder_;
dst.set_layout(layout_); dst.set_layout(layout_);
dst.type_ = type_;
DDim dst_dims = dims_; DDim dst_dims = dims_;
dst_dims[0] = end_idx - begin_idx; dst_dims[0] = end_idx - begin_idx;
dst.Resize(dst_dims); dst.Resize(dst_dims);
......
...@@ -67,12 +67,7 @@ class Tensor { ...@@ -67,12 +67,7 @@ class Tensor {
friend struct EigenVector; friend struct EigenVector;
public: public:
Tensor() : offset_(0) {} Tensor() : type_(typeid(float)), offset_(0) {}
/*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
/*! Return a pointer to mutable memory block. */ /*! Return a pointer to mutable memory block. */
template <typename T> template <typename T>
...@@ -89,12 +84,17 @@ class Tensor { ...@@ -89,12 +84,17 @@ class Tensor {
* @note If not exist, then allocation. * @note If not exist, then allocation.
*/ */
template <typename T> template <typename T>
T* mutable_data(platform::Place place, size_t requested_size = 0); T* mutable_data(platform::Place place,
memory::Allocator::Attr attr = memory::Allocator::kDefault,
size_t requested_size = 0);
void* mutable_data(platform::Place place, std::type_index type, void* mutable_data(platform::Place place, std::type_index type,
memory::Allocator::Attr attr = memory::Allocator::kDefault,
size_t requested_size = 0); size_t requested_size = 0);
void* mutable_data(platform::Place place, size_t requested_size = 0); void* mutable_data(platform::Place place,
memory::Allocator::Attr attr = memory::Allocator::kDefault,
size_t requested_size = 0);
/** /**
* @brief Return a pointer to mutable memory block. * @brief Return a pointer to mutable memory block.
...@@ -106,7 +106,9 @@ class Tensor { ...@@ -106,7 +106,9 @@ class Tensor {
* @note If not exist, then allocation. * @note If not exist, then allocation.
*/ */
template <typename T> template <typename T>
T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0); T* mutable_data(DDim dims, platform::Place place,
memory::Allocator::Attr attr = memory::Allocator::kDefault,
size_t requested_size = 0);
/*! Return the dimensions of the memory block. */ /*! Return the dimensions of the memory block. */
const DDim& dims() const; const DDim& dims() const;
...@@ -139,7 +141,7 @@ class Tensor { ...@@ -139,7 +141,7 @@ class Tensor {
std::type_index type() const { std::type_index type() const {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
holder_, "Tensor not initialized yet when Tensor::type() is called."); holder_, "Tensor not initialized yet when Tensor::type() is called.");
return holder_->type(); return type_;
} }
// memory size returns the holding memory size in byte. // memory size returns the holding memory size in byte.
...@@ -153,56 +155,13 @@ class Tensor { ...@@ -153,56 +155,13 @@ class Tensor {
void clear() { holder_ = nullptr; } void clear() { holder_ = nullptr; }
private: const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
/** size_t offset() const { return offset_; }
* @note Placeholder hides type T, so it doesn't appear as a template
* parameter of Variable.
*/
struct Placeholder {
virtual ~Placeholder() = default;
virtual void* ptr() const = 0;
virtual size_t size() const = 0;
virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0;
virtual void set_place(platform::Place place) = 0;
};
template <typename Place>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place place, size_t size, std::type_index type)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
memory::PODDeleter<uint8_t, Place>(place)),
place_(place),
size_(size),
type_(type) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU"));
}
virtual size_t size() const { return size_; }
virtual platform::Place place() const { return place_; }
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
virtual void set_place(platform::Place place) { place_ = place; }
/*! the pointer of memory block. */
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
/*! the place of memory block. */
platform::Place place_;
/*! the size of memory block. */
size_t size_;
/* the current type of memory */
std::type_index type_;
};
private:
/*! holds the memory block if allocated. */ /*! holds the memory block if allocated. */
std::shared_ptr<Placeholder> holder_; std::shared_ptr<memory::Allocation> holder_;
std::type_index type_;
/** /**
* @brief points to elements dimensions. * @brief points to elements dimensions.
* *
......
...@@ -23,10 +23,10 @@ namespace framework { ...@@ -23,10 +23,10 @@ namespace framework {
template <typename T> template <typename T>
inline const T* Tensor::data() const { inline const T* Tensor::data() const {
check_memory_size(); check_memory_size();
bool valid = std::is_same<T, void>::value || bool valid =
holder_->type() == std::type_index(typeid(T)); std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
this->holder_->type()); type_.name());
return reinterpret_cast<const T*>( return reinterpret_cast<const T*>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_); reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
...@@ -37,26 +37,30 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } ...@@ -37,26 +37,30 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
template <typename T> template <typename T>
inline T* Tensor::data() { inline T* Tensor::data() {
check_memory_size(); check_memory_size();
bool valid = std::is_same<T, void>::value || bool valid =
holder_->type() == std::type_index(typeid(T)); std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
this->holder_->type().name()); type_.name());
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) + return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_); offset_);
} }
template <typename T> template <typename T>
inline T* Tensor::mutable_data(DDim dims, platform::Place place, inline T* Tensor::mutable_data(DDim dims, platform::Place place,
memory::Allocator::Attr attr,
size_t requested_size) { size_t requested_size) {
static_assert(std::is_pod<T>::value, "T must be POD"); static_assert(std::is_pod<T>::value, "T must be POD");
Resize(dims); Resize(dims);
return mutable_data<T>(place, requested_size); return mutable_data<T>(place, attr, requested_size);
} }
template <typename T> template <typename T>
inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) { inline T* Tensor::mutable_data(platform::Place place,
memory::Allocator::Attr attr,
size_t requested_size) {
static_assert(std::is_pod<T>::value, "T must be POD"); static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size)); return reinterpret_cast<T*>(
mutable_data(place, typeid(T), attr, requested_size));
} }
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
......
...@@ -379,7 +379,9 @@ TEST(Tensor, FromAndToStream) { ...@@ -379,7 +379,9 @@ TEST(Tensor, FromAndToStream) {
TensorToStream(oss, gpu_tensor, gpu_ctx); TensorToStream(oss, gpu_tensor, gpu_ctx);
std::istringstream iss(oss.str()); std::istringstream iss(oss.str());
TensorFromStream(iss, &dst_tensor, gpu_ctx); TensorFromStream(
iss, &dst_tensor,
*platform::DeviceContextPool::Instance().Get(platform::CPUPlace()));
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace()); int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 6; ++i) { for (int i = 0; i < 6; ++i) {
......
...@@ -13,12 +13,24 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor) ...@@ -13,12 +13,24 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api cc_library(paddle_fluid_api
SRCS io.cc SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS)
if (WIN32)
list(APPEND fluid_third_partys gflags glog protobuf cblas)
endif(WIN32)
# paddle_fluid_origin exclude inference api interface # paddle_fluid_origin exclude inference api interface
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) if(WIN32)
sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid_origin ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
endif(WIN32)
add_subdirectory(api) add_subdirectory(api)
...@@ -27,13 +39,17 @@ set(SHARED_INFERENCE_SRCS ...@@ -27,13 +39,17 @@ set(SHARED_INFERENCE_SRCS
io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
if (WITH_GPU AND TENSORRT_FOUND)
set(STATIC_INFERENCE_APIS ${STATIC_INFERENCE_APIS} paddle_inference_tensorrt_subgraph_engine)
set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/api/api_tensorrt_subgraph_engine.cc)
endif()
# Create static library if(WIN32)
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
analysis_config paddle_pass_builder)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
analysis_config paddle_pass_builder)
endif(WIN32)
if(NOT APPLE) if(NOT APPLE)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
...@@ -42,11 +58,20 @@ if(NOT APPLE) ...@@ -42,11 +58,20 @@ if(NOT APPLE)
endif() endif()
# Create shared library # Create shared library
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} if(WIN32)
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
target_link_libraries(paddle_fluid_shared shlwapi)
if(WITH_GPU AND NOT WITH_DSO)
target_link_libraries(paddle_fluid_origin ${cuda_modules})
endif(WITH_GPU AND NOT WITH_DSO)
else(WIN32)
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
endif()
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(NOT APPLE) if(NOT APPLE AND NOT WIN32)
# TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map") set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map")
set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
......
cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) unset(analysis_deps CACHE)
set(analysis_deps set(analysis_deps # analysis_deps can be extended accross the project
framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) framework_proto proto_desc graph pass paddle_fluid_api executor pretty_log
ir_pass_manager
CACHE INTERNAL "")
cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc add_subdirectory(ir_passes)
add_subdirectory(passes)
cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES})
cc_library(argument SRCS argument.cc DEPS scope proto_desc)
cc_library(analysis_pass SRCS analysis_pass.cc DEPS proto_desc)
cc_library(analysis SRCS
analyzer.cc analyzer.cc
helper.cc helper.cc
# passes analysis_pass
analysis_pass.cc DEPS ${analysis_deps}
fluid_to_data_flow_graph_pass.cc )
data_flow_graph_to_fluid_pass.cc
dfg_graphviz_draw_pass.cc
tensorrt_subgraph_pass.cc
tensorrt_subgraph_node_mark_pass.cc
fluid_to_ir_pass.cc
model_store_pass.cc
DEPS ${analysis_deps})
cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid)
function(inference_analysis_test TARGET) function(inference_analysis_test TARGET)
if(WITH_TESTING) if(WITH_TESTING)
...@@ -34,13 +35,3 @@ function(inference_analysis_test TARGET) ...@@ -34,13 +35,3 @@ function(inference_analysis_test TARGET)
endfunction(inference_analysis_test) endfunction(inference_analysis_test)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
...@@ -19,42 +19,36 @@ limitations under the License. */ ...@@ -19,42 +19,36 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/argument.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/node.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
/*
* AnalysisPass is a pass used to control the IR passes.
*/
class AnalysisPass { class AnalysisPass {
public: public:
AnalysisPass() = default; AnalysisPass() = default;
virtual ~AnalysisPass() = default; virtual ~AnalysisPass() = default;
// Mutable Pass.
virtual bool Initialize(Argument *argument) { return false; }
// Readonly Pass.
virtual bool Initialize(const Argument &argument) { return false; }
// Virtual method overriden by subclasses to do any necessary clean up after // Run on a single Graph.
// all passes have run. void Run(Argument* argument) { RunImpl(argument); }
virtual bool Finalize() { return false; }
// Create a debugger Pass that draw the DFG by graphviz toolkit.
virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }
// Run on a single DataFlowGraph.
virtual void Run(DataFlowGraph *x) = 0;
// Human-readable short representation. // Human-readable short representation.
virtual std::string repr() const = 0; virtual std::string repr() const = 0;
// Human-readable long description. // Human-readable long description.
virtual std::string description() const { return "No DOC"; } virtual std::string description() const { return "No DOC"; }
};
// GraphPass processes on any GraphType. protected:
class DataFlowGraphPass : public AnalysisPass {}; // User should implement these.
virtual void RunImpl(Argument* argument) = 0;
Argument* argument_{nullptr};
};
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
......
...@@ -15,138 +15,23 @@ ...@@ -15,138 +15,23 @@
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/passes/passes.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
#include "paddle/fluid/inference/analysis/model_store_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
DEFINE_bool(IA_enable_tensorrt_subgraph_engine, false,
"Enable subgraph to TensorRT engine for acceleration");
DEFINE_bool(IA_enable_ir, false, "Turn on IR support");
DEFINE_string(IA_graphviz_log_root, "./",
"Graphviz debuger for data flow graphs.");
DEFINE_string(IA_output_storage_path, "", "optimized model output path");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
class DfgPassManagerImpl final : public DfgPassManager { Analyzer::Analyzer() {}
public:
DfgPassManagerImpl() {
// TODO(Superjomn) set the key with pass reprs.
if (!FLAGS_IA_enable_ir) {
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
} else {
AddPass("fluid-to-ir-pass", new FluidToIrPass);
}
TryAddTensorRtPass();
AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
if (!FLAGS_IA_output_storage_path.empty()) {
AddPass("model-store-pass", new ModelStorePass);
}
}
std::string repr() const override { return "dfg-pass-manager"; } void Analyzer::Run(Argument *argument) { RunIrAnalysis(argument); }
std::string description() const override { return "DFG pass manager."; }
private: void Analyzer::RunIrAnalysis(Argument *argument) {
void AddPass(const std::string& name, AnalysisPass* pass) { std::vector<std::string> passes({"ir_analysis_compose_pass"});
VLOG(30) << "Adding pass " << name;
Register(name, pass);
AddGraphvizDebugerPass(pass);
}
void TryAddTensorRtPass() { for (auto &pass : passes) {
if (FLAGS_IA_enable_tensorrt_subgraph_engine) { PassRegistry::Global().Retreive(pass)->Run(argument);
auto trt_teller = [&](const Node* node) {
std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "dropout"});
if (!node->IsFunction()) return false;
const auto* func = static_cast<const Function*>(node);
if (teller_set.count(func->func_type())) {
return true;
} else {
return false;
}
};
AddPass("tensorrt-subgraph-marker",
new TensorRTSubgraphNodeMarkPass(trt_teller));
AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
}
}
// Add the graphviz debuger pass if the parent pass has one.
void AddGraphvizDebugerPass(AnalysisPass* pass) {
auto* debuger_pass = pass->CreateGraphvizDebugerPass();
if (debuger_pass) {
Register(debuger_pass->repr(), debuger_pass);
}
} }
};
Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
void Analyzer::Run(Argument* argument) {
std::vector<std::string> passes;
passes.push_back("graph_viz_pass"); // add graphviz for debug.
#ifdef PADDLE_WITH_MKLDNN
if (use_mkldnn_) {
VLOG(30) << "Adding MKL-DNN placement pass";
passes.push_back("mkldnn_placement_pass");
}
#endif
// infer_clean_graph_pass should be the first default pass
// after mkldnn_placement_pass.
passes.push_back("infer_clean_graph_pass");
passes.push_back("graph_viz_pass"); // add graphviz for debug.
for (auto& pass : ir_passes_) {
// skip mkldnn pass when use_mkldnn_ = false;
bool skip_pass = (!use_mkldnn_) && pass.find("mkldnn") != std::string::npos;
if (!disabled_ir_passes_.count(pass) && !skip_pass) {
passes.push_back(pass);
passes.push_back("graph_viz_pass"); // add graphviz for debug.
}
}
argument->Set(kFluidToIrPassesAttr, new std::vector<std::string>(passes));
for (auto& x : data_) {
PADDLE_ENFORCE(x->Initialize(argument));
x->RunAll();
PADDLE_ENFORCE(x->Finalize());
}
}
Analyzer& Analyzer::IncludeAllIrPasses() {
ir_passes_ = all_ir_passes_;
return *this;
}
Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
disabled_ir_passes_.insert(passes.begin(), passes.end());
return *this;
}
Analyzer& Analyzer::IncludeIrPasses(const std::vector<std::string>& passes) {
ir_passes_ = passes;
return *this;
}
Analyzer& Analyzer::SetUseMkldnn(bool use_mkldnn) {
use_mkldnn_ = use_mkldnn;
return *this;
} }
} // namespace analysis } // namespace analysis
......
...@@ -40,56 +40,21 @@ limitations under the License. */ ...@@ -40,56 +40,21 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/flags.h" #include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
class Analyzer : public OrderedRegistry<PassManager> { class Analyzer final {
public: public:
// Register all the pass-managers.
Analyzer(); Analyzer();
void Run(Argument* argument); void Run(Argument* argument);
Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
Analyzer& IncludeIrPasses(const std::vector<std::string>& passes);
Analyzer& IncludeAllIrPasses();
Analyzer& SetUseMkldnn(bool use_mkldnn);
DISABLE_COPY_AND_ASSIGN(Analyzer); DISABLE_COPY_AND_ASSIGN(Analyzer);
private: protected:
// All avaiable IR passes. void RunIrAnalysis(Argument* argument);
// The bigger fuse comes first, so that the small operators prefer to be
// merged in a larger fuse op. The small fusion will not break the pattern of
// larger fusion.
const std::vector<std::string> all_ir_passes_{{
// Manual update the passes here.
"attention_lstm_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", //
"embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
#ifdef PADDLE_WITH_MKLDNN
"depthwise_conv_mkldnn_pass", //
"conv_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", //
"conv_elementwise_add_mkldnn_fuse_pass", //
#endif
}};
std::unordered_set<std::string> disabled_ir_passes_;
// Ir passes to run
std::vector<std::string> ir_passes_;
bool use_mkldnn_;
}; };
} // namespace analysis } // namespace analysis
......
...@@ -27,21 +27,21 @@ namespace analysis { ...@@ -27,21 +27,21 @@ namespace analysis {
using namespace framework; // NOLINT using namespace framework; // NOLINT
TEST(Analyzer, analysis_without_tensorrt) { TEST(Analyzer, analysis_without_tensorrt) {
FLAGS_IA_enable_tensorrt_subgraph_engine = false;
Argument argument; Argument argument;
argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); argument.SetModelDir(FLAGS_inference_model_dir);
argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
Analyzer analyser; Analyzer analyser;
analyser.Run(&argument); analyser.Run(&argument);
} }
TEST(Analyzer, analysis_with_tensorrt) { TEST(Analyzer, analysis_with_tensorrt) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true;
Argument argument; Argument argument;
argument.Set<int>("minimum_subgraph_size", new int(0)); argument.SetTensorRtMaxBatchSize(3);
argument.Set<int>("max_batch_size", new int(3)); argument.SetTensorRtWorkspaceSize(1 << 20);
argument.Set<int>("workspace_size", new int(1 << 20)); argument.SetModelDir(FLAGS_inference_model_dir);
argument.Set<std::string>("precision_mode", new std::string("FP32")); argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
Analyzer analyser; Analyzer analyser;
analyser.Run(&argument); analyser.Run(&argument);
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file implements the transformation from fluid ProgramDesc to data flow
* graph.
*/
#pragma once
#include <string>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
namespace paddle {
namespace inference {
namespace analysis {
class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
public:
DataFlowGraphToFluidPass() = default;
bool Initialize(Argument *argument) override;
bool Finalize() override;
void Run(DataFlowGraph *graph) override;
std::string repr() const override { return "DFG to fluid"; }
std::string description() const override {
return "Transform a DFG to a Fluid ProgramDesc";
}
AnalysisPass *CreateGraphvizDebugerPass() const override;
protected:
// Add a Fluid Op into the ProgramDesc.
void AddFluidOp(Node *node);
// Add a EngineOp into the ProgramDesc.
void AddEngineOp(Node *node);
private:
framework::proto::ProgramDesc *desc_;
Argument *argument_;
};
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include <glog/logging.h>
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/io.h"
namespace paddle {
namespace inference {
namespace analysis {
TEST(DataFlowGraph, Test) {
Argument argument(FLAGS_inference_model_dir);
FluidToDataFlowGraphPass pass0;
DataFlowGraphToFluidPass pass1;
ASSERT_TRUE(pass0.Initialize(&argument));
ASSERT_TRUE(pass1.Initialize(&argument));
pass0.Run(argument.main_dfg.get());
pass1.Run(argument.main_dfg.get());
pass0.Finalize();
pass1.Finalize();
LOG(INFO) << argument.main_dfg->nodes.size();
}
}; // namespace analysis
}; // namespace inference
}; // namespace paddle
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory> #include <memory>
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
......
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector)
set(analysis_deps ${analysis_deps}
subgraph_detector tensorrt_subgraph_pass
CACHE INTERNAL "")
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册