“62840afa6012e9c6a1605cb85e80dd468bf6a920”上不存在“paddle/phi/kernels/increment_kernel.h”
提交 92da467c 编写于 作者: P peizhilin

Merge remote-tracking branch 'upstream/develop' into windows/fixgpuissue

...@@ -55,6 +55,7 @@ option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) ...@@ -55,6 +55,7 @@ option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF) option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF)
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
...@@ -261,6 +262,12 @@ if (WITH_PROFILER) ...@@ -261,6 +262,12 @@ if (WITH_PROFILER)
add_definitions(-DWITH_GPERFTOOLS) add_definitions(-DWITH_GPERFTOOLS)
endif() endif()
if (WITH_JEMALLOC)
find_package(JeMalloc REQUIRED)
include_directories(${JEMALLOC_INCLUDE_DIR})
add_definitions(-DWITH_JEMALLOC)
endif()
include(generic) # simplify cmake module include(generic) # simplify cmake module
include(package) # set paddle packages include(package) # set paddle packages
include(ccache) # set ccache for compilation include(ccache) # set ccache for compilation
...@@ -290,7 +297,7 @@ if(WITH_PSLIB) ...@@ -290,7 +297,7 @@ if(WITH_PSLIB)
list(APPEND EXTERNAL_LIBS pslib_brpc) list(APPEND EXTERNAL_LIBS pslib_brpc)
list(APPEND EXTERNAL_LIBS libmct) list(APPEND EXTERNAL_LIBS libmct)
endif(WITH_PSLIB) endif(WITH_PSLIB)
if(WITH_AMD_GPU) if(WITH_AMD_GPU)
find_package(HIP) find_package(HIP)
include(hip) include(hip)
......
# - Find JeMalloc library
# Find the native JeMalloc includes and library
#
# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc.
# JEMALLOC_LIBRARIES - List of libraries when using jemalloc.
# JEMALLOC_FOUND - True if jemalloc found.
find_path(JEMALLOC_INCLUDE_DIR
NAMES jemalloc/jemalloc.h
HINTS ${JEMALLOC_ROOT_DIR}/include)
find_library(JEMALLOC_LIBRARIES
NAMES jemalloc
HINTS ${JEMALLOC_ROOT_DIR}/lib)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR)
mark_as_advanced(
JEMALLOC_LIBRARIES
JEMALLOC_INCLUDE_DIR)
if (JEMALLOC_FOUND)
add_library(jemalloc::jemalloc UNKNOWN IMPORTED)
set_target_properties(jemalloc::jemalloc PROPERTIES
IMPORTED_LOCATION ${JEMALLOC_LIBRARIES}
INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}")
endif()
...@@ -134,6 +134,7 @@ if(WITH_GPU) ...@@ -134,6 +134,7 @@ if(WITH_GPU)
message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF") message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF")
set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE) set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
endif() endif()
add_definitions(-DWITH_ANAKIN)
endif() endif()
if(WITH_ANAKIN) if(WITH_ANAKIN)
# NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR
......
...@@ -5,6 +5,8 @@ endif() ...@@ -5,6 +5,8 @@ endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70") set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61") set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
###################################################################################### ######################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled) # A function for automatic detection of GPUs installed (if autodetection is enabled)
...@@ -59,7 +61,7 @@ endfunction() ...@@ -59,7 +61,7 @@ endfunction()
# select_nvcc_arch_flags(out_variable) # select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable) function(select_nvcc_arch_flags out_variable)
# List of arch names # List of arch names
set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual") set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
set(archs_name_default "All") set(archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING) if(NOT CMAKE_CROSSCOMPILING)
list(APPEND archs_names "Auto") list(APPEND archs_names "Auto")
...@@ -93,6 +95,8 @@ function(select_nvcc_arch_flags out_variable) ...@@ -93,6 +95,8 @@ function(select_nvcc_arch_flags out_variable)
set(cuda_arch_bin "60 61") set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
set(cuda_arch_bin "70") set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "All") elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs}) set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
...@@ -153,6 +157,16 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x ...@@ -153,6 +157,16 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
# warning for now. # warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
add_definitions("-DPADDLE_CUDA_BINVER=\"90\"")
elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
add_definitions("-DPADDLE_CUDA_BINVER=\"100\"")
endif() endif()
include_directories(${CUDA_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS})
......
...@@ -23,11 +23,8 @@ set(BOOST_PROJECT "extern_boost") ...@@ -23,11 +23,8 @@ set(BOOST_PROJECT "extern_boost")
# checked that the devtools package of CentOS 6 installs boost 1.41.0. # checked that the devtools package of CentOS 6 installs boost 1.41.0.
# So we use 1.41.0 here. # So we use 1.41.0 here.
set(BOOST_VER "1.41.0") set(BOOST_VER "1.41.0")
if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL)) set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
message(STATUS "use pre defined download url") set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
endif()
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
......
...@@ -55,7 +55,7 @@ ExternalProject_Add( ...@@ -55,7 +55,7 @@ ExternalProject_Add(
${MKLDNN_PROJECT} ${MKLDNN_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS} DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git"
GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a" GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a"
PREFIX ${MKLDNN_SOURCES_DIR} PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
......
...@@ -16,6 +16,12 @@ IF(NOT ${WITH_MKLML}) ...@@ -16,6 +16,12 @@ IF(NOT ${WITH_MKLML})
return() return()
ENDIF(NOT ${WITH_MKLML}) ENDIF(NOT ${WITH_MKLML})
IF(APPLE)
MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(MKLML_DST_DIR "mklml") SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
...@@ -23,32 +29,24 @@ SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) ...@@ -23,32 +29,24 @@ SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT ${MKLML_INSTALL_DIR}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
if(WIN32) SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
SET(TIME_VERSION "2019.0.1.20181227")
IF(WIN32)
SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
else() ELSE()
SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
endif() ENDIF()
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
MESSAGE(STATUS "use pre defined download url")
if(WIN32)
SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
elseif(APPLE)
SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
else()
SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
ENDIF()
endif()
SET(MKLML_PROJECT "extern_mklml") SET(MKLML_PROJECT "extern_mklml")
MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
......
...@@ -37,14 +37,18 @@ INCLUDE(GNUInstallDirs) ...@@ -37,14 +37,18 @@ INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph") SET(NGRAPH_PROJECT "extern_ngraph")
SET(NGRAPH_GIT_TAG "v0.10.1") SET(NGRAPH_GIT_TAG "20bd8bbc79ae3a81c57313846a2be7313e5d1dab")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
SET(NGRAPH_SHARED_LIB_NAME libngraph.so) SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so)
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) if(CMAKE_BUILD_TYPE STREQUAL "Debug")
SET(NGRAPH_TBB_LIB_NAME libtbb_debug.so.2)
else()
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
endif()
SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
...@@ -66,16 +70,7 @@ ExternalProject_Add( ...@@ -66,16 +70,7 @@ ExternalProject_Add(
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
) CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
# Workaround for nGraph expecting mklml to be in mkldnn install directory.
ExternalProject_Add_Step(
${NGRAPH_PROJECT}
PrepareMKL
COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so
COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so
DEPENDEES download
DEPENDERS configure
) )
add_dependencies(ngraph ${NGRAPH_PROJECT}) add_dependencies(ngraph ${NGRAPH_PROJECT})
......
...@@ -115,6 +115,10 @@ function(common_link TARGET_NAME) ...@@ -115,6 +115,10 @@ function(common_link TARGET_NAME)
if (WITH_PROFILER) if (WITH_PROFILER)
target_link_libraries(${TARGET_NAME} gperftools::profiler) target_link_libraries(${TARGET_NAME} gperftools::profiler)
endif() endif()
if (WITH_JEMALLOC)
target_link_libraries(${TARGET_NAME} jemalloc::jemalloc)
endif()
endfunction() endfunction()
...@@ -228,7 +232,7 @@ function(merge_static_libs TARGET_NAME) ...@@ -228,7 +232,7 @@ function(merge_static_libs TARGET_NAME)
# Get the file names of the libraries to be merged # Get the file names of the libraries to be merged
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>) set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
endforeach() endforeach()
# msvc will put libarary in directory of "/Release/xxxlib" by default # msvc will put libarary in directory of "/Release/xxxlib" by default
# COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}" COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}"
......
...@@ -405,28 +405,50 @@ paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None ...@@ -405,28 +405,50 @@ paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.optimizer.SGDOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.SGDOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None)) paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)) paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdamaxOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)) paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.DecayedAdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)) paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
paddle.fluid.optimizer.FtrlOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.FtrlOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)) paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.RMSPropOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)) paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None) paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)) paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None))
paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.LarsMomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)) paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
......
...@@ -94,4 +94,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS ...@@ -94,4 +94,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass fuse_elewise_add_act_pass multi_batch_merge_pass
memory_optimize_pass) memory_optimize_pass lock_free_optimize_pass)
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include "paddle/fluid/framework/details/memory_reuse_types.h" #include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h"
...@@ -86,10 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -86,10 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
if (strategy.memory_optimize_) { if (strategy.memory_optimize_) {
auto analysis_var_pass = AppendPass("analysis_var_pass"); auto analysis_var_pass = AppendPass("analysis_var_pass");
} }
// Convert graph to run on multi-devices.
auto multi_devices_pass = AppendPass("multi_devices_pass"); AppendMultiDevPass(strategy);
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
// Add a graph print pass to record a graph with device info. // Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) { if (!strategy_.debug_graphviz_path_.empty()) {
...@@ -115,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -115,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
} }
} }
// Convert graph to run on multi-devices.
void AppendMultiDevPass(const BuildStrategy &strategy) {
ir::Pass *multi_devices_pass;
if (strategy_.is_distribution_) {
multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
} else {
if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
multi_devices_pass =
AppendPass("allreduce_mode_multi_devices_pass").get();
} else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
} else {
PADDLE_THROW("Unknown reduce strategy.");
}
}
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_);
}
private: private:
BuildStrategy strategy_; BuildStrategy strategy_;
}; };
...@@ -131,6 +148,10 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy( ...@@ -131,6 +148,10 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
return pass_builder_; return pass_builder_;
} }
bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0;
}
std::unique_ptr<ir::Graph> BuildStrategy::Apply( std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places, const ProgramDesc &main_program, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes, const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
...@@ -145,22 +166,23 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -145,22 +166,23 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program)); std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) { for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (pass->Type() == "multi_devices_pass") { if (IsMultiDevPass(pass->Type())) {
pass->Erase("places"); pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>("places", &places); pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase("loss_var_name"); pass->Erase(kLossVarName);
pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name); pass->SetNotOwned<const std::string>(kLossVarName, &loss_var_name);
pass->Erase("local_scopes"); pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>("local_scopes", pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes); &local_scopes);
pass->Erase("nranks"); pass->Erase(kNRanks);
pass->Set<size_t>("nranks", new size_t(nranks)); pass->Set<size_t>(kNRanks, new size_t(nranks));
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase("nccl_ctxs"); pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx); pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif #endif
} else if (pass->Type() == "analysis_var_pass") { } else if (pass->Type() == "analysis_var_pass") {
const std::vector<OpDesc *> *all_op_descs = const std::vector<OpDesc *> *all_op_descs =
new std::vector<OpDesc *>(main_program.Block(0).AllOps()); new std::vector<OpDesc *>(main_program.Block(0).AllOps());
...@@ -201,10 +223,13 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -201,10 +223,13 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
USE_PASS(fuse_elewise_add_act_pass); USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass); USE_PASS(graph_viz_pass);
USE_PASS(multi_batch_merge_pass); USE_PASS(multi_batch_merge_pass);
USE_PASS(multi_devices_pass); USE_PASS(reduce_mode_multi_devices_pass);
USE_PASS(allreduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass); USE_PASS(multi_devices_print_pass);
USE_PASS(analysis_var_pass); USE_PASS(analysis_var_pass);
USE_PASS(sequential_execution_pass); USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass); USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass); USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(lock_free_optimize_pass);
...@@ -74,8 +74,6 @@ struct BuildStrategy { ...@@ -74,8 +74,6 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false}; bool fuse_elewise_add_act_ops_{false};
bool enable_data_balance_{false};
bool memory_optimize_{false}; bool memory_optimize_{false};
bool memory_early_delete_{false}; bool memory_early_delete_{false};
...@@ -84,6 +82,10 @@ struct BuildStrategy { ...@@ -84,6 +82,10 @@ struct BuildStrategy {
bool fuse_broadcast_op_{false}; bool fuse_broadcast_op_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model.
bool is_distribution_{false};
int num_trainers_{1}; int num_trainers_{1};
int trainer_id_{0}; int trainer_id_{0};
std::vector<std::string> trainers_endpoints_; std::vector<std::string> trainers_endpoints_;
...@@ -104,6 +106,8 @@ struct BuildStrategy { ...@@ -104,6 +106,8 @@ struct BuildStrategy {
bool IsFinalized() const { return is_finalized_; } bool IsFinalized() const { return is_finalized_; }
bool IsMultiDevPass(const std::string &pass_name) const;
// Apply the passes built by the pass_builder_. The passes will be // Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph. // applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program, std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,
......
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include <string> #include <string>
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h"
...@@ -21,68 +21,78 @@ namespace paddle { ...@@ -21,68 +21,78 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { class SSAGraghBuilderWithChecker : public ir::Pass {
std::unordered_map<OpHandleBase *, size_t> pending_ops; protected:
std::unordered_set<VarHandleBase *> pending_vars; std::unique_ptr<ir::Graph> ApplyImpl(
std::unordered_set<VarHandleBase *> ready_vars; std::unique_ptr<ir::Graph> graph) const override {
std::unordered_set<OpHandleBase *> ready_ops; PADDLE_ENFORCE(IsValidGraph(graph.get()));
return graph;
}
auto insert_pending_var = [&](VarHandleBase *var) { bool IsValidGraph(const ir::Graph *graph) const {
pending_vars.insert(var); std::unordered_map<OpHandleBase *, size_t> pending_ops;
if (var->GeneratedOp() == nullptr) { std::unordered_set<VarHandleBase *> pending_vars;
ready_vars.emplace(var); std::unordered_set<VarHandleBase *> ready_vars;
} std::unordered_set<OpHandleBase *> ready_ops;
};
for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) { auto insert_pending_var = [&](VarHandleBase *var) {
for (auto &name_pair : var_map) { pending_vars.insert(var);
for (auto &version_pair : name_pair.second) { if (var->GeneratedOp() == nullptr) {
insert_pending_var(version_pair); ready_vars.emplace(var);
} }
} };
}
for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) { for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
insert_pending_var(var); for (auto &name_pair : var_map) {
} for (auto &version_pair : name_pair.second) {
insert_pending_var(version_pair);
}
}
}
for (OpHandleBase *op : ir::FilterByNodeWrapper<OpHandleBase>(*graph)) { for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
if (op->Inputs().empty()) { insert_pending_var(var);
ready_ops.insert(op);
} else {
pending_ops.insert({op, op->NoDupInputSize()});
} }
}
auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) { for (OpHandleBase *op : ir::FilterByNodeWrapper<OpHandleBase>(*graph)) {
for (auto *op : set) { if (op->Inputs().empty()) {
for (auto out : op->Outputs()) { ready_ops.insert(op);
ready_vars.emplace(out); } else {
pending_ops.insert({op, op->NoDupInputSize()});
} }
} }
set.clear();
};
while (!pending_vars.empty()) { auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
run_all_ops(ready_ops); for (auto *op : set) {
for (auto out : op->Outputs()) {
ready_vars.emplace(out);
}
}
set.clear();
};
if (ready_vars.empty()) { while (!pending_vars.empty()) {
return false; run_all_ops(ready_ops);
}
for (auto ready_var : ready_vars) { if (ready_vars.empty()) {
pending_vars.erase(ready_var); return false;
for (auto *op : ready_var->PendingOps()) { }
auto &deps = --pending_ops[op];
if (deps == 0) { for (auto ready_var : ready_vars) {
ready_ops.insert(op); pending_vars.erase(ready_var);
for (auto *op : ready_var->PendingOps()) {
auto &deps = --pending_ops[op];
if (deps == 0) {
ready_ops.insert(op);
}
} }
} }
ready_vars.clear();
} }
ready_vars.clear(); return true;
} }
return true; };
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -30,78 +31,70 @@ namespace framework { ...@@ -30,78 +31,70 @@ namespace framework {
class Scope; class Scope;
namespace details { namespace details {
class MultiDevSSAGraphBuilder : public ir::Pass { constexpr char kLossVarName[] = "loss_var_name";
constexpr char kPlaces[] = "places";
constexpr char kLocalScopes[] = "local_scopes";
constexpr char kStrategy[] = "strategy";
constexpr char kNRanks[] = "nranks";
class MultiDevSSAGraphBuilderBase : public ir::Pass {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl( std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override; std::unique_ptr<ir::Graph> graph) const override;
private: virtual void Init() const;
void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
size_t device_id) const;
void Init() const;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
mutable platform::NCCLContextMap *nccl_ctxs_;
#endif
int GetVarDeviceID( virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &varname, const std::string &g_name) const = 0;
const std::unordered_map<std::string, int> &sharded_var_device) const;
bool IsScaleLossOp(ir::Node *node) const; virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0;
virtual void InsertPostprocessOps(ir::Graph *result) const = 0;
int CreateRPCOp( bool UseGPU() const;
ir::Graph *result, ir::Node *node,
std::unordered_map<std::string, int> *sharded_var_device) const; bool NeedCollectiveOps() const;
int CreateDistTrainOp(
ir::Graph *result, ir::Node *node, bool IsScaleLossOp(ir::Node *node) const;
std::unordered_map<std::string, int> *sharded_var_device) const;
void CreateComputationalOps(ir::Graph *result, ir::Node *node, void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const; size_t num_places) const;
void CreateScaleLossGradOp(ir::Graph *result, void CreateScaleLossGradOp(ir::Graph *result,
const std::string &loss_grad_name, const std::string &loss_grad_name,
ir::Node *out_var_node, ir::Node *out_var_node, size_t loss_scale,
proto::VarType::Type dtype) const; proto::VarType::Type dtype) const;
VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
int dst_dev_id) const; int dst_dev_id) const;
void CreateComputationalOp(ir::Graph *result, ir::Node *node, void CreateComputationalOp(ir::Graph *result, ir::Node *node,
int dev_id) const; int dev_id) const;
int GetOpDeviceID( bool IsSparseGradient(const std::string &og) const;
ir::Node *node,
const std::unordered_map<std::string, int> &sharded_var_device) const;
void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
void InsertDataBalanceOp(ir::Graph *result, void CreateAllReduceOp(ir::Graph *result, const std::string &og) const;
const std::vector<std::string> &datas) const;
void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
size_t src_dev_id) const; size_t src_dev_id) const;
void InsertScaleLossGradOp(ir::Graph *result, const ir::Node *node) const;
void CreateFusedBroadcastOp( void CreateFusedBroadcastOp(
ir::Graph *result, ir::Graph *result,
const std::vector<std::unordered_set<std::string>> &bcast_varnames) const; const std::vector<std::unordered_set<std::string>> &bcast_varnames) const;
bool IsSparseGradient(const std::string &og) const;
size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
void SetCommunicationContext(OpHandleBase *op_handle, void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const; const platform::Place &p) const;
std::vector<ir::Node *> SortForReduceMode( void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
const std::vector<ir::Node *> &) const; size_t device_id) const;
int GetOpDeviceID( #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
ir::Node *node, mutable platform::NCCLContextMap *nccl_ctxs_;
const std::unordered_map<std::string, int> &shared_var_device, #endif
std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops)
const;
mutable std::string loss_var_name_; mutable std::string loss_var_name_;
mutable std::vector<platform::Place> places_; mutable std::vector<platform::Place> places_;
...@@ -109,8 +102,83 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -109,8 +102,83 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
mutable BuildStrategy strategy_; mutable BuildStrategy strategy_;
mutable std::unordered_map<std::string, VarDesc *> all_vars_; mutable std::unordered_map<std::string, VarDesc *> all_vars_;
};
class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
protected:
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const {
return false;
}
virtual void InsertPostprocessOps(ir::Graph *result) const {}
};
class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
protected:
int GetVarDeviceID(const std::string &varname) const;
int GetOpDeviceID(ir::Node *node) const;
size_t GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const;
virtual void ResetState() const;
mutable std::unordered_map<std::string, int> sharded_var_device_;
mutable std::vector<int64_t> balance_vars_; mutable std::vector<int64_t> balance_vars_;
}; };
class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
protected:
virtual void Init() const;
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
virtual void InsertPostprocessOps(ir::Graph *result) const;
virtual std::vector<ir::Node *> SortOperations(const ir::Graph &graph) const;
virtual void ResetState() const;
int GetOpDeviceID(ir::Node *node,
std::unordered_map<std::string, std::vector<ir::Node *>>
*delay_ops) const;
std::vector<ir::Node *> SortForReduceMode(
const std::vector<ir::Node *> &topo_ops) const;
mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
};
class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
protected:
virtual void Init() const;
virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const;
virtual void InsertPostprocessOps(ir::Graph *result) const;
virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name,
const std::string &g_name) const;
virtual void ResetState() const;
int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
mutable bool need_broadcast_var_{false};
};
std::unordered_set<std::string> &MultiDevSSAGraphBuilder();
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -31,6 +31,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) ...@@ -31,6 +31,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
pass_library(graph_to_program_pass base) pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base) pass_library(graph_viz_pass base)
pass_library(lock_free_optimize_pass base)
pass_library(fc_fuse_pass inference) pass_library(fc_fuse_pass inference)
pass_library(attention_lstm_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference) pass_library(infer_clean_graph_pass inference)
...@@ -41,6 +42,7 @@ pass_library(seq_concat_fc_fuse_pass inference) ...@@ -41,6 +42,7 @@ pass_library(seq_concat_fc_fuse_pass inference)
pass_library(multi_batch_merge_pass base) pass_library(multi_batch_merge_pass base)
pass_library(conv_bn_fuse_pass inference) pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library(seqpool_concat_fuse_pass inference)
pass_library(is_test_pass base) pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference)
......
...@@ -109,7 +109,6 @@ class Graph { ...@@ -109,7 +109,6 @@ class Graph {
attr_dels_[attr_name] = []() {}; attr_dels_[attr_name] = []() {};
} }
template <typename AttrType>
void Erase(const std::string &attr_name) { void Erase(const std::string &attr_name) {
PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph", PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph",
attr_name); attr_name);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/lock_free_optimize_pass.h"
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
const char kSumGradOpName[] = "sum";
// TODO(minqiyang): only support sgd at current time, please add
// other optimizers later.
const char kOptimizerType[] = "sgd";
std::unique_ptr<ir::Graph> LockFreeOptimizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
// We could collect all weights' name from SGD, where
// W1 <- SGD(W0, Grad0)
std::unordered_set<std::string> weight_var_set;
for (auto* node : graph->Nodes()) {
if (IsOpNamed(node, kOptimizerType)) {
auto& param_out_vars = node->Op()->Output("ParamOut");
PADDLE_ENFORCE(param_out_vars.size() == 1u);
weight_var_set.insert(param_out_vars[0]);
}
}
// find all grad's merge op via weight name, where
// Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
std::unordered_set<ir::Node*> grad_sum_op_set;
for (ir::Node* node : graph->Nodes()) {
if (IsOpNamed(node, kSumGradOpName)) {
for (ir::Node* output : node->outputs) {
// strip the last grad suffix @GRAD
std::string var_name = output->Name();
const std::string suffix(kGradVarSuffix);
if (var_name != suffix && var_name.size() > suffix.size() &&
var_name.substr(var_name.size() - suffix.size()) == suffix) {
// if so then strip them off
var_name = var_name.substr(0, var_name.size() - suffix.size());
if (weight_var_set.find(var_name) != weight_var_set.end()) {
grad_sum_op_set.insert(node);
break;
}
}
}
}
}
// get the forward op and backward op pairs, where
// out <- forward(X, W)
// Grad1 <- backward(out, X')
// Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
// W0 <- SGD(W1, Grad0)
for (ir::Node* node : grad_sum_op_set) {
for (ir::Node* merged_grad_var : node->outputs) {
// find the optimizers connected with sum op
if (IsVarNameEndsWith(merged_grad_var, kGradVarSuffix) &&
merged_grad_var->outputs.size() == 1u) {
ir::Node* opt_node = merged_grad_var->outputs[0];
VLOG(3) << "Found opt node " << opt_node->Name();
// find the backward op connected with sum op
for (ir::Node* unmerged_grad_var : node->inputs) {
if (IsVarNameContains(unmerged_grad_var, kGradVarSuffix) &&
unmerged_grad_var->inputs.size() == 1u) {
ir::Node* backward_op = unmerged_grad_var->inputs[0];
VLOG(3) << "Found backward_op " << backward_op->Name();
// find the forward op related to the backward op
ir::Node* forward_op =
FindForwardOpViaBackwardOp(graph.get(), backward_op);
VLOG(3) << "Found forward_op " << forward_op->Name();
PADDLE_ENFORCE(forward_op);
Node* new_optimizer_node = CreateNewSGDNode(
graph.get(), forward_op, backward_op, node, opt_node);
PADDLE_ENFORCE(new_optimizer_node);
}
}
}
}
}
// Remove the sum_op and its' outputs and connected Optimizers
for (Node* sum_op : grad_sum_op_set) {
for (Node* sum_op_output : sum_op->outputs) {
for (Node* optimize_op : sum_op_output->outputs) {
if (optimize_op->NodeType() == Node::Type::kOperation &&
optimize_op->Name() == kOptimizerType) {
VLOG(3) << "remove optimize_op: " << optimize_op->Name() << "_"
<< optimize_op->id();
graph->RemoveNode(optimize_op);
}
}
VLOG(3) << "remove sum_op_output: " << sum_op_output->Name() << "_"
<< sum_op_output->id();
graph->RemoveNode(sum_op_output);
}
VLOG(3) << "remove sum_op: " << sum_op->Name() << "_" << sum_op->id();
graph->RemoveNode(sum_op);
}
for (auto* node : graph->Nodes()) {
for (Node* output_node : node->outputs) {
if (output_node->Name() == "sgd") {
VLOG(3) << "Node link to SGD: " << node->Name() << "_" << node->id()
<< " --> " << output_node->Name() << "_" << output_node->id();
for (Node* input_node : node->inputs) {
VLOG(3) << "SGD Input link: " << input_node->Name() << "_"
<< input_node->id() << " --> " << node->Name() << "_"
<< node->id();
}
}
}
}
return graph;
}
ir::Node* LockFreeOptimizePass::CreateNewSGDNode(
ir::Graph* graph, ir::Node* forward_node, ir::Node* backward_node,
ir::Node* grad_sum_node, ir::Node* optimize_node) const {
PADDLE_ENFORCE(graph);
PADDLE_ENFORCE(forward_node);
PADDLE_ENFORCE(backward_node);
PADDLE_ENFORCE(grad_sum_node);
PADDLE_ENFORCE(optimize_node);
// find the grad var node between the grad sum node and backward_node
std::vector<ir::Node*> grad_vars =
FindConnectedNode(backward_node, grad_sum_node);
ir::Node* grad_node = nullptr;
for (ir::Node* node : grad_vars) {
if (!ir::IsControlDepVar(*node)) {
grad_node = node;
}
}
PADDLE_ENFORCE(grad_node);
// create a new SGD node
OpDesc* old_desc = optimize_node->Op();
// keep with the same block between new optimizer and the old one
OpDesc new_desc(*old_desc, old_desc->Block());
new_desc.SetInput("Param", old_desc->Input("Param"));
new_desc.SetInput("LearningRate", old_desc->Input("LearningRate"));
new_desc.SetInput("Grad", std::vector<std::string>({grad_node->Name()}));
new_desc.SetOutput("ParamOut", old_desc->Output("ParamOut"));
std::vector<std::string> op_role_vars = boost::get<std::vector<std::string>>(
new_desc.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName()));
// replace the second op role var, because the grad name was
// changed in new optimizer
op_role_vars.pop_back();
op_role_vars.push_back(grad_node->Name());
new_desc.SetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
op_role_vars);
new_desc.SetType(kOptimizerType);
// set backward op's op role var, this will be used to
// set device_id in multi_device_pass
backward_node->Op()->SetAttr(
framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), op_role_vars);
// backward_node->Op()->SetAttr(
// framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), {});
// keep with the same output nodes between new optimizer and the
// old one
Node* sgd_node = graph->CreateOpNode(&new_desc);
// change all outputs of the optimize_node to the new one
ReplaceAllDownstreamNode(optimize_node, sgd_node);
// find connected node between forward node and optimize node
// and replace the optimize node to new sgd node
std::vector<ir::Node*> forward_opt_connected_nodes =
FindConnectedNode(forward_node, optimize_node);
for (ir::Node* node : forward_opt_connected_nodes) {
ReplaceUpstreamNode(node, optimize_node, sgd_node);
}
// find connected node between backward node and optimize node
// and replace the optimize node to new sgd node
std::vector<ir::Node*> backward_opt_connected_nodes =
FindConnectedNode(backward_node, optimize_node);
for (ir::Node* node : backward_opt_connected_nodes) {
ReplaceUpstreamNode(node, optimize_node, sgd_node);
}
// SGD must have only one param and LR in
PADDLE_ENFORCE(old_desc->Input("LearningRate").size() == 1u);
PADDLE_ENFORCE(old_desc->Input("Param").size() == 1u);
// LR and weight nodes should be copied
for (Node* upstream_node : optimize_node->inputs) {
if (upstream_node->Name() == old_desc->Input("LearningRate")[0] ||
upstream_node->Name() == old_desc->Input("Param")[0]) {
ReplaceUpstreamNode(upstream_node, optimize_node, sgd_node);
}
}
VLOG(3) << "Create new opt node" << sgd_node->Name() << "_" << sgd_node->id();
return sgd_node;
}
std::vector<ir::Node*> LockFreeOptimizePass::FindConnectedNode(
ir::Node* upstream_node, ir::Node* downstream_node) const {
std::vector<ir::Node*> result;
for (ir::Node* out_node : upstream_node->outputs) {
for (ir::Node* in_node : downstream_node->inputs) {
if (in_node == out_node) {
result.push_back(in_node);
}
}
}
return result;
}
void LockFreeOptimizePass::ReplaceUpstreamNode(
ir::Node* upstream_node, ir::Node* old_optimizer_node,
ir::Node* new_optimizer_node) const {
PADDLE_ENFORCE(upstream_node);
PADDLE_ENFORCE(old_optimizer_node);
PADDLE_ENFORCE(new_optimizer_node);
// Remove the old_optimizer_node from upstream_node's outputs vector
auto& output_node_vec = upstream_node->outputs;
for (auto output_node_iter = output_node_vec.begin();
output_node_iter != output_node_vec.end();) {
if (*output_node_iter == old_optimizer_node) {
output_node_vec.erase(output_node_iter);
break;
} else {
++output_node_iter;
}
}
// Add the new_optimizer_node to upstream_node's outputs vector
output_node_vec.emplace_back(new_optimizer_node);
new_optimizer_node->inputs.emplace_back(upstream_node);
}
void LockFreeOptimizePass::ReplaceAllDownstreamNode(
ir::Node* old_optimizer_node, ir::Node* new_optimizer_node) const {
PADDLE_ENFORCE(old_optimizer_node);
PADDLE_ENFORCE(new_optimizer_node);
for (ir::Node* downstream_node : old_optimizer_node->outputs) {
// Remove the old_optimizer_node from downstream_node's inputs vector
auto& input_node_vec = downstream_node->inputs;
for (auto input_node_iter = input_node_vec.begin();
input_node_iter != input_node_vec.end();) {
if (*input_node_iter == old_optimizer_node) {
input_node_vec.erase(input_node_iter);
break;
} else {
++input_node_iter;
}
}
// Add the new_optimizer_node to downstream_node's inputs vector
input_node_vec.emplace_back(new_optimizer_node);
new_optimizer_node->outputs.emplace_back(downstream_node);
}
}
ir::Node* LockFreeOptimizePass::FindForwardOpViaBackwardOp(
ir::Graph* graph, ir::Node* backward_node) const {
PADDLE_ENFORCE(graph);
PADDLE_ENFORCE(backward_node);
// strip the suffix _grad of backward_node's name
std::string forward_op_name = backward_node->Name();
const std::string suffix("_grad");
if (forward_op_name != suffix && forward_op_name.size() > suffix.size() &&
forward_op_name.substr(forward_op_name.size() - suffix.size()) ==
suffix) {
// if so then strip them off
forward_op_name =
forward_op_name.substr(0, forward_op_name.size() - suffix.size());
} else {
LOG(WARNING) << "Illegal backward node's name " << backward_node->Name()
<< " id " << backward_node->id();
return nullptr;
}
for (ir::Node* node : graph->Nodes()) {
if (node->Name() == forward_op_name) {
if (node->outputs.size() == 0u) {
// if forward_node has no output, then it has NO grad op
continue;
}
// check whether all inputs of the backward_op that ends_with @GRAD
// comes from the output of forward_op is the input of the backward_op
bool is_related_forward_node = true;
for (ir::Node* backward_input : backward_node->inputs) {
if (IsVarNameEndsWith(backward_input, kGradVarSuffix)) {
bool meets_correct_output = false;
for (ir::Node* forward_output : node->outputs) {
if (forward_output->Name() + kGradVarSuffix ==
backward_input->Name()) {
meets_correct_output = true;
break;
}
}
if (!meets_correct_output) {
is_related_forward_node = false;
break;
}
}
}
if (is_related_forward_node) {
return node;
}
}
}
return nullptr;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(lock_free_optimize_pass,
paddle::framework::ir::LockFreeOptimizePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
#define PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
#include <string>
#include <vector>
#include <boost/algorithm/string/predicate.hpp>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
/*
* Remove the sum op of all gradients of the backward op.
* And remove the dependecies of the optimizer related to the
* same backward op.
*
* Before this pass:
*
* forward_op1 forward_op2
* | |
* grad_op1 grad_op2
* \ /
* \ /
* sum_op
* |
* sgd_op
*
* After this pass:
* forward_op1 forward_op2
* | |
* grad_op1 grad_op2
* | |
* sgd_op1 sgd_op2
*
* sgd_op1 and sgd_op2 will update the same weight which holds the same
* memory, so we could benefits from the acceleration
*/
class LockFreeOptimizePass : public Pass {
public:
virtual ~LockFreeOptimizePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
private:
// Create a new sgd node via current optimizer node
ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node,
ir::Node* backward_node, ir::Node* grad_sum_node,
ir::Node* optimize_node) const;
// Replace the input weight's optimizers
void ReplaceUpstreamNode(ir::Node* upstream_node,
ir::Node* old_optimizer_node,
ir::Node* new_optimizer_node) const;
// Replace the output weight's optimizers
void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node,
ir::Node* new_optimizer_node) const;
// Find all weight variables in graph
bool FindAllWeightVars(ir::Graph* graph) const;
// Find the forward_op node via the backward_op node
ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph,
ir::Node* backward_node) const;
std::vector<ir::Node*> FindConnectedNode(ir::Node* upstream_node,
ir::Node* downstream_node) const;
inline bool IsOpNamed(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kOperation && node->Name() == name;
}
inline bool IsVarNamed(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kVariable && node->Name() == name;
}
inline bool IsVarNameEndsWith(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kVariable &&
boost::algorithm::ends_with(node->Name(), name);
}
inline bool IsVarNameContains(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kVariable &&
node->Name().find(name) != std::string::npos;
}
inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const {
PADDLE_ENFORCE(ctrl_dep_node);
PADDLE_ENFORCE(node);
return IsControlDepVar(*ctrl_dep_node) &&
ctrl_dep_node->inputs.size() >= 1u &&
ctrl_dep_node->inputs[0] == node;
}
};
} // namespace ir
} // namespace framework
} // namespace paddle
#endif // PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#define MAX_CONCAT_INPUTS 200
namespace paddle {
namespace framework {
namespace ir {
PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
const std::string& name_scope,
int num_inputs) {
auto is_concat_op_with_inputs = [](Node* x, int num) -> bool {
return x && x->IsOp() && x->Op()->Type() == "concat" &&
x->Op()->Input("X").size() == static_cast<size_t>(num);
};
auto is_nth_input_var_of_concat = [=](Node* x, int idx) -> bool {
return x && x->IsVar() && VarLinksToOp(x, "concat") &&
x->outputs.size() == 1 && IsNthInput(x, x->outputs[0], "X", idx) &&
is_concat_op_with_inputs(x->outputs[0], num_inputs);
};
auto is_seqpool_op_with_pootype_of_nth_input_of_concat = [=](
Node* x, const std::string& type, int idx) -> bool {
bool ok = x && x->IsOp() && x->Op()->Type() == "sequence_pool" &&
x->Op()->HasAttr("pooltype") &&
boost::get<std::string>(x->Op()->GetAttr("pooltype")) == type &&
x->outputs.size() == 2; // seqpool should only have 2 outputs
if (ok) {
// only one output of seqpool_op is nth_input_var of concat
// the other one should be unused empty var
if (is_nth_input_var_of_concat(x->outputs[0], idx)) {
ok = ok && x->outputs[1]->IsVar() && x->outputs[1]->outputs.size() == 0;
} else {
ok = ok && is_nth_input_var_of_concat(x->outputs[1], idx) &&
x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0;
}
}
return ok;
};
auto* concat_op = pattern->NewNode(
[=](Node* x) { return is_concat_op_with_inputs(x, num_inputs); },
name_scope + "/concat_op");
concat_op->assert_op_attr<int>("axis", 1);
auto* concat_out_var = pattern->NewNode(
[=](Node* x) {
return x && x->IsVar() && VarLinksFromOp(x, "concat") &&
x->inputs.size() == 1 &&
is_concat_op_with_inputs(x->inputs[0], num_inputs);
},
name_scope + "/concat_out_var");
concat_out_var->assert_is_only_output_of_op("concat");
std::vector<PDNode*> seqpool_ops_input_var(num_inputs);
std::vector<PDNode*> seqpool_ops_output_var(num_inputs);
std::vector<PDNode*> seqpool_ops(num_inputs);
for (int i = 0; i < num_inputs; ++i) {
seqpool_ops_output_var[i] = pattern->NewNode(
[=](Node* x) {
return x && x->IsVar() && is_nth_input_var_of_concat(x, i) &&
x->inputs.size() == 1 &&
is_seqpool_op_with_pootype_of_nth_input_of_concat(x->inputs[0],
"SUM", i);
},
name_scope + "/sequence_pool_out_" + std::to_string(i));
seqpool_ops[i] = pattern->NewNode(
[=](Node* x) {
return x && x->IsOp() &&
is_seqpool_op_with_pootype_of_nth_input_of_concat(x, "SUM", i);
},
name_scope + "/sequence_pool_op_" + std::to_string(i));
seqpool_ops_input_var[i] = pattern->NewNode(
[=](Node* x) {
return x && x->IsVar() && x->outputs.size() >= 1 &&
is_seqpool_op_with_pootype_of_nth_input_of_concat(
x->outputs[0], "SUM", i);
},
name_scope + "/sequence_pool_in_" + std::to_string(i));
// Links
seqpool_ops[i]
->LinksFrom({seqpool_ops_input_var[i]})
.LinksTo({seqpool_ops_output_var[i]});
}
concat_op->LinksFrom(seqpool_ops_output_var).LinksTo({concat_out_var});
return concat_out_var;
}
int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
int num_inputs) {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
BuildSeqPoolConcatPattern(pattern, name_scope, num_inputs);
auto retrieve_node = [](const std::string& name,
const GraphPatternDetector::subgraph_t& subgraph,
const PDPattern& pat) -> Node* {
PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)),
"pattern has no Node called %s", name.c_str());
Node* p = subgraph.at(pat.RetrieveNode(name));
PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str());
return p;
};
int fusion_count{0};
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle SeqPool Concat fuse";
std::vector<std::string> input_names(num_inputs);
std::vector<Node*> input_vars(num_inputs);
auto& fused_pattern = gpd.pattern();
for (int i = 0; i < num_inputs; ++i) {
input_vars[i] =
retrieve_node(name_scope + "/sequence_pool_in_" + std::to_string(i),
subgraph, fused_pattern);
input_names[i] = input_vars[i]->Name();
}
auto* concat_op =
retrieve_node(name_scope + "/concat_op", subgraph, fused_pattern);
auto* concat_out_var =
retrieve_node(name_scope + "/concat_out_var", subgraph, fused_pattern);
auto* seqpool_op0 = retrieve_node(name_scope + "/sequence_pool_op_0",
subgraph, fused_pattern);
// Create New OpDesc
OpDesc op_desc;
op_desc.SetType("fusion_seqpool_concat");
op_desc.SetInput("X", input_names);
op_desc.SetAttr("pooltype", seqpool_op0->Op()->GetAttr("pooltype"));
op_desc.SetAttr("axis", concat_op->Op()->GetAttr("axis"));
op_desc.SetOutput("Out", {concat_out_var->Name()});
auto* op = graph->CreateOpNode(&op_desc);
for (size_t i = 0; i < input_vars.size(); ++i) {
IR_NODE_LINK_TO(input_vars[i], op);
}
IR_NODE_LINK_TO(op, concat_out_var);
std::unordered_set<const Node*> marked_nodes;
for (auto& item : subgraph) {
marked_nodes.insert(item.second);
}
for (size_t i = 0; i < input_vars.size(); ++i) {
marked_nodes.erase(input_vars[i]);
}
marked_nodes.erase(concat_out_var);
GraphSafeRemoveNodes(graph, marked_nodes);
++fusion_count;
};
gpd(graph, handler);
return fusion_count;
}
std::unique_ptr<ir::Graph> SeqPoolConcatFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = 0;
for (int i = MAX_CONCAT_INPUTS; i > 0; --i) {
fusion_count += BuildFusion(
graph.get(), name_scope_ + "/" + std::to_string(i), param_scope(), i);
}
AddStatis(fusion_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(seqpool_concat_fuse_pass,
paddle::framework::ir::SeqPoolConcatFusePass);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class SeqPoolConcatFusePass : public FusePassBase {
public:
virtual ~SeqPoolConcatFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"seqpool_concat_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -40,14 +40,14 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, ...@@ -40,14 +40,14 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
void NaiveExecutor::Run() { void NaiveExecutor::Run() {
#ifndef PADDLE_ON_INFERENCE #ifndef PADDLE_ON_INFERENCE
LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the " LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the "
"cmake flag ON_INFER is not set."; "cmake flag ON_INFER is not set.";
LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and " LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and "
"variables will be reused to save the allocation " "variables will be reused to save the allocation "
"overhead."; "overhead.";
LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by " LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by "
"setting the cmake flag ON_INFER=ON if you are " "setting the cmake flag ON_INFER=ON if you are "
"running Paddle Inference"; "running Paddle Inference";
#endif // PADDLE_ON_INFERENCE #endif // PADDLE_ON_INFERENCE
for (auto &op : ops_) { for (auto &op : ops_) {
VLOG(3) << std::this_thread::get_id() << " run " << op->Type() VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
......
...@@ -32,8 +32,11 @@ std::map<std::string, ...@@ -32,8 +32,11 @@ std::map<std::string,
std::string, std::shared_ptr<ngraph::Node>>>)>> std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = { NgraphBridge::NG_NODE_MAP = {
{"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
{"mean", paddle::operators::ngraphs::BuildMeanNode},
{"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode},
{"mul", paddle::operators::ngraphs::BuildMulNode}, {"mul", paddle::operators::ngraphs::BuildMulNode},
{"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
{"scale", paddle::operators::ngraphs::BuildScaleNode},
{"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>}, {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
{"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}, {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
{"top_k", paddle::operators::ngraphs::BuildTopKNode}}; {"top_k", paddle::operators::ngraphs::BuildTopKNode}};
......
...@@ -539,7 +539,7 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { ...@@ -539,7 +539,7 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
} }
} }
backend_->call(ngraph_function_, t_out, t_in); backend_->call(backend_->compile(ngraph_function_), t_out, t_in);
} // NgraphEngine::RunImpl } // NgraphEngine::RunImpl
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -193,15 +193,14 @@ ParallelExecutor::ParallelExecutor( ...@@ -193,15 +193,14 @@ ParallelExecutor::ParallelExecutor(
const std::unordered_set<std::string> &bcast_vars, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, const std::vector<Scope *> &local_scopes, Scope *scope, const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy)
size_t num_trainers, size_t trainer_id)
: member_(new ParallelExecutorPrivate(places)) { : member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope; member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_; member_->use_cuda_ = exec_strategy.use_cuda_;
member_->build_strategy_ = build_strategy; member_->build_strategy_ = build_strategy;
member_->use_all_reduce_ = member_->use_all_reduce_ =
build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
member_->nranks_ = num_trainers * places.size(); member_->nranks_ = build_strategy.num_trainers_ * places.size();
if (!member_->use_all_reduce_) { if (!member_->use_all_reduce_) {
PADDLE_ENFORCE(places.size() > 1, PADDLE_ENFORCE(places.size() > 1,
...@@ -253,7 +252,8 @@ ParallelExecutor::ParallelExecutor( ...@@ -253,7 +252,8 @@ ParallelExecutor::ParallelExecutor(
} }
member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, num_trainers, trainer_id)); member_->places_, nccl_id, build_strategy.num_trainers_,
build_strategy.trainer_id_));
#else #else
PADDLE_THROW("Not compiled with CUDA"); PADDLE_THROW("Not compiled with CUDA");
#endif #endif
......
...@@ -50,8 +50,7 @@ class ParallelExecutor { ...@@ -50,8 +50,7 @@ class ParallelExecutor {
const std::string &loss_var_name, Scope *scope, const std::string &loss_var_name, Scope *scope,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy, const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy, const BuildStrategy &build_strategy);
size_t num_trainers = 1, size_t trainer_id = 0);
~ParallelExecutor(); ~ParallelExecutor();
......
...@@ -87,11 +87,12 @@ Variable* Scope::Var(const std::string& name) { ...@@ -87,11 +87,12 @@ Variable* Scope::Var(const std::string& name) {
} }
Variable* Scope::Var(std::string* name) { Variable* Scope::Var(std::string* name) {
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); SCOPE_VARS_WRITER_LOCK
auto new_name = std::to_string(reinterpret_cast<uintptr_t>(this)) + "." +
std::to_string(vars_.size());
if (name != nullptr) { if (name != nullptr) {
*name = new_name; *name = new_name;
} }
SCOPE_VARS_WRITER_LOCK
return VarInternal(new_name); return VarInternal(new_name);
} }
......
...@@ -105,13 +105,15 @@ struct VarIdToTypeIndexMapHolder { ...@@ -105,13 +105,15 @@ struct VarIdToTypeIndexMapHolder {
} // namespace detail } // namespace detail
const std::type_index &ToTypeIndex(int var_id) { const std::type_index &VarTraitIdToTypeIndex(int var_id) {
return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id);
} }
const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } const char *ToTypeName(int var_id) {
return VarTraitIdToTypeIndex(var_id).name();
}
int ToTypeId(const std::type_index &type) { int TypeIndexToVarTraitId(const std::type_index &type) {
return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); return detail::VarIdToTypeIndexMapHolder::ToTypeId(type);
} }
......
...@@ -66,8 +66,8 @@ namespace paddle { ...@@ -66,8 +66,8 @@ namespace paddle {
namespace framework { namespace framework {
const char *ToTypeName(int var_id); const char *ToTypeName(int var_id);
const std::type_index &ToTypeIndex(int var_id); const std::type_index &VarTraitIdToTypeIndex(int var_id);
int ToTypeId(const std::type_index &type); int TypeIndexToVarTraitId(const std::type_index &type);
namespace detail { namespace detail {
......
...@@ -45,10 +45,11 @@ struct TypeIndexChecker { ...@@ -45,10 +45,11 @@ struct TypeIndexChecker {
constexpr auto kId = VarTypeTrait<Type>::kId; constexpr auto kId = VarTypeTrait<Type>::kId;
std::type_index actual_type(typeid(Type)); std::type_index actual_type(typeid(Type));
EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name()));
EXPECT_EQ(ToTypeIndex(kId), actual_type); EXPECT_EQ(VarTraitIdToTypeIndex(kId), actual_type);
EXPECT_EQ(ToTypeId(actual_type), kId); EXPECT_EQ(TypeIndexToVarTraitId(actual_type), kId);
EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type); EXPECT_EQ(VarTraitIdToTypeIndex(TypeIndexToVarTraitId(actual_type)),
EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); actual_type);
EXPECT_EQ(TypeIndexToVarTraitId(VarTraitIdToTypeIndex(kId)), kId);
EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT
EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT
......
...@@ -77,6 +77,7 @@ class PreparedOp { ...@@ -77,6 +77,7 @@ class PreparedOp {
framework::OperatorWithKernel::OpKernelFunc func; framework::OperatorWithKernel::OpKernelFunc func;
platform::DeviceContext* dev_ctx; platform::DeviceContext* dev_ctx;
}; };
class OpBase; class OpBase;
class VarBase { class VarBase {
......
...@@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) { ...@@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
i++) { i++) {
LOG(INFO) << "data: " << static_cast<float*>(outputs.front().data.data())[i] LOG(INFO) << "data: " << static_cast<float*>(outputs.front().data.data())[i]
<< " result: " << result[i]; << " result: " << result[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i], EXPECT_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
result[i]); 1e-3);
} }
} }
......
...@@ -123,8 +123,6 @@ struct Argument { ...@@ -123,8 +123,6 @@ struct Argument {
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller,
std::function<bool(const framework::ir::Node*)>);
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
......
...@@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument,
for (const std::string &pass_name : passes) { for (const std::string &pass_name : passes) {
auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
// Set some pass attributes.
if (pass_name == "ir_analysis_pass") {
pass->Set("tensorrt_node_teller",
new SubgraphDetector::NodeInsideSubgraphTeller(
argument->tensorrt_node_teller()));
}
if (pass_name == "graph_viz_pass") { if (pass_name == "graph_viz_pass") {
std::string dot_file_path = std::to_string(pass_num) + "_ir_" + std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
(pre_pass.empty() ? "origin" : pre_pass) + (pre_pass.empty() ? "origin" : pre_pass) +
...@@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument,
} }
if (pass_name == "tensorrt_subgraph_pass") { if (pass_name == "tensorrt_subgraph_pass") {
PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
pass->SetNotOwned("tensorrt_node_teller",
argument->tensorrt_node_teller_ptr());
pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
pass->Set("min_subgraph_size", pass->Set("min_subgraph_size",
......
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector)
set(analysis_deps ${analysis_deps}
subgraph_detector tensorrt_subgraph_pass
CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) if (TENSORRT_FOUND)
file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
set(analysis_deps ${analysis_deps}
subgraph_detector tensorrt_subgraph_pass
CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
endif()
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -35,8 +36,10 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl( ...@@ -35,8 +36,10 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const { std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
auto teller = auto teller = [](const framework::ir::Node *node) {
Get<SubgraphDetector::NodeInsideSubgraphTeller>("tensorrt_node_teller"); if (!node->IsOp() || !node->Op()) return false;
return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
};
SubGraphFuser fuser(graph.get(), teller, SubGraphFuser fuser(graph.get(), teller,
Get<int>("min_subgraph_size") /*min subgraph size*/); Get<int>("min_subgraph_size") /*min subgraph size*/);
...@@ -232,7 +235,6 @@ std::vector<std::string> ExtractParameters( ...@@ -232,7 +235,6 @@ std::vector<std::string> ExtractParameters(
REGISTER_PASS(tensorrt_subgraph_pass, REGISTER_PASS(tensorrt_subgraph_pass,
paddle::inference::analysis::TensorRtSubgraphPass) paddle::inference::analysis::TensorRtSubgraphPass)
.RequirePassAttr("tensorrt_node_teller")
.RequirePassAttr("max_batch_size") .RequirePassAttr("max_batch_size")
.RequirePassAttr("workspace_size") .RequirePassAttr("workspace_size")
.RequirePassAttr("min_subgraph_size"); .RequirePassAttr("min_subgraph_size");
...@@ -7,4 +7,5 @@ set(analysis_deps ${analysis_deps} ...@@ -7,4 +7,5 @@ set(analysis_deps ${analysis_deps}
ir_graph_build_pass ir_graph_build_pass
ir_analysis_pass ir_analysis_pass
analysis_passes analysis_passes
subgraph_detector
CACHE INTERNAL "") CACHE INTERNAL "")
...@@ -27,9 +27,6 @@ namespace analysis { ...@@ -27,9 +27,6 @@ namespace analysis {
void IrAnalysisComposePass::RunImpl(Argument *argument) { void IrAnalysisComposePass::RunImpl(Argument *argument) {
ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
InitTensorRTAttrs(argument);
}
ApplyIrPasses(argument); ApplyIrPasses(argument);
CollectFusionStatis(argument); CollectFusionStatis(argument);
} }
...@@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const { ...@@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const {
return "ir-analysis-compose-pass"; return "ir-analysis-compose-pass";
} }
void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
LOG(INFO) << "Initing TensorRT pass";
argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) {
std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"conv2d_transpose", "leaky_relu"});
if (!node->IsOp()) return false;
if (teller_set.count(node->Op()->Type())) {
return true;
} else {
return false;
}
});
}
}
void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
std::vector<std::string> passes({ std::vector<std::string> passes({
"ir_graph_build_pass", "ir_analysis_pass", "ir_graph_build_pass", "ir_analysis_pass",
......
...@@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass { ...@@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass {
std::string repr() const override; std::string repr() const override;
private: private:
void InitTensorRTAttrs(Argument* argument);
void ApplyIrPasses(Argument* argument); void ApplyIrPasses(Argument* argument);
void CollectFusionStatis(Argument* argument); void CollectFusionStatis(Argument* argument);
......
...@@ -14,86 +14,101 @@ ...@@ -14,86 +14,101 @@
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle_pass_builder.h" // NOLINT #include "paddle/fluid/platform/gpu_info.h"
namespace paddle { namespace paddle {
PassStrategy *contrib::AnalysisConfig::pass_builder() const { PassStrategy *contrib::AnalysisConfig::pass_builder() const {
PADDLE_ENFORCE( if (!pass_builder_.get()) {
pass_builder_.get(), if (use_gpu_) {
"Should call constructor first, that will init the pass_builder_."); LOG(INFO) << "Create GPU IR passes";
pass_builder_.reset(new GpuPassStrategy);
} else {
LOG(INFO) << "Create CPU IR passes";
pass_builder_.reset(new CpuPassStrategy);
}
} else if (pass_builder_->use_gpu() ^ use_gpu()) {
LOG(WARNING) << "The use_gpu flag is not compatible between Config and "
"PassBuilder, the flags are "
<< use_gpu() << " " << pass_builder_->use_gpu();
LOG(WARNING) << "Please make them compatible, still use the existing "
"PassBuilder.";
}
return pass_builder_.get(); return pass_builder_.get();
} }
contrib::AnalysisConfig::AnalysisConfig(bool use_gpu) { contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
this->use_gpu = use_gpu; model_dir_ = model_dir;
if (use_gpu) { }
pass_builder_.reset(new GpuPassStrategy); contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
} else { const std::string &params_file) {
pass_builder_.reset(new CpuPassStrategy); prog_file_ = prog_file;
} params_file_ = params_file;
}
void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path,
const std::string &params_file_path) {
prog_file_ = prog_file_path;
params_file_ = params_file_path;
}
void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
int device_id) {
#ifdef PADDLE_WITH_CUDA
use_gpu_ = true;
memory_pool_init_size_mb_ = memory_pool_init_size_mb;
device_id_ = device_id;
#else
LOG(ERROR) << "Please compile with gpu to EnableGpu";
use_gpu_ = false;
#endif
} }
void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; }
contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
// fields from Config #define CP_MEMBER(member__) member__ = other.member__;
model_dir = other.model_dir;
// fields from NativeConfig // Model related.
use_gpu = other.use_gpu; CP_MEMBER(model_dir_);
device = other.device; CP_MEMBER(prog_file_);
fraction_of_gpu_memory = other.fraction_of_gpu_memory; CP_MEMBER(params_file_);
prog_file = other.prog_file; CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and
param_file = other.param_file; // params_file_ fields.
specify_input_name = other.specify_input_name; // Gpu releated.
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; CP_MEMBER(use_gpu_);
// fields from this. CP_MEMBER(device_id_);
enable_ir_optim = other.enable_ir_optim; CP_MEMBER(memory_pool_init_size_mb_);
// For mkldnn // TensorRT releated.
use_mkldnn_ = other.use_mkldnn_; CP_MEMBER(use_tensorrt_);
mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; CP_MEMBER(tensorrt_workspace_size_);
CP_MEMBER(tensorrt_max_batchsize_);
use_feed_fetch_ops = other.use_feed_fetch_ops; CP_MEMBER(tensorrt_min_subgraph_size_);
use_tensorrt_ = other.use_tensorrt_; // MKLDNN releated.
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; CP_MEMBER(use_mkldnn_);
tensorrt_workspace_size_ = other.tensorrt_workspace_size_; CP_MEMBER(mkldnn_enabled_op_types_);
tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
model_from_memory_ = other.model_from_memory_; // Ir related.
CP_MEMBER(enable_ir_optim_);
if (use_gpu) { CP_MEMBER(use_feed_fetch_ops_);
CP_MEMBER(ir_debug_);
CP_MEMBER(specify_input_name_);
CP_MEMBER(cpu_math_library_num_threads_);
CP_MEMBER(serialized_info_cache_);
if (use_gpu_) {
pass_builder_.reset(new GpuPassStrategy( pass_builder_.reset(new GpuPassStrategy(
*static_cast<GpuPassStrategy *>(other.pass_builder()))); *static_cast<GpuPassStrategy *>(other.pass_builder())));
} else { } else {
pass_builder_.reset(new CpuPassStrategy( pass_builder_.reset(new CpuPassStrategy(
*static_cast<CpuPassStrategy *>(other.pass_builder()))); *static_cast<CpuPassStrategy *>(other.pass_builder())));
} }
}
contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { #undef CP_MEMBER
// fields from Config
model_dir = other.model_dir;
// fields from NativeConfig
use_gpu = other.use_gpu;
device = other.device;
fraction_of_gpu_memory = other.fraction_of_gpu_memory;
prog_file = other.prog_file;
param_file = other.param_file;
specify_input_name = other.specify_input_name;
cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
// fields from this.
enable_ir_optim = other.enable_ir_optim;
// For mkldnn
use_mkldnn_ = other.use_mkldnn_;
mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
use_feed_fetch_ops = other.use_feed_fetch_ops;
use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
model_from_memory_ = other.model_from_memory_;
pass_builder_ = std::move(other.pass_builder_);
} }
void contrib::AnalysisConfig::EnableMKLDNN() { void contrib::AnalysisConfig::EnableMKLDNN() {
...@@ -112,17 +127,90 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, ...@@ -112,17 +127,90 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
use_tensorrt_ = true; use_tensorrt_ = true;
tensorrt_workspace_size_ = workspace_size; tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size; tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size; }
// Append after the conv+affine_channel fuse pass.
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); void contrib::AnalysisConfig::Update() {
auto info = SerializeInfoCache();
if (info == serialized_info_cache_) return;
if (use_gpu_) {
pass_builder_.reset(new GpuPassStrategy);
} else {
pass_builder_.reset(new CpuPassStrategy);
}
if (use_tensorrt_) {
if (!use_gpu_) {
LOG(ERROR)
<< "TensorRT engine is not available when EnableGpu() not actived.";
} else {
// Append after the infer_clean pass.
pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
}
}
if (use_mkldnn_) {
if (!enable_ir_optim_) {
LOG(ERROR)
<< "EnableMKLDNN() only works when IR optimization is enabled.";
}
#ifdef PADDLE_WITH_MKLDNN
pass_builder()->EnableMKLDNN();
use_mkldnn_ = true;
#else
LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
use_mkldnn_ = false;
#endif
}
if (ir_debug_) {
pass_builder()->TurnOnDebug();
}
}
std::string contrib::AnalysisConfig::SerializeInfoCache() {
std::stringstream ss;
ss << use_gpu_;
ss << memory_pool_init_size_mb_;
ss << use_tensorrt_;
ss << tensorrt_workspace_size_;
ss << tensorrt_max_batchsize_;
ss << use_mkldnn_;
ss << enable_ir_optim_;
ss << use_feed_fetch_ops_;
ss << ir_debug_;
return ss.str();
}
void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
int cpu_math_library_num_threads) {
cpu_math_library_num_threads_ = cpu_math_library_num_threads;
}
float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#ifdef PADDLE_WITH_CUDA
// Get the GPU memory details and calculate the fraction of memory for the
// GPU memory pool.
size_t gpu_used, gpu_available;
platform::GpuMemoryUsage(&gpu_used, &gpu_available);
double total_gpu_memory = (gpu_used + gpu_available) / 1024. / 1024.;
float fraction_of_gpu_memory =
static_cast<double>(memory_pool_init_size_mb()) / total_gpu_memory;
return fraction_of_gpu_memory;
#else
return 0.;
#endif
} }
void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
size_t prog_buffer_size, size_t prog_buffer_size,
const char *param_buffer, const char *param_buffer,
size_t param_buffer_size) { size_t param_buffer_size) {
prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size); prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size);
param_file = std::string(param_buffer, param_buffer + param_buffer_size); params_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true; model_from_memory_ = true;
} }
......
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DECLARE_bool(profile); DECLARE_bool(profile);
...@@ -59,8 +60,8 @@ bool AnalysisPredictor::Init( ...@@ -59,8 +60,8 @@ bool AnalysisPredictor::Init(
if (FLAGS_profile) { if (FLAGS_profile) {
LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(WARNING) << "Profiler is actived, might affect the performance";
LOG(INFO) << "You can turn off by set gflags '-profile false'"; LOG(INFO) << "You can turn off by set gflags '-profile false'";
auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll
: platform::ProfilerState::kCPU; : platform::ProfilerState::kCPU;
platform::EnableProfiler(tracking_device); platform::EnableProfiler(tracking_device);
} }
...@@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram( ...@@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram(
// Optimize the program, and load parameters and modify them in the // Optimize the program, and load parameters and modify them in the
// scope_. // scope_.
// This will change the scope_ address. // This will change the scope_ address.
if (config_.enable_ir_optim) { if (config_.ir_optim()) {
status_ir_optim_enabled_ = true; status_ir_optim_enabled_ = true;
OptimizeInferenceProgram(); OptimizeInferenceProgram();
} else { } else {
...@@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram( ...@@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram(
return true; return true;
} }
bool AnalysisPredictor::CreateExecutor() { bool AnalysisPredictor::CreateExecutor() {
if (config_.use_gpu) { if (config_.use_gpu_) {
status_use_gpu_ = true; status_use_gpu_ = true;
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device_id_);
} else { } else {
place_ = paddle::platform::CPUPlace(); place_ = paddle::platform::CPUPlace();
} }
...@@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() { ...@@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() {
} }
bool AnalysisPredictor::PrepareExecutor() { bool AnalysisPredictor::PrepareExecutor() {
executor_->Prepare(sub_scope_, *inference_program_, 0, executor_->Prepare(sub_scope_, *inference_program_, 0,
config_.use_feed_fetch_ops); config_.use_feed_fetch_ops_);
PADDLE_ENFORCE_NOT_NULL(sub_scope_); PADDLE_ENFORCE_NOT_NULL(sub_scope_);
...@@ -250,7 +251,7 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -250,7 +251,7 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
} }
input.set_lod(lod); input.set_lod(lod);
int idx = -1; int idx = -1;
if (config_.specify_input_name) { if (config_.specify_input_name_) {
auto name = inputs[i].name; auto name = inputs[i].name;
if (feed_names_.find(name) == feed_names_.end()) { if (feed_names_.find(name) == feed_names_.end()) {
LOG(ERROR) << "feed names from program do not have name: [" << name LOG(ERROR) << "feed names from program do not have name: [" << name
...@@ -314,22 +315,22 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs, ...@@ -314,22 +315,22 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
void AnalysisPredictor::OptimizeInferenceProgram() { void AnalysisPredictor::OptimizeInferenceProgram() {
status_program_optimized_ = true; status_program_optimized_ = true;
argument_.SetUseGPU(config_.use_gpu); argument_.SetUseGPU(config_.use_gpu());
argument_.SetGPUDeviceId(config_.device); argument_.SetGPUDeviceId(config_.gpu_device_id());
argument_.SetModelFromMemory(config_.model_from_memory_); argument_.SetModelFromMemory(config_.model_from_memory_);
// Analyze inference_program // Analyze inference_program
if (!config_.model_dir.empty()) { if (!config_.model_dir().empty()) {
argument_.SetModelDir(config_.model_dir); argument_.SetModelDir(config_.model_dir());
} else { } else {
PADDLE_ENFORCE( PADDLE_ENFORCE(
!config_.param_file.empty(), !config_.params_file().empty(),
"Either model_dir or (param_file, prog_file) should be set."); "Either model_dir or (param_file, prog_file) should be set.");
PADDLE_ENFORCE(!config_.prog_file.empty()); PADDLE_ENFORCE(!config_.prog_file().empty());
argument_.SetModelProgramPath(config_.prog_file); argument_.SetModelProgramPath(config_.prog_file());
argument_.SetModelParamsPath(config_.param_file); argument_.SetModelParamsPath(config_.params_file());
} }
if (config_.use_gpu && config_.use_tensorrt_) { if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
argument_.SetUseTensorRT(true); argument_.SetUseTensorRT(true);
argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
...@@ -341,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -341,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
} }
auto passes = config_.pass_builder()->AllPasses(); auto passes = config_.pass_builder()->AllPasses();
if (!config_.enable_ir_optim) passes.clear(); if (!config_.ir_optim()) passes.clear();
argument_.SetIrAnalysisPasses(passes); argument_.SetIrAnalysisPasses(passes);
argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get())); argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
Analyzer().Run(&argument_); Analyzer().Run(&argument_);
...@@ -358,18 +359,26 @@ template <> ...@@ -358,18 +359,26 @@ template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
VLOG(3) << "create AnalysisConfig"; VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) { if (config.use_gpu()) {
// 1. GPU memeroy // 1. GPU memeroy
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
config.fraction_of_gpu_memory, 0.f, PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
"fraction_of_gpu_memory in the config should be set to range (0., 1.]"); config.gpu_device_id());
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector<std::string> flags; std::vector<std::string> flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) { float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
if (fraction_of_gpu_memory > 0.95f) {
LOG(ERROR)
<< "Allocate too much memory for the GPU memory pool, assigned "
<< config.memory_pool_init_size_mb() << " MB";
LOG(ERROR)
<< "Try to shink the value by setting AnalysisConfig::EnableGpu(...)";
}
if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) {
flags.push_back("dummpy"); flags.push_back("dummpy");
std::string flag = "--fraction_of_gpu_memory_to_use=" + std::string flag = "--fraction_of_gpu_memory_to_use=" +
std::to_string(config.fraction_of_gpu_memory); std::to_string(fraction_of_gpu_memory);
flags.push_back(flag); flags.push_back(flag);
VLOG(3) << "set flag: " << flag; VLOG(3) << "set flag: " << flag;
framework::InitGflags(flags); framework::InitGflags(flags);
...@@ -443,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() { ...@@ -443,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() {
bool AnalysisPredictor::LoadProgramDesc() { bool AnalysisPredictor::LoadProgramDesc() {
// Initialize the inference program // Initialize the inference program
std::string filename; std::string filename;
if (!config_.model_dir.empty()) { if (!config_.model_dir().empty()) {
filename = config_.model_dir + "/__model__"; filename = config_.model_dir() + "/__model__";
} else if (!config_.prog_file.empty() && !config_.param_file.empty()) { } else if (!config_.prog_file().empty() && !config_.params_file().empty()) {
// All parameters are saved in a single file. // All parameters are saved in a single file.
// The file names should be consistent with that used // The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`. // in Python API `fluid.io.save_inference_model`.
filename = config_.prog_file; filename = config_.prog_file();
} else { } else {
if (config_.model_dir.empty() && config_.prog_file.empty()) { if (config_.model_dir().empty() && config_.prog_file().empty()) {
LOG(ERROR) LOG(ERROR)
<< "Either model_dir or (prog_file, param_file) should be set."; << "Either model_dir or (prog_file, param_file) should be set.";
return false; return false;
} }
LOG(ERROR) << string::Sprintf( LOG(ERROR) << string::Sprintf(
"not valid model path '%s' or program path '%s'.", config_.model_dir, "not valid model path '%s' or program path '%s'.", config_.model_dir(),
config_.param_file); config_.params_file());
return false; return false;
} }
...@@ -478,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() { ...@@ -478,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() {
proto.ParseFromString(pb_content); proto.ParseFromString(pb_content);
} else { } else {
proto.ParseFromString(config_.prog_file); proto.ParseFromString(config_.prog_file());
} }
inference_program_.reset(new framework::ProgramDesc(proto)); inference_program_.reset(new framework::ProgramDesc(proto));
return true; return true;
...@@ -508,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() { ...@@ -508,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() {
new_var->SetLoDLevel(var->GetLoDLevel()); new_var->SetLoDLevel(var->GetLoDLevel());
new_var->SetPersistable(true); new_var->SetPersistable(true);
if (!config_.param_file.empty()) { if (!config_.params_file().empty()) {
params.push_back(new_var->Name()); params.push_back(new_var->Name());
} else { } else {
// append_op // append_op
framework::OpDesc *op = load_block->AppendOp(); framework::OpDesc *op = load_block->AppendOp();
op->SetType("load"); op->SetType("load");
op->SetOutput("Out", {new_var->Name()}); op->SetOutput("Out", {new_var->Name()});
op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()}); op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()});
op->CheckAttrs(); op->CheckAttrs();
} }
} }
} }
if (!config_.param_file.empty()) { if (!config_.params_file().empty()) {
// sort paramlist to have consistent ordering // sort paramlist to have consistent ordering
std::sort(params.begin(), params.end()); std::sort(params.begin(), params.end());
// append just the load_combine op // append just the load_combine op
framework::OpDesc *op = load_block->AppendOp(); framework::OpDesc *op = load_block->AppendOp();
op->SetType("load_combine"); op->SetType("load_combine");
op->SetOutput("Out", params); op->SetOutput("Out", params);
op->SetAttr("file_path", {config_.param_file}); op->SetAttr("file_path", {config_.params_file()});
op->CheckAttrs(); op->CheckAttrs();
} }
......
...@@ -35,8 +35,11 @@ using framework::proto::ProgramDesc; ...@@ -35,8 +35,11 @@ using framework::proto::ProgramDesc;
using framework::NaiveExecutor; using framework::NaiveExecutor;
using contrib::AnalysisConfig; using contrib::AnalysisConfig;
/* This predictor is based on the original native predictor with IR and Analysis /** \brief This predictor is based on the original native predictor with IR and
* support. It will optimize IR and Parameters in the runtime. * Analysis support.
*
* It will optimize IR and Parameters in the runtime.
*
* TODO(Superjomn) Replace the Navive predictor? * TODO(Superjomn) Replace the Navive predictor?
*/ */
class AnalysisPredictor : public PaddlePredictor { class AnalysisPredictor : public PaddlePredictor {
......
...@@ -25,9 +25,9 @@ namespace paddle { ...@@ -25,9 +25,9 @@ namespace paddle {
using contrib::AnalysisConfig; using contrib::AnalysisConfig;
TEST(AnalysisPredictor, analysis_off) { TEST(AnalysisPredictor, analysis_off) {
AnalysisConfig config(false); AnalysisConfig config;
config.model_dir = FLAGS_dirname; config.SetModel(FLAGS_dirname);
config.enable_ir_optim = false; config.SwitchIrOptim(false);
auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config); auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get()); auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
...@@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) { ...@@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) {
} }
TEST(AnalysisPredictor, analysis_on) { TEST(AnalysisPredictor, analysis_on) {
AnalysisConfig config;
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim(true);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
AnalysisConfig config(true); config.EnableUseGpu(100, 0);
config.fraction_of_gpu_memory = 0.15;
#else #else
AnalysisConfig config; config.DisableGpu();
#endif #endif
config.model_dir = FLAGS_dirname;
config.enable_ir_optim = true;
auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config); auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get()); auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
...@@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) { ...@@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) {
} }
// compare with NativePredictor // compare with NativePredictor
auto naive_predictor = CreatePaddlePredictor<NativeConfig>(config); auto naive_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
std::vector<PaddleTensor> naive_outputs; std::vector<PaddleTensor> naive_outputs;
ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs)); ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs));
ASSERT_EQ(naive_outputs.size(), 1UL); ASSERT_EQ(naive_outputs.size(), 1UL);
...@@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) { ...@@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) {
TEST(AnalysisPredictor, ZeroCopy) { TEST(AnalysisPredictor, ZeroCopy) {
AnalysisConfig config; AnalysisConfig config;
config.model_dir = FLAGS_dirname; config.SetModel(FLAGS_dirname);
config.use_feed_fetch_ops = false; config.SwitchUseFeedFetchOps(false);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config); auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto w0 = predictor->GetInputTensor("firstw"); auto w0 = predictor->GetInputTensor("firstw");
...@@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) { ...@@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) {
TEST(AnalysisPredictor, Clone) { TEST(AnalysisPredictor, Clone) {
AnalysisConfig config; AnalysisConfig config;
config.model_dir = FLAGS_dirname; config.SetModel(FLAGS_dirname);
config.use_feed_fetch_ops = true; config.SwitchUseFeedFetchOps(true);
config.enable_ir_optim = true; config.SwitchIrOptim(true);
std::vector<std::unique_ptr<PaddlePredictor>> predictors; std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreatePaddlePredictor(config)); predictors.emplace_back(CreatePaddlePredictor(config));
......
...@@ -19,8 +19,6 @@ limitations under the License. */ ...@@ -19,8 +19,6 @@ limitations under the License. */
#pragma once #pragma once
#define WITH_ANAKIN
#include <vector> #include <vector>
#include "framework/core/net/net.h" #include "framework/core/net/net.h"
......
...@@ -288,7 +288,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -288,7 +288,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
VLOG(3) << "create NativePaddlePredictor"; VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memeroy // 1. GPU memeroy
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GE(
config.fraction_of_gpu_memory, 0.f, config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]"); "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
......
...@@ -19,7 +19,6 @@ limitations under the License. */ ...@@ -19,7 +19,6 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
......
...@@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) { ...@@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) {
#endif #endif
TEST(PassBuilder, Delete) { TEST(PassBuilder, Delete) {
contrib::AnalysisConfig config(false); contrib::AnalysisConfig config;
config.DisableGpu();
config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); config.pass_builder()->DeletePass("attention_lstm_fuse_pass");
const auto& passes = config.pass_builder()->AllPasses(); const auto& passes = config.pass_builder()->AllPasses();
auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass"); auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass");
......
...@@ -116,6 +116,10 @@ D ...@@ -116,6 +116,10 @@ D
--modeldir=$DATA_DIR/mobilenet/model \ --modeldir=$DATA_DIR/mobilenet/model \
--data=$DATA_DIR/mobilenet/data.txt \ --data=$DATA_DIR/mobilenet/data.txt \
--refer=$DATA_DIR/mobilenet/result.txt --refer=$DATA_DIR/mobilenet/result.txt
if [ $? -ne 0 ]; then
echo "trt demo trt_mobilenet_demo runs fail."
exit 1
fi
fi fi
done done
set +x set +x
...@@ -36,12 +36,11 @@ namespace demo { ...@@ -36,12 +36,11 @@ namespace demo {
*/ */
void Main() { void Main() {
std::unique_ptr<PaddlePredictor> predictor; std::unique_ptr<PaddlePredictor> predictor;
paddle::contrib::AnalysisConfig config(true); paddle::contrib::AnalysisConfig config;
config.param_file = FLAGS_modeldir + "/__params__"; config.EnableUseGpu(100, 0);
config.prog_file = FLAGS_modeldir + "/__model__"; config.SetModel(FLAGS_modeldir + "/__model__",
config.device = 0; FLAGS_modeldir + "/__params__");
config.EnableTensorRtEngine(); config.EnableTensorRtEngine();
config.fraction_of_gpu_memory = 0.1; // set by yourself
predictor = CreatePaddlePredictor(config); predictor = CreatePaddlePredictor(config);
VLOG(3) << "begin to process data"; VLOG(3) << "begin to process data";
......
...@@ -40,15 +40,14 @@ using contrib::AnalysisConfig; ...@@ -40,15 +40,14 @@ using contrib::AnalysisConfig;
*/ */
void Main(bool use_gpu) { void Main(bool use_gpu) {
std::unique_ptr<PaddlePredictor> predictor, analysis_predictor; std::unique_ptr<PaddlePredictor> predictor, analysis_predictor;
AnalysisConfig config(use_gpu); AnalysisConfig config;
config.param_file = FLAGS_modeldir + "/__params__"; if (use_gpu) {
config.prog_file = FLAGS_modeldir + "/__model__"; config.EnableUseGpu(100, 0);
config.device = 0;
if (FLAGS_use_gpu) {
config.fraction_of_gpu_memory = 0.1; // set by yourself
} }
config.SetModel(FLAGS_modeldir + "/__model__",
FLAGS_modeldir + "/__params__");
predictor = CreatePaddlePredictor<NativeConfig>(config); predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
analysis_predictor = CreatePaddlePredictor(config); analysis_predictor = CreatePaddlePredictor(config);
// Just a single batch of data. // Just a single batch of data.
......
...@@ -19,6 +19,8 @@ ...@@ -19,6 +19,8 @@
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
/*! \file */
// Here we include some header files with relative paths, for that in deploy, // Here we include some header files with relative paths, for that in deploy,
// the abstract path of this header file will be changed. // the abstract path of this header file will be changed.
#include "paddle_api.h" // NOLINT #include "paddle_api.h" // NOLINT
...@@ -34,41 +36,188 @@ class AnalysisPredictor; ...@@ -34,41 +36,188 @@ class AnalysisPredictor;
namespace contrib { namespace contrib {
// NOTE WIP, not stable yet. // NOTE WIP, not stable yet.
struct AnalysisConfig : public NativeConfig { struct AnalysisConfig {
explicit AnalysisConfig(bool use_gpu = false); AnalysisConfig() = default;
explicit AnalysisConfig(const AnalysisConfig& other); explicit AnalysisConfig(const AnalysisConfig& other);
explicit AnalysisConfig(AnalysisConfig&& other); explicit AnalysisConfig(const std::string& model_dir);
explicit AnalysisConfig(const std::string& prog_file,
const std::string& params_file);
// Determine whether to perform graph optimization. /** Set model with a directory.
bool enable_ir_optim = true; */
void SetModel(const std::string& model_dir) { model_dir_ = model_dir; }
/** Set model with two specific pathes for program and parameters.
*/
void SetModel(const std::string& prog_file_path,
const std::string& params_file_path);
/** Set program file path.
*/
void SetProgFile(const std::string& x) { prog_file_ = x; }
/** Set parameter composed file path.
*/
void SetParamsFile(const std::string& x) { params_file_ = x; }
/** Get the model directory path.
*/
const std::string& model_dir() const { return model_dir_; }
/** Get the program file path.
*/
const std::string& prog_file() const { return prog_file_; }
/** Get the composed parameters file.
*/
const std::string& params_file() const { return params_file_; }
// Get a pass builder for customize the passes in IR analysis phase. // GPU related.
PassStrategy* pass_builder() const;
/**
* \brief Turn on GPU.
* @param memory_pool_init_size_mb initial size of the GPU memory pool in MB.
* @param device_id the GPU card to use (default is 0).
*/
void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0);
/** Turn off the GPU.
*/
void DisableGpu();
/** A bool state telling whether the GPU is turned on.
*/
bool use_gpu() const { return use_gpu_; }
/** Get the GPU device id.
*/
int gpu_device_id() const { return device_id_; }
/** Get the initial size in MB of the GPU memory pool.
*/
int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; }
/** Get the proportion of the initial memory pool size compared to the device.
*/
float fraction_of_gpu_memory_for_pool() const;
/** \brief Control whether to perform IR graph optimization.
*
* If turned off, the AnalysisConfig will act just like a NativeConfig.
*/
void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; }
/** A boolean state tell whether the ir graph optimization is actived.
*/
bool ir_optim() const { return enable_ir_optim_; }
// NOT stable yet. /** \brief INTERNAL Determine whether to use the feed and fetch operators.
bool use_feed_fetch_ops{true}; * Just for internal development, not stable yet.
* When ZeroCopyTensor is used, this should turned off.
*/
void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; }
/** A boolean state telling whether to use the feed and fetch operators.
*/
bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; }
/** \brief Control whether to specify the inputs' names.
*
* The PaddleTensor type has a `name` member, assign it with the corresponding
* variable name. This is used only when the input PaddleTensors passed to the
* `PaddlePredictor.Run(...)` cannot follow the order in the training phase.
*/
void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; }
/** A boolean state tell whether the input PaddleTensor names specified should
* be used to reorder the inputs in `PaddlePredictor.Run(...)`.
*/
bool specify_input_name() const { return specify_input_name_; }
/**
* \brief Turn on the TensorRT engine.
*
* The TensorRT engine will accelerate some subgraphes in the original Fluid
* computation graph. In some models such as TensorRT50, GoogleNet and so on,
* it gains significant performance acceleration.
*
* @param workspace_size the memory size(in byte) used for TensorRT workspace.
* @param max_batch_size the maximum batch size of this prediction task,
* better set as small as possible, or performance loss.
* @param min_subgrpah_size the minimum TensorRT subgraph size needed, if a
* subgraph is less than this, it will not transfer to TensorRT engine.
*/
void EnableTensorRtEngine(int workspace_size = 1 << 20, void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3); int max_batch_size = 1, int min_subgraph_size = 3);
bool use_tensorrt() const { return use_tensorrt_; } /** A boolean state telling whether the TensorRT engine is used.
*/
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
/** Control whther to debug IR graph analysis phase.
*/
void SwitchIrDebug(int x = true) { ir_debug_ = x; }
/** Turn on MKLDNN.
*/
void EnableMKLDNN(); void EnableMKLDNN();
bool use_mkldnn() const { return use_mkldnn_; } /** A boolean state telling whether to use the MKLDNN.
*/
bool mkldnn_enabled() const { return use_mkldnn_; }
/** Set and get the number of cpu math library threads.
*/
void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads);
/** An int state telling how many threads are used in the CPU math library.
*/
int cpu_math_library_num_threads() const {
return cpu_math_library_num_threads_;
}
/** Transform the AnalysisConfig to NativeConfig.
*/
NativeConfig ToNativeConfig() const {
NativeConfig config;
config.model_dir = model_dir_;
config.prog_file = prog_file_;
config.param_file = params_file_;
config.use_gpu = use_gpu_;
config.device = device_id_;
config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
config.specify_input_name = specify_input_name_;
return config;
}
/** Specify the operator type list to use MKLDNN acceleration.
* @param op_list the operator type list.
*/
void SetMKLDNNOp(std::unordered_set<std::string> op_list) { void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
mkldnn_enabled_op_types_ = op_list; mkldnn_enabled_op_types_ = op_list;
} }
// Specify the memory buffer of program and parameter /** Specify the memory buffer of program and parameter
* @param prog_buffer the memory buffer of program.
* @param prog_buffer_size the size of the data.
* @param params_buffer the memory buffer of the composed parameters file.
* @param params_buffer_size the size of the commposed parameters data.
*/
void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
const char* program_buffer, size_t program_buffer_size); const char* params_buffer, size_t params_buffer_size);
/** A boolean state telling whether the model is set from the CPU memory.
*/
bool model_from_memory() const { return model_from_memory_; } bool model_from_memory() const { return model_from_memory_; }
friend class ::paddle::AnalysisPredictor; friend class ::paddle::AnalysisPredictor;
/** NOTE just for developer, not an official API, easily to be broken.
* Get a pass builder for customize the passes in IR analysis phase.
*/
PassStrategy* pass_builder() const;
protected: protected:
// Update the config.
void Update();
std::string SerializeInfoCache();
protected:
// Model pathes.
std::string model_dir_;
std::string prog_file_;
std::string params_file_;
// GPU releated.
bool use_gpu_{false};
int device_id_{0};
uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB.
// TensorRT releated.
bool use_tensorrt_{false}; bool use_tensorrt_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
// For workspace_size, refer it from here: // For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
int tensorrt_workspace_size_; int tensorrt_workspace_size_;
...@@ -82,17 +231,24 @@ struct AnalysisConfig : public NativeConfig { ...@@ -82,17 +231,24 @@ struct AnalysisConfig : public NativeConfig {
// We set this variable to control the minimum number of nodes in the // We set this variable to control the minimum number of nodes in the
// subgraph, 3 as default value. // subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3}; int tensorrt_min_subgraph_size_{3};
std::unique_ptr<PassStrategy> pass_builder_;
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
bool model_from_memory_{false}; bool model_from_memory_{false};
};
// Configurations for Anakin engine. bool enable_ir_optim_{true};
struct AnakinConfig : public PaddlePredictor::Config { bool use_feed_fetch_ops_{true};
enum TargetType { NVGPU = 0, X86 }; bool ir_debug_{false};
int device;
std::string model_file; bool specify_input_name_{false};
int max_batch_size{-1};
TargetType target_type; int cpu_math_library_num_threads_{1};
// A runtime cache, shouldn't be transferred to others.
std::string serialized_info_cache_;
mutable std::unique_ptr<PassStrategy> pass_builder_;
}; };
} // namespace contrib } // namespace contrib
......
...@@ -13,61 +13,76 @@ ...@@ -13,61 +13,76 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
/*! \file paddle_api.h
*/
#include <cassert> #include <cassert>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
/*! \namespace paddle
*/
namespace paddle { namespace paddle {
// Data type. /** paddle data type.
*/
enum PaddleDType { enum PaddleDType {
FLOAT32, FLOAT32,
INT64, INT64,
// TODO(Superjomn) support more data types if needed. // TODO(Superjomn) support more data types if needed.
}; };
/* /**
* Memory menage for PaddleTensor. *\brief Memory menager for PaddleTensor.
* The PaddleBuf holds a buffer for data input or output. The memory can be
* allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
* should be reused for better performance.
* *
* For user allocated memory, the following API can be used: *The PaddleBuf holds a buffer for data input or output. The memory can be
* - PaddleBuf(void* data, size_t length) to set an external memory by *allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
* specifying *should be reused for better performance.
* the memory address and length.
* - Reset(void* data, size_t length) to reset the PaddleBuf with an external
* memory.
* ATTENTION, for user allocated memory, deallocation should be done by users
* externally after the program finished. The PaddleBuf won't do any allocation
* or deallocation.
* *
* To have the PaddleBuf allocate and manage the memory: *For user allocated memory, the following API can be used:
* - PaddleBuf(size_t length) will allocate a memory of size `length`. *- PaddleBuf(void* data, size_t length) to set an external memory by
* - Resize(size_t length) resize the memory to no less than `length`, ATTENTION *specifying
* if the allocated memory is larger than `length`, nothing will done. * the memory address and length.
*- Reset(void* data, size_t length) to reset the PaddleBuf with an external
*memory.
*ATTENTION, for user allocated memory, deallocation should be done by users
*externally after the program finished. The PaddleBuf won't do any allocation
*or deallocation.
*
*To have the PaddleBuf allocate and manage the memory:
*- PaddleBuf(size_t length) will allocate a memory of size `length`.
*- Resize(size_t length) resize the memory to no less than `length`, ATTENTION
* if the allocated memory is larger than `length`, nothing will done.
*/ */
class PaddleBuf { class PaddleBuf {
public: public:
// PaddleBuf allocate memory internally, and manage it. /** PaddleBuf allocate memory internally, and manage it.
*/
explicit PaddleBuf(size_t length) explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {} : data_(new char[length]), length_(length), memory_owned_(true) {}
// Set external memory, the PaddleBuf won't manage it. /** Set external memory, the PaddleBuf won't manage it.
*/
PaddleBuf(void* data, size_t length) PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {} : data_(data), length_(length), memory_owned_{false} {}
// Copy only available when memory is managed externally. /** Copy only available when memory is managed externally.
*/
explicit PaddleBuf(const PaddleBuf&); explicit PaddleBuf(const PaddleBuf&);
// Resize the memory. /** Resize the memory.
*/
void Resize(size_t length); void Resize(size_t length);
// Reset to external memory, with address and length set. /** Reset to external memory, with address and length set.
*/
void Reset(void* data, size_t length); void Reset(void* data, size_t length);
// Tell whether the buffer is empty. /** Tell whether the buffer is empty.
*/
bool empty() const { return length_ == 0; } bool empty() const { return length_ == 0; }
// Get the memory address. /** Get the memory address.
*/
void* data() const { return data_; } void* data() const { return data_; }
// Get the memory length. /** Get the memory length.
*/
size_t length() const { return length_; } size_t length() const { return length_; }
~PaddleBuf() { Free(); } ~PaddleBuf() { Free(); }
...@@ -83,7 +98,8 @@ class PaddleBuf { ...@@ -83,7 +98,8 @@ class PaddleBuf {
bool memory_owned_{true}; bool memory_owned_{true};
}; };
// Basic input and output data structure for PaddlePredictor. /** Basic input and output data structure for PaddlePredictor.
*/
struct PaddleTensor { struct PaddleTensor {
PaddleTensor() = default; PaddleTensor() = default;
std::string name; // variable name. std::string name; // variable name.
...@@ -94,19 +110,22 @@ struct PaddleTensor { ...@@ -94,19 +110,22 @@ struct PaddleTensor {
}; };
enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
// Tensor without copy, currently only supports AnalysisPredictor. /** Tensor without copy, currently only supports AnalysisPredictor.
*/
class ZeroCopyTensor { class ZeroCopyTensor {
public: public:
void Reshape(const std::vector<int>& shape); void Reshape(const std::vector<int>& shape);
// Get the memory in CPU or GPU with specific data type, should Reshape first /** Get the memory in CPU or GPU with specific data type, should Reshape first
// to tell the data size. * to tell the data size.
// Once can directly call this data to feed the data. * Once can directly call this data to feed the data.
// This is for write the input tensor. * This is for write the input tensor.
*/
template <typename T> template <typename T>
T* mutable_data(PaddlePlace place); T* mutable_data(PaddlePlace place);
// Get the memory directly, will return the place and memory size by pointer. /** Get the memory directly, will return the place and memory size by pointer.
// This is for reading the output tensor. * This is for reading the output tensor.
*/
template <typename T> template <typename T>
T* data(PaddlePlace* place, int* size) const; T* data(PaddlePlace* place, int* size) const;
...@@ -128,8 +147,7 @@ class ZeroCopyTensor { ...@@ -128,8 +147,7 @@ class ZeroCopyTensor {
void* scope_{nullptr}; void* scope_{nullptr};
}; };
/* /** A simple Inference API for Paddle.
* A simple Inference API for Paddle.
*/ */
class PaddlePredictor { class PaddlePredictor {
public: public:
...@@ -138,18 +156,20 @@ class PaddlePredictor { ...@@ -138,18 +156,20 @@ class PaddlePredictor {
PaddlePredictor(const PaddlePredictor&) = delete; PaddlePredictor(const PaddlePredictor&) = delete;
PaddlePredictor& operator=(const PaddlePredictor&) = delete; PaddlePredictor& operator=(const PaddlePredictor&) = delete;
// Predict an record. /** Predict an record.
// The caller should be responsible for allocating and releasing the memory of * The caller should be responsible for allocating and releasing the memory of
// `inputs`. `inputs` should be available until Run returns. Caller should be * `inputs`. `inputs` should be available until Run returns. Caller should be
// responsible for the output tensor's buffer, either allocated or passed from * responsible for the output tensor's buffer, either allocated or passed from
// outside. * outside.
*/
virtual bool Run(const std::vector<PaddleTensor>& inputs, virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data, std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0; int batch_size = -1) = 0;
// Zero copy input and output optimization. /** Zero copy input and output optimization.
// Get the input or output tensors, and operate on their memory directly, * Get the input or output tensors, and operate on their memory directly,
// without copy. * without copy.
*/
virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor( virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string& name) { const std::string& name) {
return nullptr; return nullptr;
...@@ -160,16 +180,19 @@ class PaddlePredictor { ...@@ -160,16 +180,19 @@ class PaddlePredictor {
} }
virtual bool ZeroCopyRun() { return false; } virtual bool ZeroCopyRun() { return false; }
// Clone a predictor that share the model weights, the Cloned predictor should /** Clone a predictor that share the model weights, the Cloned predictor
// be thread-safe. * should be thread-safe.
*/
virtual std::unique_ptr<PaddlePredictor> Clone() = 0; virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
// Destroy the Predictor. /** Destroy the Predictor.
*/
virtual ~PaddlePredictor() = default; virtual ~PaddlePredictor() = default;
// The common configs for all the predictors. /** The common configs for all the predictors.
*/
struct Config { struct Config {
std::string model_dir; // path to the model directory. std::string model_dir; /*!< path to the model directory. */
}; };
}; };
...@@ -177,17 +200,21 @@ struct NativeConfig : public PaddlePredictor::Config { ...@@ -177,17 +200,21 @@ struct NativeConfig : public PaddlePredictor::Config {
// GPU related fields. // GPU related fields.
bool use_gpu{false}; bool use_gpu{false};
int device{0}; int device{0};
float fraction_of_gpu_memory{-1.f}; // Change to a float in (0,1] if needed. float fraction_of_gpu_memory{
-1.f}; /*!< Change to a float in (0,1] if needed. */
// Specify the exact path of program and parameter files. // Specify the exact path of program and parameter files.
std::string prog_file; std::string prog_file;
std::string param_file; std::string param_file;
// Specify the variable's name of each input if input tensors don't follow the /** Specify the variable's name of each input if input tensors don't follow
// `feeds` and `fetches` of the phase `save_inference_model`. * the
* `feeds` and `fetches` of the phase `save_inference_model`.
*/
bool specify_input_name{false}; bool specify_input_name{false};
// Set and get the number of cpu math library threads. /** Set and get the number of cpu math library threads.
*/
void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) { void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
cpu_math_library_num_threads_ = cpu_math_library_num_threads; cpu_math_library_num_threads_ = cpu_math_library_num_threads;
} }
...@@ -201,28 +228,33 @@ struct NativeConfig : public PaddlePredictor::Config { ...@@ -201,28 +228,33 @@ struct NativeConfig : public PaddlePredictor::Config {
int cpu_math_library_num_threads_{1}; int cpu_math_library_num_threads_{1};
}; };
// A factory to help create different predictors. /*! \fn std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&
// * config);
// Usage: *
// * \brief A factory to help create different predictors.
// NativeConfig config; *
// ... // change the configs. * Usage:
// auto native_predictor = CreatePaddlePredictor(config); *
// * NativeConfig config;
// FOR EXTENSION DEVELOPER: * ... // change the configs.
// Different predictors are designated by config type. Similar configs can be * auto native_predictor = CreatePaddlePredictor(config);
// merged, but there shouldn't be a huge config containing different fields for *
// more than one kind of predictors. * FOR EXTENSION DEVELOPER:
* Different predictors are designated by config type. Similar configs can be
* merged, but there shouldn't be a huge config containing different fields for
* more than one kind of predictors.
*/
template <typename ConfigT> template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
// NOTE The following APIs are too trivial, we will discard it in the following /** NOTE The following APIs are too trivial, we will discard it in the following
// versions. * versions.
*/
enum class PaddleEngineKind { enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility. kNative = 0, /*!< Use the native Fluid facility. */
kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. kAutoMixedTensorRT, /*!< Automatically mix Fluid with TensorRT. */
kAnalysis, // More optimization. kAnalysis, /*!< More optimization. */
kAnakin // Use Anakin for inference, not mature yet. kAnakin /*!< Use Anakin for inference, not mature yet. */
}; };
template <typename ConfigT, PaddleEngineKind engine> template <typename ConfigT, PaddleEngineKind engine>
......
...@@ -26,9 +26,8 @@ limitations under the License. */ ...@@ -26,9 +26,8 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle_api.h" // NOLINT
#ifndef WITH_ANAKIN
#include "paddle_analysis_config.h" // NOLINT #include "paddle_analysis_config.h" // NOLINT
#else #include "paddle_api.h" // NOLINT
#ifdef WITH_ANAKIN
#include "paddle_anakin_config.h" // NOLINT #include "paddle_anakin_config.h" // NOLINT
#endif #endif
...@@ -18,30 +18,39 @@ ...@@ -18,30 +18,39 @@
#include <string> #include <string>
#include <vector> #include <vector>
/*! \file */
/*! \namespace paddle */
namespace paddle { namespace paddle {
/*
* This is a pass builder based on string. It is part of inference API. /** This is a pass builder based on string. It is part of inference API.
*/ */
class PaddlePassBuilder { class PaddlePassBuilder {
public: public:
explicit PaddlePassBuilder(const std::vector<std::string> &passes) explicit PaddlePassBuilder(const std::vector<std::string> &passes)
: passes_(passes) {} : passes_(passes) {}
/** Append a pass to the end of the passes. */
void AppendPass(const std::string &pass_type); void AppendPass(const std::string &pass_type);
/** Insert a pass to a specific position.
* @param idx the position to insert.
* @param pass_type the pass key.
*/
void InsertPass(size_t idx, const std::string &pass_type); void InsertPass(size_t idx, const std::string &pass_type);
// Delete the `idx`-th pass. /** Delete the `idx`-th pass. */
void DeletePass(size_t idx); void DeletePass(size_t idx);
// Delete all the passes that has type `pass_type`. /** Delete all the passes that has type `pass_type`. */
void DeletePass(const std::string &pass_type); void DeletePass(const std::string &pass_type);
// Visualize the computation graph after each pass by generating a DOT /** Visualize the computation graph after each pass by generating a DOT
// language file, one can draw them with the Graphviz toolkit. * language file, one can draw them with the Graphviz toolkit.
*/
void TurnOnDebug(); void TurnOnDebug();
// Human-readible information. /** Human-readible information. */
std::string DebugString(); std::string DebugString();
const std::vector<std::string> &AllPasses() const { return passes_; } const std::vector<std::string> &AllPasses() const { return passes_; }
...@@ -50,23 +59,27 @@ class PaddlePassBuilder { ...@@ -50,23 +59,27 @@ class PaddlePassBuilder {
std::vector<std::string> passes_; std::vector<std::string> passes_;
}; };
/* /**Pass strategy to help control the IR passes.
* Pass strategy to help control the IR passes.
*/ */
class PassStrategy : public PaddlePassBuilder { class PassStrategy : public PaddlePassBuilder {
public: public:
explicit PassStrategy(const std::vector<std::string> &passes) explicit PassStrategy(const std::vector<std::string> &passes)
: PaddlePassBuilder(passes) {} : PaddlePassBuilder(passes) {}
// The MKLDNN control exists in both CPU and GPU mode, because there can be /** The MKLDNN control exists in both CPU and GPU mode, because there can be
// still some CPU kernels running in CPU mode. * still some CPU kernels running in CPU mode.
*/
virtual void EnableMKLDNN() = 0; virtual void EnableMKLDNN() = 0;
bool use_gpu() const { return use_gpu_; }
virtual ~PassStrategy() = default; virtual ~PassStrategy() = default;
protected:
bool use_gpu_{false};
}; };
/* /** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
* The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
*/ */
class CpuPassStrategy : public PassStrategy { class CpuPassStrategy : public PassStrategy {
public: public:
...@@ -76,6 +89,7 @@ class CpuPassStrategy : public PassStrategy { ...@@ -76,6 +89,7 @@ class CpuPassStrategy : public PassStrategy {
passes_.assign({ passes_.assign({
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
"attention_lstm_fuse_pass", // "attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", // "seqconv_eltadd_relu_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", // // "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", // "fc_lstm_fuse_pass", //
...@@ -88,6 +102,7 @@ class CpuPassStrategy : public PassStrategy { ...@@ -88,6 +102,7 @@ class CpuPassStrategy : public PassStrategy {
"conv_eltwiseadd_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", //
"is_test_pass", // "is_test_pass", //
}); });
use_gpu_ = false;
} }
virtual ~CpuPassStrategy() = default; virtual ~CpuPassStrategy() = default;
...@@ -111,8 +126,7 @@ class CpuPassStrategy : public PassStrategy { ...@@ -111,8 +126,7 @@ class CpuPassStrategy : public PassStrategy {
CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {} CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {}
}; };
/* /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
* The GPU passes strategy, it is used in
*/ */
class GpuPassStrategy : public PassStrategy { class GpuPassStrategy : public PassStrategy {
public: public:
...@@ -126,10 +140,14 @@ class GpuPassStrategy : public PassStrategy { ...@@ -126,10 +140,14 @@ class GpuPassStrategy : public PassStrategy {
"conv_elementwise_add2_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", // "conv_elementwise_add_fuse_pass", //
}); });
use_gpu_ = true;
} }
GpuPassStrategy(const GpuPassStrategy &other) GpuPassStrategy(const GpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {} : PassStrategy(other.AllPasses()) {
use_gpu_ = true;
}
void EnableMKLDNN() override; void EnableMKLDNN() override;
......
nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
add_subdirectory(plugin) add_subdirectory(plugin)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/op_teller.h"
namespace paddle {
namespace inference {
namespace tensorrt {
// Just tell by the op_types.
struct SimpleOpTypeSetTeller : public Teller {
SimpleOpTypeSetTeller() {}
bool operator()(const std::string& op_type,
const framework::OpDesc& desc) override {
return teller_set.count(op_type);
}
private:
std::unordered_set<std::string> teller_set{
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"conv2d_transpose", "leaky_relu"}};
};
bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
for (auto& teller : tellers_) {
if ((*teller)(op_type, desc)) return true;
}
return false;
}
OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* Single Op teller definition.
* One can override this and define a more complex tell logic, considerring more
* issues such as op_desc.
*/
struct Teller {
virtual bool operator()(const std::string& op_type,
const framework::OpDesc& desc) = 0;
virtual ~Teller() = default;
};
/*
* A real example:
*
* struct SomeTeller : public Teller {
* bool operator()(const std::string& op_type,
* const framework::OpDesc& desc) override {
* return op_type == "fc" && desc.Inputs().size() == 2;
* }
*};
*/
/*
* class OpTeller helps to tell whether a fluid
* operator can be transformed to a TensorRT layer.
*/
class OpTeller {
public:
static OpTeller& Global() {
static std::unique_ptr<OpTeller> x(new OpTeller);
return *x;
}
bool Tell(const std::string& op_type, const framework::OpDesc& desc);
private:
OpTeller();
private:
std::vector<std::unique_ptr<Teller>> tellers_;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
...@@ -41,7 +41,7 @@ endfunction() ...@@ -41,7 +41,7 @@ endfunction()
if(NOT APPLE AND WITH_MKLML) if(NOT APPLE AND WITH_MKLML)
set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc SERIAL)
else() else()
# TODO: fix this test on MACOS and OPENBLAS, the reason is that # TODO: fix this test on MACOS and OPENBLAS, the reason is that
# fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
...@@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 ...@@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
# normal DAM # normal DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc) inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL)
# small DAM # small DAM
set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1) ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1 SERIAL)
# chinese_ner # chinese_ner
set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
...@@ -111,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ...@@ -111,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose
# resnet50 # resnet50
inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
"${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL)
# mobilenet with depthwise_conv op # mobilenet with depthwise_conv op
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)
# anakin # anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
......
...@@ -165,12 +165,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -165,12 +165,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
void SetConfig(contrib::AnalysisConfig *cfg) { void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
cfg->param_file = FLAGS_infer_model + "/param"; cfg->SwitchSpecifyInputNames();
cfg->use_gpu = false; cfg->SwitchIrOptim(true);
cfg->device = 0;
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -105,11 +105,10 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -105,11 +105,10 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model; cfg->SetModel(FLAGS_infer_model);
cfg->use_gpu = false; cfg->DisableGpu();
cfg->device = 0; cfg->SwitchSpecifyInputNames();
cfg->specify_input_name = true; cfg->SwitchIrOptim();
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -76,11 +76,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -76,11 +76,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
void SetConfig(contrib::AnalysisConfig *cfg) { void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model; cfg->SetModel(FLAGS_infer_model);
cfg->use_gpu = false; cfg->DisableGpu();
cfg->device = 0; cfg->SwitchSpecifyInputNames();
cfg->specify_input_name = true; cfg->SwitchIrOptim();
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -84,13 +84,12 @@ void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) { ...@@ -84,13 +84,12 @@ void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) {
cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0], cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
buffer_param.size()); buffer_param.size());
} else { } else {
cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->SetModel(FLAGS_infer_model + "/__model__",
cfg->param_file = FLAGS_infer_model + "/param"; FLAGS_infer_model + "/param");
} }
cfg->use_gpu = false; cfg->DisableGpu();
cfg->device = 0; cfg->SwitchSpecifyInputNames();
cfg->specify_input_name = true; cfg->SwitchIrOptim();
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -21,12 +21,10 @@ namespace inference { ...@@ -21,12 +21,10 @@ namespace inference {
namespace analysis { namespace analysis {
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->param_file = FLAGS_infer_model + "/params"; cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->prog_file = FLAGS_infer_model + "/model"; cfg->DisableGpu();
cfg->use_gpu = false; cfg->SwitchIrOptim();
cfg->device = 0; cfg->SwitchSpecifyInputNames();
cfg->enable_ir_optim = true;
cfg->specify_input_name = true;
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
} }
......
...@@ -204,12 +204,10 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, ...@@ -204,12 +204,10 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor,
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
cfg->param_file = FLAGS_infer_model + "/param"; cfg->DisableGpu();
cfg->use_gpu = false; cfg->SwitchSpecifyInputNames();
cfg->device = 0; cfg->SwitchIrOptim();
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
...@@ -225,10 +223,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -225,10 +223,10 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently. // Easy for profiling independently.
TEST(Analyzer_rnn1, profile) { TEST(Analyzer_rnn1, profile) {
contrib::AnalysisConfig cfg(false); contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
cfg.fraction_of_gpu_memory = 0.1; cfg.DisableGpu();
cfg.pass_builder()->TurnOnDebug(); cfg.SwitchIrDebug();
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
...@@ -285,7 +283,7 @@ TEST(Analyzer_rnn1, multi_thread) { ...@@ -285,7 +283,7 @@ TEST(Analyzer_rnn1, multi_thread) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg), TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, 4 /* multi_thread */); input_slots_all, &outputs, 2 /* multi_thread */);
} }
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
...@@ -293,16 +291,18 @@ TEST(Analyzer_rnn1, multi_thread) { ...@@ -293,16 +291,18 @@ TEST(Analyzer_rnn1, multi_thread) {
TEST(Analyzer_rnn1, ZeroCopy) { TEST(Analyzer_rnn1, ZeroCopy) {
AnalysisConfig config; AnalysisConfig config;
SetConfig(&config); SetConfig(&config);
config.use_feed_fetch_ops = false; config.SwitchUseFeedFetchOps(false);
PaddlePlace place; PaddlePlace place;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config); auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
config.use_feed_fetch_ops = true; config.SwitchUseFeedFetchOps(true);
auto native_predictor = CreatePaddlePredictor<NativeConfig>(config); auto native_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. config.SwitchUseFeedFetchOps(
true); // the analysis predictor needs feed/fetch.
auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config); auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
#define NEW_TENSOR(name__) \ #define NEW_TENSOR(name__) \
...@@ -362,7 +362,7 @@ TEST(Analyzer_rnn1, ZeroCopy) { ...@@ -362,7 +362,7 @@ TEST(Analyzer_rnn1, ZeroCopy) {
TEST(Analyzer_rnn1, ZeroCopyMultiThread) { TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
AnalysisConfig config; AnalysisConfig config;
SetConfig(&config); SetConfig(&config);
config.use_feed_fetch_ops = false; config.SwitchUseFeedFetchOps(false);
#define NEW_TENSOR(name__) \ #define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__); auto name__##_tensor = predictor->GetInputTensor(#name__);
......
...@@ -105,12 +105,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -105,12 +105,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param");
cfg->param_file = FLAGS_infer_model + "/param"; cfg->DisableGpu();
cfg->use_gpu = false; cfg->SwitchSpecifyInputNames();
cfg->device = 0; cfg->SwitchIrOptim();
cfg->specify_input_name = true;
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -89,11 +89,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -89,11 +89,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model; cfg->SetModel(FLAGS_infer_model);
cfg->use_gpu = false; cfg->DisableGpu();
cfg->device = 0; cfg->SwitchSpecifyInputNames();
cfg->specify_input_name = true; cfg->SwitchIrOptim();
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -122,12 +122,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) { ...@@ -122,12 +122,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->param_file = FLAGS_infer_model + "/params"; cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->prog_file = FLAGS_infer_model + "/model"; cfg->DisableGpu();
cfg->use_gpu = false; cfg->SwitchSpecifyInputNames();
cfg->device = 0;
cfg->enable_ir_optim = true;
cfg->specify_input_name = true;
cfg->pass_builder()->TurnOnDebug(); cfg->pass_builder()->TurnOnDebug();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
} }
...@@ -180,8 +177,12 @@ TEST(Analyzer_seq_pool1, fuse_statis) { ...@@ -180,8 +177,12 @@ TEST(Analyzer_seq_pool1, fuse_statis) {
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis( auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops); static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
LOG(INFO) << "num_ops: " << num_ops; LOG(INFO) << "num_ops: " << num_ops;
EXPECT_EQ(num_ops, 349); EXPECT_EQ(num_ops, 195);
} }
} // namespace analysis } // namespace analysis
......
...@@ -47,11 +47,10 @@ struct DataReader { ...@@ -47,11 +47,10 @@ struct DataReader {
}; };
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->model_dir = FLAGS_infer_model; cfg->SetModel(FLAGS_infer_model);
cfg->use_gpu = false; cfg->DisableGpu();
cfg->device = 0; cfg->SwitchSpecifyInputNames();
cfg->specify_input_name = true; cfg->SwitchIrOptim();
cfg->enable_ir_optim = true;
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -51,12 +51,11 @@ Record ProcessALine(const std::string &line) { ...@@ -51,12 +51,11 @@ Record ProcessALine(const std::string &line) {
} }
void SetConfig(AnalysisConfig *cfg) { void SetConfig(AnalysisConfig *cfg) {
cfg->param_file = FLAGS_infer_model + "/__params__"; cfg->SetModel(FLAGS_infer_model + "/__model__",
cfg->prog_file = FLAGS_infer_model + "/__model__"; FLAGS_infer_model + "/__params__");
cfg->use_gpu = false; cfg->DisableGpu();
cfg->device = 0; cfg->SwitchIrDebug();
cfg->enable_ir_optim = true; cfg->SwitchSpecifyInputNames();
cfg->specify_input_name = true;
// TODO(TJ): fix fusion gru // TODO(TJ): fix fusion gru
cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
} }
......
...@@ -62,21 +62,25 @@ std::ostream &operator<<(std::ostream &os, ...@@ -62,21 +62,25 @@ std::ostream &operator<<(std::ostream &os,
const contrib::AnalysisConfig &config) { const contrib::AnalysisConfig &config) {
os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n"; os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
num_spaces++; num_spaces++;
os << *reinterpret_cast<const NativeConfig *>(&config); os << config.ToNativeConfig();
if (!config.model_from_memory()) { if (!config.model_from_memory()) {
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; os << GenSpaces(num_spaces) << "param_file: " << config.params_file()
<< "\n";
} else { } else {
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
<< "prog_file and param_file: load from memory \n"; << "prog_file and param_file: load from memory \n";
} }
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
<< "\n"; << "\n";
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
<< "\n";
os << GenSpaces(num_spaces)
<< "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
<< "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n"; << "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n";
os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt() os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled()
<< "\n"; << "\n";
os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n";
num_spaces--; num_spaces--;
os << GenSpaces(num_spaces) << "}\n"; os << GenSpaces(num_spaces) << "}\n";
return os; return os;
......
...@@ -54,11 +54,13 @@ namespace paddle { ...@@ -54,11 +54,13 @@ namespace paddle {
namespace inference { namespace inference {
void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
const auto *analysis_config =
reinterpret_cast<const contrib::AnalysisConfig *>(config);
if (use_analysis) { if (use_analysis) {
LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config); LOG(INFO) << *analysis_config;
return; return;
} }
LOG(INFO) << *reinterpret_cast<const NativeConfig *>(config); LOG(INFO) << analysis_config->ToNativeConfig();
} }
void CompareResult(const std::vector<PaddleTensor> &outputs, void CompareResult(const std::vector<PaddleTensor> &outputs,
...@@ -96,12 +98,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs, ...@@ -96,12 +98,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
std::unique_ptr<PaddlePredictor> CreateTestPredictor( std::unique_ptr<PaddlePredictor> CreateTestPredictor(
const PaddlePredictor::Config *config, bool use_analysis = true) { const PaddlePredictor::Config *config, bool use_analysis = true) {
const auto *analysis_config =
reinterpret_cast<const contrib::AnalysisConfig *>(config);
if (use_analysis) { if (use_analysis) {
return CreatePaddlePredictor<contrib::AnalysisConfig>( return CreatePaddlePredictor<contrib::AnalysisConfig>(*analysis_config);
*(reinterpret_cast<const contrib::AnalysisConfig *>(config)));
} }
return CreatePaddlePredictor<NativeConfig>( auto native_config = analysis_config->ToNativeConfig();
*(reinterpret_cast<const NativeConfig *>(config))); return CreatePaddlePredictor<NativeConfig>(native_config);
} }
size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
......
...@@ -46,22 +46,20 @@ void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config, ...@@ -46,22 +46,20 @@ void SetConfig<contrib::AnalysisConfig>(contrib::AnalysisConfig* config,
std::string model_dir, bool use_gpu, std::string model_dir, bool use_gpu,
bool use_tensorrt, int batch_size) { bool use_tensorrt, int batch_size) {
if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
config->prog_file = model_dir + "/" + FLAGS_prog_filename; config->SetModel(model_dir + "/" + FLAGS_prog_filename,
config->param_file = model_dir + "/" + FLAGS_param_filename; model_dir + "/" + FLAGS_param_filename);
} else { } else {
config->model_dir = model_dir; config->SetModel(model_dir);
} }
if (use_gpu) { if (use_gpu) {
config->use_gpu = true; config->EnableUseGpu(100, 0);
config->device = 0;
config->fraction_of_gpu_memory = 0.15;
if (use_tensorrt) { if (use_tensorrt) {
config->EnableTensorRtEngine(1 << 10, batch_size); config->EnableTensorRtEngine(1 << 10, batch_size);
config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->pass_builder()->DeletePass("conv_bn_fuse_pass");
config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->DeletePass("fc_fuse_pass");
config->pass_builder()->TurnOnDebug(); config->pass_builder()->TurnOnDebug();
} else { } else {
config->enable_ir_optim = true; config->SwitchIrOptim();
} }
} }
} }
...@@ -77,7 +75,8 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { ...@@ -77,7 +75,8 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
if (use_analysis || use_tensorrt) { if (use_analysis || use_tensorrt) {
contrib::AnalysisConfig config(true); contrib::AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.pass_builder()->TurnOnDebug(); config.pass_builder()->TurnOnDebug();
SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt, SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt,
FLAGS_batch_size); FLAGS_batch_size);
...@@ -100,23 +99,12 @@ void compare(std::string model_dir, bool use_tensorrt) { ...@@ -100,23 +99,12 @@ void compare(std::string model_dir, bool use_tensorrt) {
SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
} }
std::vector<PaddleTensor> native_outputs; contrib::AnalysisConfig analysis_config;
NativeConfig native_config;
SetConfig<NativeConfig>(&native_config, model_dir, true, false,
FLAGS_batch_size);
TestOneThreadPrediction(
reinterpret_cast<PaddlePredictor::Config*>(&native_config), inputs_all,
&native_outputs, false);
std::vector<PaddleTensor> analysis_outputs;
contrib::AnalysisConfig analysis_config(true);
SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true, SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
use_tensorrt, FLAGS_batch_size); use_tensorrt, FLAGS_batch_size);
TestOneThreadPrediction( CompareNativeAndAnalysis(
reinterpret_cast<PaddlePredictor::Config*>(&analysis_config), inputs_all, reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config),
&analysis_outputs, true); inputs_all);
CompareResult(native_outputs, analysis_outputs);
} }
TEST(TensorRT_mobilenet, compare) { TEST(TensorRT_mobilenet, compare) {
...@@ -154,9 +142,9 @@ TEST(TensorRT_mobilenet, analysis) { ...@@ -154,9 +142,9 @@ TEST(TensorRT_mobilenet, analysis) {
TEST(AnalysisPredictor, use_gpu) { TEST(AnalysisPredictor, use_gpu) {
std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
AnalysisConfig config(true); AnalysisConfig config;
config.model_dir = model_dir; config.EnableUseGpu(100, 0);
config.fraction_of_gpu_memory = 0.15; config.SetModel(model_dir);
config.pass_builder()->TurnOnDebug(); config.pass_builder()->TurnOnDebug();
std::vector<std::vector<PaddleTensor>> inputs_all; std::vector<std::vector<PaddleTensor>> inputs_all;
......
...@@ -297,6 +297,21 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -297,6 +297,21 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>( cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()), groups); layout, framework::vectorize2int(filter->dims()), groups);
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Enable Tensor Core for cudnn backward
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
VLOG(5) << "use cudnn_tensor_op_math for backward";
} else {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
VLOG(5) << "NOT use cudnn_tensor_op_math for backward";
}
#endif
int input_channels = input->dims()[1]; int input_channels = input->dims()[1];
int input_height, input_width, input_depth; int input_height, input_width, input_depth;
if (input->dims().size() == 5) { if (input->dims().size() == 5) {
......
...@@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
#endif #endif
auto input_data_type = ctx.Input<Tensor>("Input")->type(); auto input_data_type = ctx.Input<Tensor>("Input")->type();
auto filter_data_type = ctx.Input<Tensor>("Filter")->type(); if (input_data_type != framework::proto::VarType::INT8 &&
PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, input_data_type != framework::proto::VarType::UINT8) {
"input and filter data type should be consistent"); auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
"input and filter data type should be consistent");
}
if (input_data_type == framework::proto::VarType::FP16) { if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN, PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used"); "float16 can only be used when CUDNN is used");
...@@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() { ...@@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() {
"whenever convolution output is as an input to residual " "whenever convolution output is as an input to residual "
"connection.") "connection.")
.SetDefault(false); .SetDefault(false);
AddAttr<float>("Scale_in",
"Scale_in to be used for int8 input data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Scale_out",
"Scale_out to be used for int8 output data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Scale_in_eltwise",
"Scale_in_eltwise to be used for int8 eltwise input data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<std::vector<float>>("Scale_weights",
"Scale_weights to be used for int8 weights data."
"Only used with MKL-DNN INT8.")
.SetDefault({1.0f});
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"data_format", "data_format",
"(string, default NCHW) Only used in " "(string, default NCHW) Only used in "
...@@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() { ...@@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() {
"Defaults to \"NHWC\". Specify the data format of the output data, " "Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ") "the input will be transformed automatically. ")
.SetDefault("AnyLayout"); .SetDefault("AnyLayout");
AddAttr<bool>("force_fp32_output",
"(bool, default false) Only used in mkldnn INT8 kernel")
.SetDefault(false);
// TODO(dzhwinter): need to registered layout transform function // TODO(dzhwinter): need to registered layout transform function
AddAttr<int>("workspace_size_MB", AddAttr<int>("workspace_size_MB",
"Only used in cudnn kernel. workspace size for cudnn, in MB, " "Only used in cudnn kernel. workspace size for cudnn, in MB, "
......
...@@ -29,6 +29,7 @@ namespace operators { ...@@ -29,6 +29,7 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
constexpr int kConvMKLDNNFP32 = 1; constexpr int kConvMKLDNNFP32 = 1;
constexpr int kConvMKLDNNINT8 = 2; constexpr int kConvMKLDNNINT8 = 2;
constexpr int MaxKeyLength = 256;
// Base convolution operator definations for other conv // Base convolution operator definations for other conv
// like operators to reuse the implementation. // like operators to reuse the implementation.
......
...@@ -32,7 +32,7 @@ namespace paddle { ...@@ -32,7 +32,7 @@ namespace paddle {
namespace operators { namespace operators {
namespace distributed { namespace distributed {
using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows; using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim; using DDim = framework::DDim;
...@@ -117,6 +117,12 @@ static void MergeMultipleVarsIntoOneBySection( ...@@ -117,6 +117,12 @@ static void MergeMultipleVarsIntoOneBySection(
auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>(); auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
auto* out_tensor = auto* out_tensor =
scope->FindVar(out_name)->GetMutable<framework::LoDTensor>(); scope->FindVar(out_name)->GetMutable<framework::LoDTensor>();
PADDLE_ENFORCE_GT(
out_tensor->numel(), 0,
"When calling this method, the LoDTensor's numel must larger than zero. "
"Please check LoDTensor::Resize has been called first.");
auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place()); auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place());
bool is_on_cpu_place = true; bool is_on_cpu_place = true;
...@@ -138,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection( ...@@ -138,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection(
auto row_numel = dims[1]; auto row_numel = dims[1];
for (size_t i = 0; i < dims[0]; ++i) { for (int64_t i = 0; i < dims[0]; ++i) {
auto id = ids_in_this_section[i]; auto id = ids_in_this_section[i];
auto origin_id = id + abs_sections[section_idx]; auto origin_id = id + abs_sections[section_idx];
auto& offsets = id_to_offset[origin_id]; auto& offsets = id_to_offset[origin_id];
...@@ -172,8 +178,9 @@ void prefetch(const std::string& id_name, const std::string& out_name, ...@@ -172,8 +178,9 @@ void prefetch(const std::string& id_name, const std::string& out_name,
const std::vector<std::string>& table_names, const std::vector<std::string>& table_names,
const std::vector<std::string>& epmap, const std::vector<std::string>& epmap,
const std::vector<int>& height_sections, const std::vector<int>& height_sections,
const framework::ExecutionContext& context) { const framework::ExecutionContext& context,
auto& local_scope = context.scope().NewScope(); const framework::Scope& scope) {
auto& local_scope = scope.NewScope();
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& cpu_ctx = *pool.Get(platform::CPUPlace()); auto& cpu_ctx = *pool.Get(platform::CPUPlace());
...@@ -190,11 +197,11 @@ void prefetch(const std::string& id_name, const std::string& out_name, ...@@ -190,11 +197,11 @@ void prefetch(const std::string& id_name, const std::string& out_name,
out_var_names.push_back(out_name + "@" + epmap[i]); out_var_names.push_back(out_name + "@" + epmap[i]);
} }
auto& id_tensor = local_scope.FindVar(id_name)->Get<framework::LoDTensor>(); auto& id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();
std::vector<int64_t> ids_vector; std::vector<int64_t> ids_vector;
if (platform::is_cpu_place(id_tensor.place())) { if (platform::is_cpu_place(id_tensor.place())) {
auto* id_data = id_tensor.data<int64_t>(); auto* id_data = id_tensor.data<int64_t>();
for (size_t i = 0; i < id_tensor.numel(); ++i) { for (int64_t i = 0; i < id_tensor.numel(); ++i) {
ids_vector.push_back(id_data[i]); ids_vector.push_back(id_data[i]);
} }
} else { } else {
...@@ -202,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, ...@@ -202,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
PADDLE_THROW("paddle is not compiled with CUDA!"); PADDLE_THROW("paddle is not compiled with CUDA!");
#else #else
auto cpu_place = platform::CPUPlace(); auto cpu_place = platform::CPUPlace();
framework::Tensor cpu_tensor; framework::LoDTensor cpu_tensor;
auto* cpu_tensor_data = auto* cpu_tensor_data =
cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place); cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place);
auto stream = auto stream =
...@@ -246,8 +253,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, ...@@ -246,8 +253,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name,
out_var_names, height_sections, splited_ids, out_var_names, height_sections, splited_ids,
context, &local_scope, &actual_ctx); context, &local_scope, &actual_ctx);
scope.DeleteScope(&local_scope);
context.scope().DeleteScope(&local_scope);
} }
}; // namespace distributed }; // namespace distributed
......
...@@ -27,7 +27,56 @@ void prefetch(const std::string& id_name, const std::string& out_name, ...@@ -27,7 +27,56 @@ void prefetch(const std::string& id_name, const std::string& out_name,
const std::vector<std::string>& table_names, const std::vector<std::string>& table_names,
const std::vector<std::string>& epmap, const std::vector<std::string>& epmap,
const std::vector<int>& height_sections, const std::vector<int>& height_sections,
const framework::ExecutionContext& context); const framework::ExecutionContext& context,
const framework::Scope& scope);
template <typename T>
void prefetch_with_reconstruct(const std::string& id_name,
const std::string& out_name,
const std::vector<std::string>& table_names,
const std::vector<std::string>& epmap,
const std::vector<int>& height_sections,
const framework::ExecutionContext& context,
const framework::Scope& scope,
framework::LoDTensor* original) {
prefetch(id_name, out_name, table_names, epmap, height_sections, context,
scope);
auto& out = scope.FindVar(out_name)->Get<framework::LoDTensor>();
auto& ids = scope.FindVar(id_name)->Get<framework::LoDTensor>();
auto* original_value = original->data<T>();
auto* out_value = out.data<T>();
size_t original_width = original->numel() / original->dims()[0];
bool is_on_cpu_place = true;
if (!platform::is_cpu_place(ids.place())) {
is_on_cpu_place = false;
}
if (is_on_cpu_place) {
for (int64_t i = 0; i < ids.numel(); i++) {
const T* out_rows = out_value + original_width * i;
T* original_row =
original_value + original_width * ids.data<int64_t>()[i];
std::memcpy(original_row, out_rows, original_width * sizeof(T));
}
} else {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("paddle is not compiled with CUDA!");
#else
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& actual_ctx = *pool.Get(context.GetPlace());
for (int64_t i = 0; i < ids.numel(); i++) {
const T* out_rows = out_value + original_width * i;
T* original_row =
original_value + original_width * ids.data<int64_t>()[i];
auto stream =
static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();
memory::Copy(boost::get<platform::CUDAPlace>(ids.place()), original_row,
platform::CPUPlace(), out_rows, original_width * sizeof(T),
stream);
}
#endif
}
}
}; // namespace distributed }; // namespace distributed
}; // namespace operators }; // namespace operators
......
...@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_sub, elementwise_sub,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_sub_grad, elementwise_sub_grad,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim;
template <typename T>
struct EmbeddingVSumFunctor {
void operator()(const framework::ExecutionContext &context,
const LoDTensor *table_t, const LoDTensor *ids_t,
LoDTensor *output_t) {
auto *table = table_t->data<T>();
int64_t row_number = table_t->dims()[0];
int64_t row_width = table_t->dims()[1];
int64_t last_dim = output_t->dims()[1];
const int64_t *ids = ids_t->data<int64_t>();
auto ids_lod = ids_t->lod()[0];
int64_t ids_count = ids_t->numel() / ids_lod.back();
auto *output = output_t->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int64_t i = 0; i != ids_lod.size() - 1; ++i) {
size_t begin = ids_lod[i] * ids_count;
for (int64_t j = 0; j != ids_count; ++j) {
PADDLE_ENFORCE_LT(ids[begin], row_number);
PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i);
blas.VCOPY(row_width, table + ids[begin + j] * row_width,
output + i * last_dim + j * row_width);
}
for (int64_t r = (ids_lod[i] + 1) * ids_count;
r < ids_lod[i + 1] * ids_count; ++r) {
PADDLE_ENFORCE_LT(ids[r], row_number);
PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i);
blas.AXPY(row_width, 1., table + ids[r] * row_width,
output + i * last_dim + (r % ids_count) * row_width);
}
}
}
};
template <typename T>
class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
const LoDTensor *ids_t = context.Input<LoDTensor>("Ids"); // int tensor
LoDTensor *output_t = context.Output<LoDTensor>("Out"); // float tensor
const LoDTensor *table_var = context.Input<LoDTensor>("W");
const std::string &combiner_type = context.Attr<std::string>("combiner");
if (combiner_type == "sum") {
EmbeddingVSumFunctor<T> functor;
functor(context, table_var, ids_t, output_t);
}
}
};
template <typename T>
class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *table_var = context.InputVar("W");
DDim table_dim;
if (table_var->IsType<LoDTensor>()) {
table_dim = context.Input<LoDTensor>("W")->dims();
} else if (table_var->IsType<SelectedRows>()) {
auto *table_t = context.Input<SelectedRows>("W");
table_dim = table_t->value().dims();
} else {
PADDLE_THROW(
"The parameter W of a LookupTable "
"must be either LoDTensor or SelectedRows");
}
bool is_sparse = context.Attr<bool>("is_sparse");
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
if (is_sparse) {
auto *ids = context.Input<LoDTensor>("Ids");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel();
auto lod = ids->lod()[0];
int64_t row_width = d_output->dims()[1];
framework::Vector<int64_t> *new_rows = d_table->mutable_rows();
new_rows->resize(ids_num);
std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t));
auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_num, table_dim[1]});
T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
const T *d_output_data = d_output->data<T>();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
int64_t in_offset = lod[i] * row_width;
const T *out_pos = d_output_data + i * row_width;
T *in_pos = d_table_data + in_offset;
for (int r = 0; r != h; ++r) {
blas.VCOPY(row_width, out_pos, in_pos + r * row_width);
}
}
} else {
LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
}
}
};
} // namespace operators
} // namespace paddle
此差异已折叠。
...@@ -190,6 +190,26 @@ void BenchGRUKernel() { ...@@ -190,6 +190,26 @@ void BenchGRUKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchSeqPoolKernel() {
std::vector<jit::SeqPoolType> pool_types = {
jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
for (auto type : pool_types) {
for (int w : TestSizes()) {
jit::seq_pool_attr_t attr(w, type);
for (int h : TestSizes()) {
attr.h = h;
std::vector<T> x(h * w), y(w);
RandomVec<T>(h * w, x.data(), -2.f, 2.f);
const T* x_data = x.data();
T* y_data = y.data();
BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
y_data, &attr);
}
}
}
}
// Benchmark all jit kernels including jitcode, mkl and refer. // Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...] // To use this tool, run command: ./benchmark [options...]
// Options: // Options:
...@@ -228,4 +248,7 @@ int main(int argc, char* argv[]) { ...@@ -228,4 +248,7 @@ int main(int argc, char* argv[]) {
BenchGRUKernel<jit::kGRUH1, T, PlaceType>(); BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>(); BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>(); BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
// seq pool function
BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>();
} }
...@@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1) ...@@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1)
USE_JITKERNEL_GEN(kGRUHtPart1) USE_JITKERNEL_GEN(kGRUHtPart1)
USE_JITKERNEL_GEN(kGRUHtPart2) USE_JITKERNEL_GEN(kGRUHtPart2)
USE_JITKERNEL_GEN(kNCHW16CMulNC) USE_JITKERNEL_GEN(kNCHW16CMulNC)
USE_JITKERNEL_GEN(kSeqPool)
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册