提交 eac5a0aa 编写于 作者: S sneaxiy

Merge develop

test=develop
...@@ -19,3 +19,10 @@ find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALL ...@@ -19,3 +19,10 @@ find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALL
mark_as_advanced( mark_as_advanced(
JEMALLOC_LIBRARIES JEMALLOC_LIBRARIES
JEMALLOC_INCLUDE_DIR) JEMALLOC_INCLUDE_DIR)
if (JEMALLOC_FOUND)
add_library(jemalloc::jemalloc UNKNOWN IMPORTED)
set_target_properties(jemalloc::jemalloc PROPERTIES
IMPORTED_LOCATION ${JEMALLOC_LIBRARIES}
INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}")
endif()
...@@ -5,6 +5,8 @@ endif() ...@@ -5,6 +5,8 @@ endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70") set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61") set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
###################################################################################### ######################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled) # A function for automatic detection of GPUs installed (if autodetection is enabled)
...@@ -59,7 +61,7 @@ endfunction() ...@@ -59,7 +61,7 @@ endfunction()
# select_nvcc_arch_flags(out_variable) # select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable) function(select_nvcc_arch_flags out_variable)
# List of arch names # List of arch names
set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual") set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
set(archs_name_default "All") set(archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING) if(NOT CMAKE_CROSSCOMPILING)
list(APPEND archs_names "Auto") list(APPEND archs_names "Auto")
...@@ -93,6 +95,8 @@ function(select_nvcc_arch_flags out_variable) ...@@ -93,6 +95,8 @@ function(select_nvcc_arch_flags out_variable)
set(cuda_arch_bin "60 61") set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
set(cuda_arch_bin "70") set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "All") elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs}) set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
...@@ -153,6 +157,16 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x ...@@ -153,6 +157,16 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
# warning for now. # warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") add_definitions("-DPADDLE_CUDA_BINVER=\"80\"")
elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
add_definitions("-DPADDLE_CUDA_BINVER=\"90\"")
elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
add_definitions("-DPADDLE_CUDA_BINVER=\"100\"")
endif() endif()
include_directories(${CUDA_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS})
......
...@@ -23,11 +23,8 @@ set(BOOST_PROJECT "extern_boost") ...@@ -23,11 +23,8 @@ set(BOOST_PROJECT "extern_boost")
# checked that the devtools package of CentOS 6 installs boost 1.41.0. # checked that the devtools package of CentOS 6 installs boost 1.41.0.
# So we use 1.41.0 here. # So we use 1.41.0 here.
set(BOOST_VER "1.41.0") set(BOOST_VER "1.41.0")
if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL)) set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
message(STATUS "use pre defined download url") set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
endif()
MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
......
...@@ -63,6 +63,15 @@ ADD_DEPENDENCIES(gflags extern_gflags) ...@@ -63,6 +63,15 @@ ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags) LIST(APPEND external_project_dependencies gflags)
# On Windows (including MinGW), the Shlwapi library is used by gflags if available.
if (WIN32)
include(CheckIncludeFileCXX)
check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI)
if (HAVE_SHLWAPI)
set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
endif(HAVE_SHLWAPI)
endif (WIN32)
IF(WITH_C_API) IF(WITH_C_API)
INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags) INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
IF(ANDROID) IF(ANDROID)
......
...@@ -55,7 +55,7 @@ ExternalProject_Add( ...@@ -55,7 +55,7 @@ ExternalProject_Add(
${MKLDNN_PROJECT} ${MKLDNN_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS} DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git"
GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a" GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a"
PREFIX ${MKLDNN_SOURCES_DIR} PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
......
...@@ -16,6 +16,12 @@ IF(NOT ${WITH_MKLML}) ...@@ -16,6 +16,12 @@ IF(NOT ${WITH_MKLML})
return() return()
ENDIF(NOT ${WITH_MKLML}) ENDIF(NOT ${WITH_MKLML})
IF(APPLE)
MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(MKLML_DST_DIR "mklml") SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
...@@ -23,32 +29,24 @@ SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) ...@@ -23,32 +29,24 @@ SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT ${MKLML_INSTALL_DIR}) SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
if(WIN32) SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
SET(TIME_VERSION "2019.0.1.20181227")
IF(WIN32)
SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
else() ELSE()
SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
endif() ENDIF()
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
MESSAGE(STATUS "use pre defined download url")
if(WIN32)
SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
elseif(APPLE)
SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
else()
SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
ENDIF()
endif()
SET(MKLML_PROJECT "extern_mklml") SET(MKLML_PROJECT "extern_mklml")
MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
......
...@@ -37,14 +37,18 @@ INCLUDE(GNUInstallDirs) ...@@ -37,14 +37,18 @@ INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph") SET(NGRAPH_PROJECT "extern_ngraph")
SET(NGRAPH_GIT_TAG "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9") SET(NGRAPH_GIT_TAG "20bd8bbc79ae3a81c57313846a2be7313e5d1dab")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
SET(NGRAPH_SHARED_LIB_NAME libngraph.so) SET(NGRAPH_SHARED_LIB_NAME libngraph.so)
SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so)
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) if(CMAKE_BUILD_TYPE STREQUAL "Debug")
SET(NGRAPH_TBB_LIB_NAME libtbb_debug.so.2)
else()
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
endif()
SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
...@@ -66,16 +70,7 @@ ExternalProject_Add( ...@@ -66,16 +70,7 @@ ExternalProject_Add(
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
) CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
# Workaround for nGraph expecting mklml to be in mkldnn install directory.
ExternalProject_Add_Step(
${NGRAPH_PROJECT}
PrepareMKL
COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so
COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so
DEPENDEES download
DEPENDERS configure
) )
add_dependencies(ngraph ${NGRAPH_PROJECT}) add_dependencies(ngraph ${NGRAPH_PROJECT})
......
...@@ -117,7 +117,7 @@ function(common_link TARGET_NAME) ...@@ -117,7 +117,7 @@ function(common_link TARGET_NAME)
endif() endif()
if (WITH_JEMALLOC) if (WITH_JEMALLOC)
target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES}) target_link_libraries(${TARGET_NAME} jemalloc::jemalloc)
endif() endif()
endfunction() endfunction()
...@@ -359,6 +359,8 @@ function(cc_binary TARGET_NAME) ...@@ -359,6 +359,8 @@ function(cc_binary TARGET_NAME)
add_dependencies(${TARGET_NAME} ${cc_binary_DEPS}) add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
common_link(${TARGET_NAME}) common_link(${TARGET_NAME})
endif() endif()
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
endfunction(cc_binary) endfunction(cc_binary)
function(cc_test TARGET_NAME) function(cc_test TARGET_NAME)
...@@ -367,18 +369,15 @@ function(cc_test TARGET_NAME) ...@@ -367,18 +369,15 @@ function(cc_test TARGET_NAME)
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS) set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
if(WIN32) if(WIN32)
list(APPEND win32_deps shlwapi)
if("${cc_test_DEPS};" MATCHES "python;") if("${cc_test_DEPS};" MATCHES "python;")
list(REMOVE_ITEM cc_test_DEPS python) list(REMOVE_ITEM cc_test_DEPS python)
list(APPEND win32_deps ${PYTHON_LIBRARIES}) target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
endif() endif()
endif(WIN32) endif(WIN32)
add_executable(${TARGET_NAME} ${cc_test_SRCS}) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog)
if(WIN32)
target_link_libraries(${TARGET_NAME} ${win32_deps})
endif(WIN32)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
common_link(${TARGET_NAME}) common_link(${TARGET_NAME})
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
...@@ -451,7 +450,8 @@ function(nv_test TARGET_NAME) ...@@ -451,7 +450,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules})
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
common_link(${TARGET_NAME}) common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME})
...@@ -538,7 +538,8 @@ function(hip_test TARGET_NAME) ...@@ -538,7 +538,8 @@ function(hip_test TARGET_NAME)
endif() endif()
add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources}) add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags ${os_dependency_modules})
add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
common_link(${TARGET_NAME}) common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME})
......
...@@ -405,28 +405,50 @@ paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None ...@@ -405,28 +405,50 @@ paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True))
paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.optimizer.SGDOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.SGDOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None)) paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)) paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdamaxOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)) paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.DecayedAdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)) paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
paddle.fluid.optimizer.FtrlOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.FtrlOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)) paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.RMSPropOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdadeltaOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)) paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.optimizer.ModelAverage.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.ModelAverage.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None) paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)) paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None))
paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.LarsMomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)) paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
......
...@@ -94,4 +94,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS ...@@ -94,4 +94,4 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass fuse_elewise_add_act_pass multi_batch_merge_pass
memory_optimize_pass) memory_optimize_pass lock_free_optimize_pass)
...@@ -232,3 +232,4 @@ USE_PASS(analysis_var_pass); ...@@ -232,3 +232,4 @@ USE_PASS(analysis_var_pass);
USE_PASS(sequential_execution_pass); USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass); USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass); USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(lock_free_optimize_pass);
...@@ -25,6 +25,8 @@ namespace paddle { ...@@ -25,6 +25,8 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
static const std::string kEagerDeletionOpName{"eager_deletion"}; // NOLINT
EagerDeletionOpHandle::EagerDeletionOpHandle( EagerDeletionOpHandle::EagerDeletionOpHandle(
ir::Node *node, const Scope *scope, const platform::Place &place, ir::Node *node, const Scope *scope, const platform::Place &place,
const std::unordered_set<std::string> &var_names, GarbageCollector *gc, const std::unordered_set<std::string> &var_names, GarbageCollector *gc,
...@@ -59,20 +61,15 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { ...@@ -59,20 +61,15 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
#endif #endif
} }
std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } std::string EagerDeletionOpHandle::Name() const { return kEagerDeletionOpName; }
void EagerDeletionOpHandle::RunImpl() { void EagerDeletionOpHandle::RunImpl() {
#ifdef PADDLE_WITH_CUDA platform::RecordEvent event(kEagerDeletionOpName, nullptr);
platform::RecordEvent record_event(Name(), dev_ctx_);
#else
platform::RecordEvent record_event(Name(), nullptr);
#endif
Scope *exec_scope = nullptr; Scope *exec_scope = nullptr;
std::deque<std::shared_ptr<memory::Allocation>> garbages; std::deque<std::shared_ptr<memory::Allocation>> garbages;
for (auto &name : var_names_) { for (auto &name : var_names_) {
auto it = ref_cnts_->find(name); auto it = ref_cnts_->find(name);
// Var not found, not reference count has not decreased to 0 // Reference count has not decreased to 0
if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) { if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
continue; continue;
} }
...@@ -81,6 +78,7 @@ void EagerDeletionOpHandle::RunImpl() { ...@@ -81,6 +78,7 @@ void EagerDeletionOpHandle::RunImpl() {
exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(); exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
} }
// Var not found
auto *var = exec_scope->FindVar(name); auto *var = exec_scope->FindVar(name);
if (var == nullptr) { if (var == nullptr) {
continue; continue;
......
...@@ -226,7 +226,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl( ...@@ -226,7 +226,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
* Only variables should be the leaves of graph. * Only variables should be the leaves of graph.
*/ */
AddOutputToLeafOps(&result); AddOutputToLeafOps(&result);
result.Erase<GraphOps>(kGraphOps); result.Erase(kGraphOps);
return graph; return graph;
} }
......
...@@ -31,6 +31,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) ...@@ -31,6 +31,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
pass_library(graph_to_program_pass base) pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base) pass_library(graph_viz_pass base)
pass_library(lock_free_optimize_pass base)
pass_library(fc_fuse_pass inference) pass_library(fc_fuse_pass inference)
pass_library(attention_lstm_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference) pass_library(infer_clean_graph_pass inference)
...@@ -41,11 +42,23 @@ pass_library(seq_concat_fc_fuse_pass inference) ...@@ -41,11 +42,23 @@ pass_library(seq_concat_fc_fuse_pass inference)
pass_library(multi_batch_merge_pass base) pass_library(multi_batch_merge_pass base)
pass_library(conv_bn_fuse_pass inference) pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library(seqpool_concat_fuse_pass inference)
pass_library(is_test_pass base) pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference)
pass_library(conv_elementwise_add_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference)
pass_library(conv_affine_channel_fuse_pass inference) pass_library(conv_affine_channel_fuse_pass inference)
pass_library(transpose_flatten_concat_fuse_pass inference)
# There may be many transpose-flatten structures in a model, and the output of
# these structures will be used as inputs to the concat Op. This pattern will
# be detected by our pass. The index here represents the number of structures in the
# pattern. We use index 3 ~ 6, because these quantities of structures are
# common in the models.
foreach (index RANGE 3 6)
file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
endforeach()
if(WITH_MKLDNN) if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base) pass_library(mkldnn_placement_pass base)
pass_library(depthwise_conv_mkldnn_pass base) pass_library(depthwise_conv_mkldnn_pass base)
...@@ -67,6 +80,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r ...@@ -67,6 +80,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN) if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
......
...@@ -109,7 +109,6 @@ class Graph { ...@@ -109,7 +109,6 @@ class Graph {
attr_dels_[attr_name] = []() {}; attr_dels_[attr_name] = []() {};
} }
template <typename AttrType>
void Erase(const std::string &attr_name) { void Erase(const std::string &attr_name) {
PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph", PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph",
attr_name); attr_name);
......
...@@ -1306,6 +1306,69 @@ PDNode *patterns::ConvAffineChannel::operator()( ...@@ -1306,6 +1306,69 @@ PDNode *patterns::ConvAffineChannel::operator()(
return ac_out_var; return ac_out_var;
} }
// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a
// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b
// ...
// z -> transpose_op(n) -> transpose_out_z -> flatten_op(n) -> flatten_out_z
// flatten_out_a -> concat_op flatten_out_b -> concat_op ... flatten_out_z ->
// concat_op
PDNode *patterns::TransposeFlattenConcat::operator()(
std::vector<PDNode *> conv_in, int times) {
// The times represents the repeat times of the
// {trans, trans_out, flatten, flatten_out}
const int kNumFields = 4;
const int kTransOutOffset = 1;
const int kFlattenOffset = 2;
const int kFlattenOutOffset = 3;
std::vector<PDNode *> nodes;
for (int i = 0; i < times; i++) {
nodes.push_back(
pattern->NewNode(GetNodeName("transpose" + std::to_string(i)))
->assert_is_op("transpose2"));
nodes.push_back(
pattern->NewNode(GetNodeName("transpose_out" + std::to_string(i)))
->assert_is_op_output("transpose2")
->assert_is_op_input("flatten2", "X")
->AsIntermediate());
nodes.push_back(pattern->NewNode(GetNodeName("flatten" + std::to_string(i)))
->assert_is_op("flatten2"));
nodes.push_back(
pattern->NewNode(GetNodeName("flatten_out" + std::to_string(i)))
->assert_is_op_output("flatten2")
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
}
auto concat_op = pattern->NewNode(GetNodeName("concat"))
->assert_is_op("concat")
->assert_op_has_n_inputs("concat", times);
auto concat_out = pattern->NewNode(GetNodeName("concat_out"))
->assert_is_op_output("concat")
->AsOutput();
std::vector<PDNode *> flatten_outs;
for (int i = 0; i < times; i++) {
conv_in[i]->AsInput();
// trans
nodes[i * kNumFields]->LinksFrom({conv_in[i]});
// trans_out
nodes[i * kNumFields + kTransOutOffset]->LinksFrom({nodes[i * kNumFields]});
// flatten
nodes[i * kNumFields + kFlattenOffset]->LinksFrom(
{nodes[i * kNumFields + kTransOutOffset]});
// flatten_out
nodes[i * kNumFields + kFlattenOutOffset]->LinksFrom(
{nodes[i * kNumFields + kFlattenOffset]});
flatten_outs.push_back(nodes[i * kNumFields + kFlattenOutOffset]);
}
concat_op->LinksFrom(flatten_outs).LinksTo({concat_out});
return concat_out;
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -766,6 +766,21 @@ struct ConvAffineChannel : public PatternBase { ...@@ -766,6 +766,21 @@ struct ConvAffineChannel : public PatternBase {
PATTERN_DECL_NODE(ac_out); // Out PATTERN_DECL_NODE(ac_out); // Out
}; };
struct TransposeFlattenConcat : public PatternBase {
TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "transpose_flatten_concat") {}
PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type);
}
PDNode* GetPDNode(const std::string& op_type) {
return pattern->RetrieveNode(GetNodeName(op_type));
}
};
} // namespace patterns } // namespace patterns
// Link two ir::Nodes from each other. // Link two ir::Nodes from each other.
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/lock_free_optimize_pass.h"
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
const char kSumGradOpName[] = "sum";
// TODO(minqiyang): only support sgd at current time, please add
// other optimizers later.
const char kOptimizerType[] = "sgd";
std::unique_ptr<ir::Graph> LockFreeOptimizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
// We could collect all weights' name from SGD, where
// W1 <- SGD(W0, Grad0)
std::unordered_set<std::string> weight_var_set;
for (auto* node : graph->Nodes()) {
if (IsOpNamed(node, kOptimizerType)) {
auto& param_out_vars = node->Op()->Output("ParamOut");
PADDLE_ENFORCE(param_out_vars.size() == 1u);
weight_var_set.insert(param_out_vars[0]);
}
}
// find all grad's merge op via weight name, where
// Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
std::unordered_set<ir::Node*> grad_sum_op_set;
for (ir::Node* node : graph->Nodes()) {
if (IsOpNamed(node, kSumGradOpName)) {
for (ir::Node* output : node->outputs) {
// strip the last grad suffix @GRAD
std::string var_name = output->Name();
const std::string suffix(kGradVarSuffix);
if (var_name != suffix && var_name.size() > suffix.size() &&
var_name.substr(var_name.size() - suffix.size()) == suffix) {
// if so then strip them off
var_name = var_name.substr(0, var_name.size() - suffix.size());
if (weight_var_set.find(var_name) != weight_var_set.end()) {
grad_sum_op_set.insert(node);
break;
}
}
}
}
}
// get the forward op and backward op pairs, where
// out <- forward(X, W)
// Grad1 <- backward(out, X')
// Grad0 <- SUM(Grad1, Grad2, Grad3 ...)
// W0 <- SGD(W1, Grad0)
for (ir::Node* node : grad_sum_op_set) {
for (ir::Node* merged_grad_var : node->outputs) {
// find the optimizers connected with sum op
if (IsVarNameEndsWith(merged_grad_var, kGradVarSuffix) &&
merged_grad_var->outputs.size() == 1u) {
ir::Node* opt_node = merged_grad_var->outputs[0];
VLOG(3) << "Found opt node " << opt_node->Name();
// find the backward op connected with sum op
for (ir::Node* unmerged_grad_var : node->inputs) {
if (IsVarNameContains(unmerged_grad_var, kGradVarSuffix) &&
unmerged_grad_var->inputs.size() == 1u) {
ir::Node* backward_op = unmerged_grad_var->inputs[0];
VLOG(3) << "Found backward_op " << backward_op->Name();
// find the forward op related to the backward op
ir::Node* forward_op =
FindForwardOpViaBackwardOp(graph.get(), backward_op);
VLOG(3) << "Found forward_op " << forward_op->Name();
PADDLE_ENFORCE(forward_op);
Node* new_optimizer_node = CreateNewSGDNode(
graph.get(), forward_op, backward_op, node, opt_node);
PADDLE_ENFORCE(new_optimizer_node);
}
}
}
}
}
// Remove the sum_op and its' outputs and connected Optimizers
for (Node* sum_op : grad_sum_op_set) {
for (Node* sum_op_output : sum_op->outputs) {
for (Node* optimize_op : sum_op_output->outputs) {
if (optimize_op->NodeType() == Node::Type::kOperation &&
optimize_op->Name() == kOptimizerType) {
VLOG(3) << "remove optimize_op: " << optimize_op->Name() << "_"
<< optimize_op->id();
graph->RemoveNode(optimize_op);
}
}
VLOG(3) << "remove sum_op_output: " << sum_op_output->Name() << "_"
<< sum_op_output->id();
graph->RemoveNode(sum_op_output);
}
VLOG(3) << "remove sum_op: " << sum_op->Name() << "_" << sum_op->id();
graph->RemoveNode(sum_op);
}
for (auto* node : graph->Nodes()) {
for (Node* output_node : node->outputs) {
if (output_node->Name() == "sgd") {
VLOG(3) << "Node link to SGD: " << node->Name() << "_" << node->id()
<< " --> " << output_node->Name() << "_" << output_node->id();
for (Node* input_node : node->inputs) {
VLOG(3) << "SGD Input link: " << input_node->Name() << "_"
<< input_node->id() << " --> " << node->Name() << "_"
<< node->id();
}
}
}
}
return graph;
}
ir::Node* LockFreeOptimizePass::CreateNewSGDNode(
ir::Graph* graph, ir::Node* forward_node, ir::Node* backward_node,
ir::Node* grad_sum_node, ir::Node* optimize_node) const {
PADDLE_ENFORCE(graph);
PADDLE_ENFORCE(forward_node);
PADDLE_ENFORCE(backward_node);
PADDLE_ENFORCE(grad_sum_node);
PADDLE_ENFORCE(optimize_node);
// find the grad var node between the grad sum node and backward_node
std::vector<ir::Node*> grad_vars =
FindConnectedNode(backward_node, grad_sum_node);
ir::Node* grad_node = nullptr;
for (ir::Node* node : grad_vars) {
if (!ir::IsControlDepVar(*node)) {
grad_node = node;
}
}
PADDLE_ENFORCE(grad_node);
// create a new SGD node
OpDesc* old_desc = optimize_node->Op();
// keep with the same block between new optimizer and the old one
OpDesc new_desc(*old_desc, old_desc->Block());
new_desc.SetInput("Param", old_desc->Input("Param"));
new_desc.SetInput("LearningRate", old_desc->Input("LearningRate"));
new_desc.SetInput("Grad", std::vector<std::string>({grad_node->Name()}));
new_desc.SetOutput("ParamOut", old_desc->Output("ParamOut"));
std::vector<std::string> op_role_vars = boost::get<std::vector<std::string>>(
new_desc.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName()));
// replace the second op role var, because the grad name was
// changed in new optimizer
op_role_vars.pop_back();
op_role_vars.push_back(grad_node->Name());
new_desc.SetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
op_role_vars);
new_desc.SetType(kOptimizerType);
// set backward op's op role var, this will be used to
// set device_id in multi_device_pass
backward_node->Op()->SetAttr(
framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), op_role_vars);
// backward_node->Op()->SetAttr(
// framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), {});
// keep with the same output nodes between new optimizer and the
// old one
Node* sgd_node = graph->CreateOpNode(&new_desc);
// change all outputs of the optimize_node to the new one
ReplaceAllDownstreamNode(optimize_node, sgd_node);
// find connected node between forward node and optimize node
// and replace the optimize node to new sgd node
std::vector<ir::Node*> forward_opt_connected_nodes =
FindConnectedNode(forward_node, optimize_node);
for (ir::Node* node : forward_opt_connected_nodes) {
ReplaceUpstreamNode(node, optimize_node, sgd_node);
}
// find connected node between backward node and optimize node
// and replace the optimize node to new sgd node
std::vector<ir::Node*> backward_opt_connected_nodes =
FindConnectedNode(backward_node, optimize_node);
for (ir::Node* node : backward_opt_connected_nodes) {
ReplaceUpstreamNode(node, optimize_node, sgd_node);
}
// SGD must have only one param and LR in
PADDLE_ENFORCE(old_desc->Input("LearningRate").size() == 1u);
PADDLE_ENFORCE(old_desc->Input("Param").size() == 1u);
// LR and weight nodes should be copied
for (Node* upstream_node : optimize_node->inputs) {
if (upstream_node->Name() == old_desc->Input("LearningRate")[0] ||
upstream_node->Name() == old_desc->Input("Param")[0]) {
ReplaceUpstreamNode(upstream_node, optimize_node, sgd_node);
}
}
VLOG(3) << "Create new opt node" << sgd_node->Name() << "_" << sgd_node->id();
return sgd_node;
}
std::vector<ir::Node*> LockFreeOptimizePass::FindConnectedNode(
ir::Node* upstream_node, ir::Node* downstream_node) const {
std::vector<ir::Node*> result;
for (ir::Node* out_node : upstream_node->outputs) {
for (ir::Node* in_node : downstream_node->inputs) {
if (in_node == out_node) {
result.push_back(in_node);
}
}
}
return result;
}
void LockFreeOptimizePass::ReplaceUpstreamNode(
ir::Node* upstream_node, ir::Node* old_optimizer_node,
ir::Node* new_optimizer_node) const {
PADDLE_ENFORCE(upstream_node);
PADDLE_ENFORCE(old_optimizer_node);
PADDLE_ENFORCE(new_optimizer_node);
// Remove the old_optimizer_node from upstream_node's outputs vector
auto& output_node_vec = upstream_node->outputs;
for (auto output_node_iter = output_node_vec.begin();
output_node_iter != output_node_vec.end();) {
if (*output_node_iter == old_optimizer_node) {
output_node_vec.erase(output_node_iter);
break;
} else {
++output_node_iter;
}
}
// Add the new_optimizer_node to upstream_node's outputs vector
output_node_vec.emplace_back(new_optimizer_node);
new_optimizer_node->inputs.emplace_back(upstream_node);
}
void LockFreeOptimizePass::ReplaceAllDownstreamNode(
ir::Node* old_optimizer_node, ir::Node* new_optimizer_node) const {
PADDLE_ENFORCE(old_optimizer_node);
PADDLE_ENFORCE(new_optimizer_node);
for (ir::Node* downstream_node : old_optimizer_node->outputs) {
// Remove the old_optimizer_node from downstream_node's inputs vector
auto& input_node_vec = downstream_node->inputs;
for (auto input_node_iter = input_node_vec.begin();
input_node_iter != input_node_vec.end();) {
if (*input_node_iter == old_optimizer_node) {
input_node_vec.erase(input_node_iter);
break;
} else {
++input_node_iter;
}
}
// Add the new_optimizer_node to downstream_node's inputs vector
input_node_vec.emplace_back(new_optimizer_node);
new_optimizer_node->outputs.emplace_back(downstream_node);
}
}
ir::Node* LockFreeOptimizePass::FindForwardOpViaBackwardOp(
ir::Graph* graph, ir::Node* backward_node) const {
PADDLE_ENFORCE(graph);
PADDLE_ENFORCE(backward_node);
// strip the suffix _grad of backward_node's name
std::string forward_op_name = backward_node->Name();
const std::string suffix("_grad");
if (forward_op_name != suffix && forward_op_name.size() > suffix.size() &&
forward_op_name.substr(forward_op_name.size() - suffix.size()) ==
suffix) {
// if so then strip them off
forward_op_name =
forward_op_name.substr(0, forward_op_name.size() - suffix.size());
} else {
LOG(WARNING) << "Illegal backward node's name " << backward_node->Name()
<< " id " << backward_node->id();
return nullptr;
}
for (ir::Node* node : graph->Nodes()) {
if (node->Name() == forward_op_name) {
if (node->outputs.size() == 0u) {
// if forward_node has no output, then it has NO grad op
continue;
}
// check whether all inputs of the backward_op that ends_with @GRAD
// comes from the output of forward_op is the input of the backward_op
bool is_related_forward_node = true;
for (ir::Node* backward_input : backward_node->inputs) {
if (IsVarNameEndsWith(backward_input, kGradVarSuffix)) {
bool meets_correct_output = false;
for (ir::Node* forward_output : node->outputs) {
if (forward_output->Name() + kGradVarSuffix ==
backward_input->Name()) {
meets_correct_output = true;
break;
}
}
if (!meets_correct_output) {
is_related_forward_node = false;
break;
}
}
}
if (is_related_forward_node) {
return node;
}
}
}
return nullptr;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(lock_free_optimize_pass,
paddle::framework::ir::LockFreeOptimizePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
#define PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
#include <string>
#include <vector>
#include <boost/algorithm/string/predicate.hpp>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
/*
* Remove the sum op of all gradients of the backward op.
* And remove the dependecies of the optimizer related to the
* same backward op.
*
* Before this pass:
*
* forward_op1 forward_op2
* | |
* grad_op1 grad_op2
* \ /
* \ /
* sum_op
* |
* sgd_op
*
* After this pass:
* forward_op1 forward_op2
* | |
* grad_op1 grad_op2
* | |
* sgd_op1 sgd_op2
*
* sgd_op1 and sgd_op2 will update the same weight which holds the same
* memory, so we could benefits from the acceleration
*/
class LockFreeOptimizePass : public Pass {
public:
virtual ~LockFreeOptimizePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
private:
// Create a new sgd node via current optimizer node
ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node,
ir::Node* backward_node, ir::Node* grad_sum_node,
ir::Node* optimize_node) const;
// Replace the input weight's optimizers
void ReplaceUpstreamNode(ir::Node* upstream_node,
ir::Node* old_optimizer_node,
ir::Node* new_optimizer_node) const;
// Replace the output weight's optimizers
void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node,
ir::Node* new_optimizer_node) const;
// Find all weight variables in graph
bool FindAllWeightVars(ir::Graph* graph) const;
// Find the forward_op node via the backward_op node
ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph,
ir::Node* backward_node) const;
std::vector<ir::Node*> FindConnectedNode(ir::Node* upstream_node,
ir::Node* downstream_node) const;
inline bool IsOpNamed(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kOperation && node->Name() == name;
}
inline bool IsVarNamed(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kVariable && node->Name() == name;
}
inline bool IsVarNameEndsWith(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kVariable &&
boost::algorithm::ends_with(node->Name(), name);
}
inline bool IsVarNameContains(ir::Node* node, const std::string& name) const {
PADDLE_ENFORCE(node);
return node->NodeType() == Node::Type::kVariable &&
node->Name().find(name) != std::string::npos;
}
inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const {
PADDLE_ENFORCE(ctrl_dep_node);
PADDLE_ENFORCE(node);
return IsControlDepVar(*ctrl_dep_node) &&
ctrl_dep_node->inputs.size() >= 1u &&
ctrl_dep_node->inputs[0] == node;
}
};
} // namespace ir
} // namespace framework
} // namespace paddle
#endif // PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#define MAX_CONCAT_INPUTS 200
namespace paddle {
namespace framework {
namespace ir {
PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern,
const std::string& name_scope,
int num_inputs) {
auto is_concat_op_with_inputs = [](Node* x, int num) -> bool {
return x && x->IsOp() && x->Op()->Type() == "concat" &&
x->Op()->Input("X").size() == static_cast<size_t>(num);
};
auto is_nth_input_var_of_concat = [=](Node* x, int idx) -> bool {
return x && x->IsVar() && VarLinksToOp(x, "concat") &&
x->outputs.size() == 1 && IsNthInput(x, x->outputs[0], "X", idx) &&
is_concat_op_with_inputs(x->outputs[0], num_inputs);
};
auto is_seqpool_op_with_pootype_of_nth_input_of_concat = [=](
Node* x, const std::string& type, int idx) -> bool {
bool this_is_seqpool_op =
x && x->IsOp() && x->Op()->Type() == "sequence_pool" &&
x->Op()->HasAttr("pooltype") &&
boost::get<std::string>(x->Op()->GetAttr("pooltype")) == type &&
x->outputs.size() == 2; // seqpool should only have 2 outputs
bool satisfied_all = this_is_seqpool_op;
if (this_is_seqpool_op) {
// Only one output of seqpool_op is nth_input_var of concat,
// the other one should be unused empty var.
if (is_nth_input_var_of_concat(x->outputs[0], idx)) {
satisfied_all = satisfied_all && x->outputs[1]->IsVar() &&
x->outputs[1]->outputs.empty();
} else {
satisfied_all =
satisfied_all && is_nth_input_var_of_concat(x->outputs[1], idx) &&
x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0;
}
}
return satisfied_all;
};
auto* concat_op = pattern->NewNode(
[=](Node* x) { return is_concat_op_with_inputs(x, num_inputs); },
name_scope + "/concat_op");
concat_op->assert_op_attr<int>("axis", 1);
auto* concat_out_var = pattern->NewNode(
[=](Node* x) {
return x && x->IsVar() && VarLinksFromOp(x, "concat") &&
x->inputs.size() == 1 &&
is_concat_op_with_inputs(x->inputs[0], num_inputs);
},
name_scope + "/concat_out_var");
concat_out_var->assert_is_only_output_of_op("concat");
std::vector<PDNode*> seqpool_ops_input_var(num_inputs);
std::vector<PDNode*> seqpool_ops_output_var(num_inputs);
std::vector<PDNode*> seqpool_ops_output_unused_var(num_inputs);
std::vector<PDNode*> seqpool_ops(num_inputs);
for (int i = 0; i < num_inputs; ++i) {
seqpool_ops_output_var[i] = pattern->NewNode(
[=](Node* x) {
return x && x->IsVar() && is_nth_input_var_of_concat(x, i) &&
x->inputs.size() == 1 &&
is_seqpool_op_with_pootype_of_nth_input_of_concat(x->inputs[0],
"SUM", i);
},
name_scope + "/sequence_pool_out_" + std::to_string(i));
seqpool_ops_output_unused_var[i] = pattern->NewNode(
[=](Node* x) {
return x && x->IsVar() && x->inputs.size() == 1 &&
x->outputs.size() == 0 &&
is_seqpool_op_with_pootype_of_nth_input_of_concat(x->inputs[0],
"SUM", i);
},
name_scope + "/sequence_pool_unused_out_" + std::to_string(i));
seqpool_ops[i] = pattern->NewNode(
[=](Node* x) {
return x && x->IsOp() &&
is_seqpool_op_with_pootype_of_nth_input_of_concat(x, "SUM", i);
},
name_scope + "/sequence_pool_op_" + std::to_string(i));
seqpool_ops_input_var[i] = pattern->NewNode(
[=](Node* x) {
bool basic = x && x->IsVar() && x->outputs.size() >= 1;
bool next_is_fine = false;
for (auto* o : x->outputs) {
if (is_seqpool_op_with_pootype_of_nth_input_of_concat(o, "SUM",
i)) {
next_is_fine = true;
break;
}
}
return basic && next_is_fine;
},
name_scope + "/sequence_pool_in_" + std::to_string(i));
// Links
seqpool_ops[i]
->LinksFrom({seqpool_ops_input_var[i]})
.LinksTo({seqpool_ops_output_var[i], seqpool_ops_output_unused_var[i]});
}
concat_op->LinksFrom(seqpool_ops_output_var).LinksTo({concat_out_var});
return concat_out_var;
}
int BuildFusion(Graph* graph, const std::string& name_scope, int num_inputs) {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
BuildSeqPoolConcatPattern(pattern, name_scope, num_inputs);
auto retrieve_node = [](const std::string& name,
const GraphPatternDetector::subgraph_t& subgraph,
const PDPattern& pat) -> Node* {
PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)),
"pattern has no Node called %s", name.c_str());
Node* p = subgraph.at(pat.RetrieveNode(name));
PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str());
return p;
};
int fusion_count{0};
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle SeqPool Concat fuse";
std::vector<std::string> input_names(num_inputs);
std::vector<Node*> input_vars(num_inputs);
auto& fused_pattern = gpd.pattern();
for (int i = 0; i < num_inputs; ++i) {
input_vars[i] =
retrieve_node(name_scope + "/sequence_pool_in_" + std::to_string(i),
subgraph, fused_pattern);
input_names[i] = input_vars[i]->Name();
}
auto* concat_op =
retrieve_node(name_scope + "/concat_op", subgraph, fused_pattern);
auto* concat_out_var =
retrieve_node(name_scope + "/concat_out_var", subgraph, fused_pattern);
auto* seqpool_op0 = retrieve_node(name_scope + "/sequence_pool_op_0",
subgraph, fused_pattern);
// Create New OpDesc
OpDesc op_desc;
op_desc.SetType("fusion_seqpool_concat");
op_desc.SetInput("X", input_names);
op_desc.SetAttr("pooltype", seqpool_op0->Op()->GetAttr("pooltype"));
op_desc.SetAttr("axis", concat_op->Op()->GetAttr("axis"));
op_desc.SetOutput("Out", {concat_out_var->Name()});
auto* op = graph->CreateOpNode(&op_desc);
for (size_t i = 0; i < input_vars.size(); ++i) {
IR_NODE_LINK_TO(input_vars[i], op);
}
IR_NODE_LINK_TO(op, concat_out_var);
std::unordered_set<const Node*> marked_nodes;
for (auto& item : subgraph) {
marked_nodes.insert(item.second);
}
for (size_t i = 0; i < input_vars.size(); ++i) {
marked_nodes.erase(input_vars[i]);
}
marked_nodes.erase(concat_out_var);
GraphSafeRemoveNodes(graph, marked_nodes);
++fusion_count;
};
gpd(graph, handler);
return fusion_count;
}
std::unique_ptr<ir::Graph> SeqPoolConcatFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = 0;
for (int i = MAX_CONCAT_INPUTS; i > 0; --i) {
fusion_count +=
BuildFusion(graph.get(), name_scope_ + "/" + std::to_string(i), i);
}
AddStatis(fusion_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(seqpool_concat_fuse_pass,
paddle::framework::ir::SeqPoolConcatFusePass);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
/**
* Fuse SequencePool(with sum pooltype yet) and Concat;
*
* Before fuse:
* | | |
* seq_pool, seq_pool, ... seq_pool
* \ | ... /
* concat
* |
* After fuse:
* \ | /
* FusionSeqPoolConcat
* |
*/
class SeqPoolConcatFusePass : public FusePassBase {
public:
virtual ~SeqPoolConcatFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"seqpool_concat_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
if (type == "sequence_pool") {
op->SetInput("X", {inputs[0]});
std::string pooltype = "SUM";
op->SetAttr("pooltype", pooltype);
op->SetOutput("MaxIndex", {outputs[0]});
op->SetOutput("Out", {outputs[1]});
} else if (type == "concat") {
op->SetInput("X", inputs);
op->SetAttr("axis", 1);
op->SetOutput("Out", {outputs[0]});
} else {
op->SetInput("X", inputs);
op->SetOutput("Out", outputs);
}
op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kForward));
}
int CountOpType(const ir::Graph* graph,
const std::string& op_type = "fusion_seqpool_concat") {
int count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()->Type() == op_type) {
++count;
}
}
return count;
}
std::unique_ptr<ir::Graph> GetNumNodesOfBeforeAfter(
std::unique_ptr<ir::Graph> graph, int* before, int* after,
const std::string& pass_type = "seqpool_concat_fuse_pass") {
auto pass = PassRegistry::Instance().Get(pass_type);
*before = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
*after = graph->Nodes().size();
return graph;
}
/*
* Before fuse:
* a b c
* | | |
* op1 op2 op3
* / \ / \ / \
* d e f g h i
* \ | /
* concat
* |
* j
* Type of op1, op2 and op3 are sequence_pool, with "SUM" pooltype attr
*
* After fuse:
* a b c
* \ | /
* fusion_seqpool_concat
* |
* j
*/
TEST(SeqPoolConcatFusePass, basic) {
ProgramDesc prog;
for (auto& v : std::vector<std::string>(
{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::LOD_TENSOR);
}
SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
std::vector<std::string>({"d", "e"}));
SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
std::vector<std::string>({"f", "g"}));
SetOp(&prog, "sequence_pool", std::vector<std::string>({"c"}),
std::vector<std::string>({"h", "i"}));
SetOp(&prog, "concat", std::vector<std::string>({"e", "g", "i"}),
std::vector<std::string>({"j"}));
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int before, after;
graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
// Remove 10 Nodes: op1, op2, op3, d, e, f, g, h, i, concat_op
// Add 1 Node: fusion_seqpool_concat
EXPECT_EQ(after, before - 9);
EXPECT_EQ(CountOpType(graph.get()), 1);
}
/*
* Before fuse:
* a b
* | / \
* op1 op2 op3
* / \ / \ \
* c d e f g
* \ /
* concat
* |
* h
* Type of op1 and op2 are sequence_pool, with "SUM" pooltype attr
*
* After fuse:
* a b
* \ / \
* fusion_seqpool_concat op3
* | |
* h g
*/
TEST(SeqPoolConcatFusePass, advanced) {
ProgramDesc prog;
for (auto& v :
std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::LOD_TENSOR);
}
SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
std::vector<std::string>({"c", "d"}));
SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
std::vector<std::string>({"e", "f"}));
SetOp(&prog, "op3", std::vector<std::string>({"b"}),
std::vector<std::string>({"g"}));
SetOp(&prog, "concat", std::vector<std::string>({"d", "f"}),
std::vector<std::string>({"h"}));
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int before, after;
graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
// Remove 7 Nodes: op1, op2, c, d, e, f concat_op
// Add 1 Node: fusion_seqpool_concat
EXPECT_EQ(after, before - 6);
EXPECT_EQ(CountOpType(graph.get()), 1);
}
ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
ProgramDesc prog;
auto new_var = [&](const std::string& name) {
auto* var = prog.MutableBlock(0)->Var(name);
var->SetType(proto::VarType::LOD_TENSOR);
};
std::vector<std::string> concat_inputs;
for (int i = 0; i < num_inputs_of_concat; ++i) {
std::string prefix = "seqpool_op_" + i;
new_var(prefix + "in");
new_var(prefix + "out");
new_var(prefix + "out_unused");
SetOp(&prog, "sequence_pool", std::vector<std::string>({prefix + "in"}),
std::vector<std::string>({prefix + "out", prefix + "out_unused"}));
concat_inputs.push_back(prefix + "out");
}
SetOp(&prog, "concat", concat_inputs,
std::vector<std::string>({"concat_out"}));
return prog;
}
// test more inputs of concat
TEST(SeqPoolConcatFusePass, more_inputs) {
for (int num : {1, 2, 10}) {
ProgramDesc prog = BuildProgramDesc(num);
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
int before, after;
graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
// Remove Nodes: n * (seqpool_op, out, out_unused), and concat_op
// Add Node: fusion_seqpool_concat op
EXPECT_EQ(after, before - num * 3);
EXPECT_EQ(CountOpType(graph.get()), 1);
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(seqpool_concat_fuse_pass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h"
namespace paddle {
namespace framework {
namespace ir {
template <int times>
std::unique_ptr<ir::Graph> TransposeFlattenConcatFusePass<times>::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name =
"transpose_flatten" + std::to_string(times) + "_concat_fuse";
FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd;
std::vector<PDNode *> input_nodes;
for (int i = 0; i < times; i++) {
input_nodes.push_back(gpd.mutable_pattern()
->NewNode("x" + std::to_string(i))
->assert_is_op_input("transpose2", "X")
->AsInput());
}
patterns::TransposeFlattenConcat pattern(gpd.mutable_pattern(), pattern_name);
pattern(input_nodes, times);
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
const int kNumFields = 5;
const int kTransOffset = 1;
const int kTransOutOffset = 2;
const int kFlattenOffset = 3;
const int kFlattenOutOffset = 4;
std::vector<Node *> nodes;
for (int i = 0; i < times; i++) {
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
PADDLE_ENFORCE(
subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
PADDLE_ENFORCE(subgraph.at(input_nodes[i]));
nodes.push_back(subgraph.at(input_nodes[i]));
nodes.push_back(
subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i))));
}
Node *concat_op = subgraph.at(pattern.GetPDNode("concat"));
Node *concat_out = subgraph.at(pattern.GetPDNode("concat_out"));
std::vector<std::string> input_names;
std::vector<int> trans_axis = boost::get<std::vector<int>>(
nodes[kTransOffset]->Op()->GetAttr("axis"));
int flatten_axis =
boost::get<int>(nodes[kFlattenOffset]->Op()->GetAttr("axis"));
int concat_axis = boost::get<int>(concat_op->Op()->GetAttr("axis"));
std::string output_name = concat_out->Name();
for (int i = 0; i < times; i++) {
input_names.push_back(nodes[i * kNumFields]->Name());
}
framework::OpDesc new_op_desc;
new_op_desc.SetType("fusion_transpose_flatten_concat");
new_op_desc.SetInput("X", input_names);
new_op_desc.SetAttr("trans_axis", trans_axis);
new_op_desc.SetAttr("flatten_axis", flatten_axis);
new_op_desc.SetAttr("concat_axis", concat_axis);
new_op_desc.SetOutput("Out", {output_name});
new_op_desc.Flush();
// Create a new node for the fused op.
auto *new_conv_op = graph->CreateOpNode(&new_op_desc);
std::unordered_set<const Node *> delete_nodes;
for (int i = 0; i < times; i++) {
nodes[i * kNumFields]->outputs.push_back(new_conv_op);
new_conv_op->inputs.push_back(nodes[i * kNumFields]);
delete_nodes.insert(nodes[i * kNumFields + kTransOffset]);
delete_nodes.insert(nodes[i * kNumFields + kTransOutOffset]);
delete_nodes.insert(nodes[i * kNumFields + kFlattenOffset]);
delete_nodes.insert(nodes[i * kNumFields + kFlattenOutOffset]);
}
delete_nodes.insert(concat_op);
new_conv_op->outputs.push_back(concat_out);
concat_out->inputs.push_back(new_conv_op);
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(), delete_nodes);
};
gpd(graph.get(), handler);
return graph;
}
template class TransposeFlattenConcatFusePass<1>;
template class TransposeFlattenConcatFusePass<3>;
template class TransposeFlattenConcatFusePass<4>;
template class TransposeFlattenConcatFusePass<5>;
template class TransposeFlattenConcatFusePass<6>;
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(transpose_flatten_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<1>);
REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
REGISTER_PASS(transpose_flatten4_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<4>);
REGISTER_PASS(transpose_flatten5_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<5>);
REGISTER_PASS(transpose_flatten6_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<6>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
// There may be many transpose-flatten structures in a model, and the output of
// these structures will be used as inputs to the concat Op. This pattern will
// be detected by our pass. The times here represents the repeat times of this
// structure.
template <int times>
class TransposeFlattenConcatFusePass : public FusePassBase {
public:
virtual ~TransposeFlattenConcatFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -32,8 +32,11 @@ std::map<std::string, ...@@ -32,8 +32,11 @@ std::map<std::string,
std::string, std::shared_ptr<ngraph::Node>>>)>> std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = { NgraphBridge::NG_NODE_MAP = {
{"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
{"mean", paddle::operators::ngraphs::BuildMeanNode},
{"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode},
{"mul", paddle::operators::ngraphs::BuildMulNode}, {"mul", paddle::operators::ngraphs::BuildMulNode},
{"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
{"scale", paddle::operators::ngraphs::BuildScaleNode},
{"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>}, {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
{"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}, {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
{"top_k", paddle::operators::ngraphs::BuildTopKNode}}; {"top_k", paddle::operators::ngraphs::BuildTopKNode}};
......
...@@ -193,15 +193,14 @@ ParallelExecutor::ParallelExecutor( ...@@ -193,15 +193,14 @@ ParallelExecutor::ParallelExecutor(
const std::unordered_set<std::string> &bcast_vars, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, const std::vector<Scope *> &local_scopes, Scope *scope, const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy)
size_t num_trainers, size_t trainer_id)
: member_(new ParallelExecutorPrivate(places)) { : member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope; member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_; member_->use_cuda_ = exec_strategy.use_cuda_;
member_->build_strategy_ = build_strategy; member_->build_strategy_ = build_strategy;
member_->use_all_reduce_ = member_->use_all_reduce_ =
build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
member_->nranks_ = num_trainers * places.size(); member_->nranks_ = build_strategy.num_trainers_ * places.size();
if (!member_->use_all_reduce_) { if (!member_->use_all_reduce_) {
PADDLE_ENFORCE(places.size() > 1, PADDLE_ENFORCE(places.size() > 1,
...@@ -253,7 +252,8 @@ ParallelExecutor::ParallelExecutor( ...@@ -253,7 +252,8 @@ ParallelExecutor::ParallelExecutor(
} }
member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, num_trainers, trainer_id)); member_->places_, nccl_id, build_strategy.num_trainers_,
build_strategy.trainer_id_));
#else #else
PADDLE_THROW("Not compiled with CUDA"); PADDLE_THROW("Not compiled with CUDA");
#endif #endif
......
...@@ -50,8 +50,7 @@ class ParallelExecutor { ...@@ -50,8 +50,7 @@ class ParallelExecutor {
const std::string &loss_var_name, Scope *scope, const std::string &loss_var_name, Scope *scope,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy, const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy, const BuildStrategy &build_strategy);
size_t num_trainers = 1, size_t trainer_id = 0);
~ParallelExecutor(); ~ParallelExecutor();
......
...@@ -87,11 +87,12 @@ Variable* Scope::Var(const std::string& name) { ...@@ -87,11 +87,12 @@ Variable* Scope::Var(const std::string& name) {
} }
Variable* Scope::Var(std::string* name) { Variable* Scope::Var(std::string* name) {
auto new_name = string::Sprintf("%p.%d", this, vars_.size()); SCOPE_VARS_WRITER_LOCK
auto new_name = std::to_string(reinterpret_cast<uintptr_t>(this)) + "." +
std::to_string(vars_.size());
if (name != nullptr) { if (name != nullptr) {
*name = new_name; *name = new_name;
} }
SCOPE_VARS_WRITER_LOCK
return VarInternal(new_name); return VarInternal(new_name);
} }
......
...@@ -105,13 +105,15 @@ struct VarIdToTypeIndexMapHolder { ...@@ -105,13 +105,15 @@ struct VarIdToTypeIndexMapHolder {
} // namespace detail } // namespace detail
const std::type_index &ToTypeIndex(int var_id) { const std::type_index &VarTraitIdToTypeIndex(int var_id) {
return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id);
} }
const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } const char *ToTypeName(int var_id) {
return VarTraitIdToTypeIndex(var_id).name();
}
int ToTypeId(const std::type_index &type) { int TypeIndexToVarTraitId(const std::type_index &type) {
return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); return detail::VarIdToTypeIndexMapHolder::ToTypeId(type);
} }
......
...@@ -66,8 +66,8 @@ namespace paddle { ...@@ -66,8 +66,8 @@ namespace paddle {
namespace framework { namespace framework {
const char *ToTypeName(int var_id); const char *ToTypeName(int var_id);
const std::type_index &ToTypeIndex(int var_id); const std::type_index &VarTraitIdToTypeIndex(int var_id);
int ToTypeId(const std::type_index &type); int TypeIndexToVarTraitId(const std::type_index &type);
namespace detail { namespace detail {
......
...@@ -45,10 +45,11 @@ struct TypeIndexChecker { ...@@ -45,10 +45,11 @@ struct TypeIndexChecker {
constexpr auto kId = VarTypeTrait<Type>::kId; constexpr auto kId = VarTypeTrait<Type>::kId;
std::type_index actual_type(typeid(Type)); std::type_index actual_type(typeid(Type));
EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name()));
EXPECT_EQ(ToTypeIndex(kId), actual_type); EXPECT_EQ(VarTraitIdToTypeIndex(kId), actual_type);
EXPECT_EQ(ToTypeId(actual_type), kId); EXPECT_EQ(TypeIndexToVarTraitId(actual_type), kId);
EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type); EXPECT_EQ(VarTraitIdToTypeIndex(TypeIndexToVarTraitId(actual_type)),
EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); actual_type);
EXPECT_EQ(TypeIndexToVarTraitId(VarTraitIdToTypeIndex(kId)), kId);
EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT
EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT
......
...@@ -77,6 +77,7 @@ class PreparedOp { ...@@ -77,6 +77,7 @@ class PreparedOp {
framework::OperatorWithKernel::OpKernelFunc func; framework::OperatorWithKernel::OpKernelFunc func;
platform::DeviceContext* dev_ctx; platform::DeviceContext* dev_ctx;
}; };
class OpBase; class OpBase;
class VarBase { class VarBase {
......
...@@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) { ...@@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) {
i++) { i++) {
LOG(INFO) << "data: " << static_cast<float*>(outputs.front().data.data())[i] LOG(INFO) << "data: " << static_cast<float*>(outputs.front().data.data())[i]
<< " result: " << result[i]; << " result: " << result[i];
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i], EXPECT_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
result[i]); 1e-3);
} }
} }
......
...@@ -123,8 +123,6 @@ struct Argument { ...@@ -123,8 +123,6 @@ struct Argument {
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller,
std::function<bool(const framework::ir::Node*)>);
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
......
...@@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument,
for (const std::string &pass_name : passes) { for (const std::string &pass_name : passes) {
auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
// Set some pass attributes.
if (pass_name == "ir_analysis_pass") {
pass->Set("tensorrt_node_teller",
new SubgraphDetector::NodeInsideSubgraphTeller(
argument->tensorrt_node_teller()));
}
if (pass_name == "graph_viz_pass") { if (pass_name == "graph_viz_pass") {
std::string dot_file_path = std::to_string(pass_num) + "_ir_" + std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
(pre_pass.empty() ? "origin" : pre_pass) + (pre_pass.empty() ? "origin" : pre_pass) +
...@@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument,
} }
if (pass_name == "tensorrt_subgraph_pass") { if (pass_name == "tensorrt_subgraph_pass") {
PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
pass->SetNotOwned("tensorrt_node_teller",
argument->tensorrt_node_teller_ptr());
pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
pass->Set("min_subgraph_size", pass->Set("min_subgraph_size",
......
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector)
set(analysis_deps ${analysis_deps}
subgraph_detector tensorrt_subgraph_pass
CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) if (TENSORRT_FOUND)
file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
set(analysis_deps ${analysis_deps}
subgraph_detector tensorrt_subgraph_pass
CACHE INTERNAL "")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
endif()
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -35,8 +36,10 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl( ...@@ -35,8 +36,10 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const { std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
auto teller = auto teller = [](const framework::ir::Node *node) {
Get<SubgraphDetector::NodeInsideSubgraphTeller>("tensorrt_node_teller"); if (!node->IsOp() || !node->Op()) return false;
return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
};
SubGraphFuser fuser(graph.get(), teller, SubGraphFuser fuser(graph.get(), teller,
Get<int>("min_subgraph_size") /*min subgraph size*/); Get<int>("min_subgraph_size") /*min subgraph size*/);
...@@ -232,7 +235,6 @@ std::vector<std::string> ExtractParameters( ...@@ -232,7 +235,6 @@ std::vector<std::string> ExtractParameters(
REGISTER_PASS(tensorrt_subgraph_pass, REGISTER_PASS(tensorrt_subgraph_pass,
paddle::inference::analysis::TensorRtSubgraphPass) paddle::inference::analysis::TensorRtSubgraphPass)
.RequirePassAttr("tensorrt_node_teller")
.RequirePassAttr("max_batch_size") .RequirePassAttr("max_batch_size")
.RequirePassAttr("workspace_size") .RequirePassAttr("workspace_size")
.RequirePassAttr("min_subgraph_size"); .RequirePassAttr("min_subgraph_size");
...@@ -7,4 +7,5 @@ set(analysis_deps ${analysis_deps} ...@@ -7,4 +7,5 @@ set(analysis_deps ${analysis_deps}
ir_graph_build_pass ir_graph_build_pass
ir_analysis_pass ir_analysis_pass
analysis_passes analysis_passes
subgraph_detector
CACHE INTERNAL "") CACHE INTERNAL "")
...@@ -27,9 +27,6 @@ namespace analysis { ...@@ -27,9 +27,6 @@ namespace analysis {
void IrAnalysisComposePass::RunImpl(Argument *argument) { void IrAnalysisComposePass::RunImpl(Argument *argument) {
ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
InitTensorRTAttrs(argument);
}
ApplyIrPasses(argument); ApplyIrPasses(argument);
CollectFusionStatis(argument); CollectFusionStatis(argument);
} }
...@@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const { ...@@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const {
return "ir-analysis-compose-pass"; return "ir-analysis-compose-pass";
} }
void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
if (argument->use_tensorrt_valid() && argument->use_tensorrt()) {
LOG(INFO) << "Initing TensorRT pass";
argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) {
std::unordered_set<std::string> teller_set(
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"conv2d_transpose", "leaky_relu"});
if (!node->IsOp()) return false;
if (teller_set.count(node->Op()->Type())) {
return true;
} else {
return false;
}
});
}
}
void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
std::vector<std::string> passes({ std::vector<std::string> passes({
"ir_graph_build_pass", "ir_analysis_pass", "ir_graph_build_pass", "ir_analysis_pass",
......
...@@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass { ...@@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass {
std::string repr() const override; std::string repr() const override;
private: private:
void InitTensorRTAttrs(Argument* argument);
void ApplyIrPasses(Argument* argument); void ApplyIrPasses(Argument* argument);
void CollectFusionStatis(Argument* argument); void CollectFusionStatis(Argument* argument);
......
...@@ -127,6 +127,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, ...@@ -127,6 +127,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
use_tensorrt_ = true; use_tensorrt_ = true;
tensorrt_workspace_size_ = workspace_size; tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size; tensorrt_max_batchsize_ = max_batch_size;
Update();
} }
void contrib::AnalysisConfig::Update() { void contrib::AnalysisConfig::Update() {
......
...@@ -35,8 +35,11 @@ using framework::proto::ProgramDesc; ...@@ -35,8 +35,11 @@ using framework::proto::ProgramDesc;
using framework::NaiveExecutor; using framework::NaiveExecutor;
using contrib::AnalysisConfig; using contrib::AnalysisConfig;
/* This predictor is based on the original native predictor with IR and Analysis /** \brief This predictor is based on the original native predictor with IR and
* support. It will optimize IR and Parameters in the runtime. * Analysis support.
*
* It will optimize IR and Parameters in the runtime.
*
* TODO(Superjomn) Replace the Navive predictor? * TODO(Superjomn) Replace the Navive predictor?
*/ */
class AnalysisPredictor : public PaddlePredictor { class AnalysisPredictor : public PaddlePredictor {
......
...@@ -19,7 +19,6 @@ limitations under the License. */ ...@@ -19,7 +19,6 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
......
...@@ -92,10 +92,10 @@ if(WITH_MKL) ...@@ -92,10 +92,10 @@ if(WITH_MKL)
if(NOT WIN32) if(NOT WIN32)
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
else(WIN32) else()
set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX} set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml${CMAKE_SHARED_LIBRARY_SUFFIX}
${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX}) ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX})
endif(WIN32) endif()
set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
if(EXISTS ${MKLDNN_PATH}) if(EXISTS ${MKLDNN_PATH})
include_directories("${MKLDNN_PATH}/include") include_directories("${MKLDNN_PATH}/include")
...@@ -128,8 +128,8 @@ else() ...@@ -128,8 +128,8 @@ else()
${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash
snappystream ${EXTERNAL_LIB}) snappystream ${EXTERNAL_LIB})
# NOTE(dzhwinter) shlwapi is deprecated. get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
set(DEPS ${DEPS} libcmt shlwapi) set(DEPS ${DEPS} libcmt ${os_dependency_modules})
endif(NOT WIN32) endif(NOT WIN32)
if(WITH_GPU) if(WITH_GPU)
......
...@@ -116,6 +116,10 @@ D ...@@ -116,6 +116,10 @@ D
--modeldir=$DATA_DIR/mobilenet/model \ --modeldir=$DATA_DIR/mobilenet/model \
--data=$DATA_DIR/mobilenet/data.txt \ --data=$DATA_DIR/mobilenet/data.txt \
--refer=$DATA_DIR/mobilenet/result.txt --refer=$DATA_DIR/mobilenet/result.txt
if [ $? -ne 0 ]; then
echo "trt demo trt_mobilenet_demo runs fail."
exit 1
fi
fi fi
done done
set +x set +x
...@@ -38,8 +38,8 @@ void Main() { ...@@ -38,8 +38,8 @@ void Main() {
std::unique_ptr<PaddlePredictor> predictor; std::unique_ptr<PaddlePredictor> predictor;
paddle::contrib::AnalysisConfig config; paddle::contrib::AnalysisConfig config;
config.EnableUseGpu(100, 0); config.EnableUseGpu(100, 0);
config.SetModel(FLAGS_modeldir + "/__params__", config.SetModel(FLAGS_modeldir + "/__model__",
FLAGS_modeldir + "/__model__"); FLAGS_modeldir + "/__params__");
config.EnableTensorRtEngine(); config.EnableTensorRtEngine();
predictor = CreatePaddlePredictor(config); predictor = CreatePaddlePredictor(config);
......
...@@ -204,11 +204,14 @@ static std::string DescribeTensor(const PaddleTensor &tensor) { ...@@ -204,11 +204,14 @@ static std::string DescribeTensor(const PaddleTensor &tensor) {
os << to_string(l) << "; "; os << to_string(l) << "; ";
} }
os << "\n"; os << "\n";
os << " - data: "; os << " - memory length: " << tensor.data.length();
os << "\n";
os << " - data: ";
int dim = VecReduceToInt(tensor.shape); int dim = VecReduceToInt(tensor.shape);
float *pdata = static_cast<float *>(tensor.data.data());
for (int i = 0; i < dim; i++) { for (int i = 0; i < dim; i++) {
os << static_cast<float *>(tensor.data.data())[i] << " "; os << pdata[i] << " ";
} }
os << '\n'; os << '\n';
return os.str(); return os.str();
...@@ -224,10 +227,12 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) { ...@@ -224,10 +227,12 @@ static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) {
os << to_string(l) << "; "; os << to_string(l) << "; ";
} }
os << "\n"; os << "\n";
os << " - data: ";
PaddlePlace place; PaddlePlace place;
int size; int size;
const auto *data = tensor.data<float>(&place, &size); const auto *data = tensor.data<float>(&place, &size);
os << " - numel: " << size;
os << "\n";
os << " - data: ";
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {
os << data[i] << " "; os << data[i] << " ";
} }
......
...@@ -19,6 +19,8 @@ ...@@ -19,6 +19,8 @@
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
/*! \file */
// Here we include some header files with relative paths, for that in deploy, // Here we include some header files with relative paths, for that in deploy,
// the abstract path of this header file will be changed. // the abstract path of this header file will be changed.
#include "paddle_api.h" // NOLINT #include "paddle_api.h" // NOLINT
...@@ -41,49 +43,125 @@ struct AnalysisConfig { ...@@ -41,49 +43,125 @@ struct AnalysisConfig {
explicit AnalysisConfig(const std::string& prog_file, explicit AnalysisConfig(const std::string& prog_file,
const std::string& params_file); const std::string& params_file);
// Model path related. /** Set model with a directory.
*/
void SetModel(const std::string& model_dir) { model_dir_ = model_dir; } void SetModel(const std::string& model_dir) { model_dir_ = model_dir; }
/** Set model with two specific pathes for program and parameters.
*/
void SetModel(const std::string& prog_file_path, void SetModel(const std::string& prog_file_path,
const std::string& params_file_path); const std::string& params_file_path);
/** Set program file path.
*/
void SetProgFile(const std::string& x) { prog_file_ = x; } void SetProgFile(const std::string& x) { prog_file_ = x; }
/** Set parameter composed file path.
*/
void SetParamsFile(const std::string& x) { params_file_ = x; } void SetParamsFile(const std::string& x) { params_file_ = x; }
/** Get the model directory path.
*/
const std::string& model_dir() const { return model_dir_; } const std::string& model_dir() const { return model_dir_; }
/** Get the program file path.
*/
const std::string& prog_file() const { return prog_file_; } const std::string& prog_file() const { return prog_file_; }
/** Get the composed parameters file.
*/
const std::string& params_file() const { return params_file_; } const std::string& params_file() const { return params_file_; }
// GPU related. // GPU related.
/**
* \brief Turn on GPU.
* @param memory_pool_init_size_mb initial size of the GPU memory pool in MB.
* @param device_id the GPU card to use (default is 0).
*/
void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0); void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0);
/** Turn off the GPU.
*/
void DisableGpu(); void DisableGpu();
/** A bool state telling whether the GPU is turned on.
*/
bool use_gpu() const { return use_gpu_; } bool use_gpu() const { return use_gpu_; }
/** Get the GPU device id.
*/
int gpu_device_id() const { return device_id_; } int gpu_device_id() const { return device_id_; }
/** Get the initial size in MB of the GPU memory pool.
*/
int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; } int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; }
/** Get the proportion of the initial memory pool size compared to the device.
*/
float fraction_of_gpu_memory_for_pool() const; float fraction_of_gpu_memory_for_pool() const;
// Determine whether to perform graph optimization. /** \brief Control whether to perform IR graph optimization.
*
* If turned off, the AnalysisConfig will act just like a NativeConfig.
*/
void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; } void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; }
/** A boolean state tell whether the ir graph optimization is actived.
*/
bool ir_optim() const { return enable_ir_optim_; } bool ir_optim() const { return enable_ir_optim_; }
/** \brief INTERNAL Determine whether to use the feed and fetch operators.
* Just for internal development, not stable yet.
* When ZeroCopyTensor is used, this should turned off.
*/
void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; } void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; }
/** A boolean state telling whether to use the feed and fetch operators.
*/
bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; } bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; }
/** \brief Control whether to specify the inputs' names.
*
* The PaddleTensor type has a `name` member, assign it with the corresponding
* variable name. This is used only when the input PaddleTensors passed to the
* `PaddlePredictor.Run(...)` cannot follow the order in the training phase.
*/
void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; } void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; }
/** A boolean state tell whether the input PaddleTensor names specified should
* be used to reorder the inputs in `PaddlePredictor.Run(...)`.
*/
bool specify_input_name() const { return specify_input_name_; } bool specify_input_name() const { return specify_input_name_; }
/**
* \brief Turn on the TensorRT engine.
*
* The TensorRT engine will accelerate some subgraphes in the original Fluid
* computation graph. In some models such as TensorRT50, GoogleNet and so on,
* it gains significant performance acceleration.
*
* @param workspace_size the memory size(in byte) used for TensorRT workspace.
* @param max_batch_size the maximum batch size of this prediction task,
* better set as small as possible, or performance loss.
* @param min_subgrpah_size the minimum TensorRT subgraph size needed, if a
* subgraph is less than this, it will not transfer to TensorRT engine.
*/
void EnableTensorRtEngine(int workspace_size = 1 << 20, void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3); int max_batch_size = 1, int min_subgraph_size = 3);
/** A boolean state telling whether the TensorRT engine is used.
*/
bool tensorrt_engine_enabled() const { return use_tensorrt_; } bool tensorrt_engine_enabled() const { return use_tensorrt_; }
/** Control whther to debug IR graph analysis phase.
*/
void SwitchIrDebug(int x = true) { ir_debug_ = x; } void SwitchIrDebug(int x = true) { ir_debug_ = x; }
/** Turn on MKLDNN.
*/
void EnableMKLDNN(); void EnableMKLDNN();
/** A boolean state telling whether to use the MKLDNN.
*/
bool mkldnn_enabled() const { return use_mkldnn_; } bool mkldnn_enabled() const { return use_mkldnn_; }
// Set and get the number of cpu math library threads. /** Set and get the number of cpu math library threads.
*/
void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads); void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads);
/** An int state telling how many threads are used in the CPU math library.
*/
int cpu_math_library_num_threads() const { int cpu_math_library_num_threads() const {
return cpu_math_library_num_threads_; return cpu_math_library_num_threads_;
} }
/** Transform the AnalysisConfig to NativeConfig.
*/
NativeConfig ToNativeConfig() const { NativeConfig ToNativeConfig() const {
NativeConfig config; NativeConfig config;
config.model_dir = model_dir_; config.model_dir = model_dir_;
...@@ -95,19 +173,30 @@ struct AnalysisConfig { ...@@ -95,19 +173,30 @@ struct AnalysisConfig {
config.specify_input_name = specify_input_name_; config.specify_input_name = specify_input_name_;
return config; return config;
} }
/** Specify the operator type list to use MKLDNN acceleration.
* @param op_list the operator type list.
*/
void SetMKLDNNOp(std::unordered_set<std::string> op_list) { void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
mkldnn_enabled_op_types_ = op_list; mkldnn_enabled_op_types_ = op_list;
} }
// Specify the memory buffer of program and parameter /** Specify the memory buffer of program and parameter
* @param prog_buffer the memory buffer of program.
* @param prog_buffer_size the size of the data.
* @param params_buffer the memory buffer of the composed parameters file.
* @param params_buffer_size the size of the commposed parameters data.
*/
void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
const char* program_buffer, size_t program_buffer_size); const char* params_buffer, size_t params_buffer_size);
/** A boolean state telling whether the model is set from the CPU memory.
*/
bool model_from_memory() const { return model_from_memory_; } bool model_from_memory() const { return model_from_memory_; }
friend class ::paddle::AnalysisPredictor; friend class ::paddle::AnalysisPredictor;
// NOTE just for developer, not an official API, easily to be broken. /** NOTE just for developer, not an official API, easily to be broken.
// Get a pass builder for customize the passes in IR analysis phase. * Get a pass builder for customize the passes in IR analysis phase.
*/
PassStrategy* pass_builder() const; PassStrategy* pass_builder() const;
protected: protected:
......
...@@ -13,61 +13,76 @@ ...@@ -13,61 +13,76 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
/*! \file paddle_api.h
*/
#include <cassert> #include <cassert>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
/*! \namespace paddle
*/
namespace paddle { namespace paddle {
// Data type. /** paddle data type.
*/
enum PaddleDType { enum PaddleDType {
FLOAT32, FLOAT32,
INT64, INT64,
// TODO(Superjomn) support more data types if needed. // TODO(Superjomn) support more data types if needed.
}; };
/* /**
* Memory menage for PaddleTensor. *\brief Memory menager for PaddleTensor.
* The PaddleBuf holds a buffer for data input or output. The memory can be
* allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
* should be reused for better performance.
* *
* For user allocated memory, the following API can be used: *The PaddleBuf holds a buffer for data input or output. The memory can be
* - PaddleBuf(void* data, size_t length) to set an external memory by *allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
* specifying *should be reused for better performance.
* the memory address and length.
* - Reset(void* data, size_t length) to reset the PaddleBuf with an external
* memory.
* ATTENTION, for user allocated memory, deallocation should be done by users
* externally after the program finished. The PaddleBuf won't do any allocation
* or deallocation.
* *
* To have the PaddleBuf allocate and manage the memory: *For user allocated memory, the following API can be used:
* - PaddleBuf(size_t length) will allocate a memory of size `length`. *- PaddleBuf(void* data, size_t length) to set an external memory by
* - Resize(size_t length) resize the memory to no less than `length`, ATTENTION *specifying
* if the allocated memory is larger than `length`, nothing will done. * the memory address and length.
*- Reset(void* data, size_t length) to reset the PaddleBuf with an external
*memory.
*ATTENTION, for user allocated memory, deallocation should be done by users
*externally after the program finished. The PaddleBuf won't do any allocation
*or deallocation.
*
*To have the PaddleBuf allocate and manage the memory:
*- PaddleBuf(size_t length) will allocate a memory of size `length`.
*- Resize(size_t length) resize the memory to no less than `length`, ATTENTION
* if the allocated memory is larger than `length`, nothing will done.
*/ */
class PaddleBuf { class PaddleBuf {
public: public:
// PaddleBuf allocate memory internally, and manage it. /** PaddleBuf allocate memory internally, and manage it.
*/
explicit PaddleBuf(size_t length) explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {} : data_(new char[length]), length_(length), memory_owned_(true) {}
// Set external memory, the PaddleBuf won't manage it. /** Set external memory, the PaddleBuf won't manage it.
*/
PaddleBuf(void* data, size_t length) PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {} : data_(data), length_(length), memory_owned_{false} {}
// Copy only available when memory is managed externally. /** Copy only available when memory is managed externally.
*/
explicit PaddleBuf(const PaddleBuf&); explicit PaddleBuf(const PaddleBuf&);
// Resize the memory. /** Resize the memory.
*/
void Resize(size_t length); void Resize(size_t length);
// Reset to external memory, with address and length set. /** Reset to external memory, with address and length set.
*/
void Reset(void* data, size_t length); void Reset(void* data, size_t length);
// Tell whether the buffer is empty. /** Tell whether the buffer is empty.
*/
bool empty() const { return length_ == 0; } bool empty() const { return length_ == 0; }
// Get the memory address. /** Get the memory address.
*/
void* data() const { return data_; } void* data() const { return data_; }
// Get the memory length. /** Get the memory length.
*/
size_t length() const { return length_; } size_t length() const { return length_; }
~PaddleBuf() { Free(); } ~PaddleBuf() { Free(); }
...@@ -83,7 +98,8 @@ class PaddleBuf { ...@@ -83,7 +98,8 @@ class PaddleBuf {
bool memory_owned_{true}; bool memory_owned_{true};
}; };
// Basic input and output data structure for PaddlePredictor. /** Basic input and output data structure for PaddlePredictor.
*/
struct PaddleTensor { struct PaddleTensor {
PaddleTensor() = default; PaddleTensor() = default;
std::string name; // variable name. std::string name; // variable name.
...@@ -94,19 +110,23 @@ struct PaddleTensor { ...@@ -94,19 +110,23 @@ struct PaddleTensor {
}; };
enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
// Tensor without copy, currently only supports AnalysisPredictor. /** Tensor without copy, currently only supports AnalysisPredictor.
*/
class ZeroCopyTensor { class ZeroCopyTensor {
public: public:
void Reshape(const std::vector<int>& shape); void Reshape(const std::vector<int>& shape);
// Get the memory in CPU or GPU with specific data type, should Reshape first /** Get the memory in CPU or GPU with specific data type, should Reshape first
// to tell the data size. * to tell the data size.
// Once can directly call this data to feed the data. * Once can directly call this data to feed the data.
// This is for write the input tensor. * This is for write the input tensor.
*/
template <typename T> template <typename T>
T* mutable_data(PaddlePlace place); T* mutable_data(PaddlePlace place);
// Get the memory directly, will return the place and memory size by pointer. /** Get the memory directly, will return the place and element size by
// This is for reading the output tensor. * pointer.
* This is for reading the output tensor.
*/
template <typename T> template <typename T>
T* data(PaddlePlace* place, int* size) const; T* data(PaddlePlace* place, int* size) const;
...@@ -128,8 +148,7 @@ class ZeroCopyTensor { ...@@ -128,8 +148,7 @@ class ZeroCopyTensor {
void* scope_{nullptr}; void* scope_{nullptr};
}; };
/* /** A simple Inference API for Paddle.
* A simple Inference API for Paddle.
*/ */
class PaddlePredictor { class PaddlePredictor {
public: public:
...@@ -138,18 +157,20 @@ class PaddlePredictor { ...@@ -138,18 +157,20 @@ class PaddlePredictor {
PaddlePredictor(const PaddlePredictor&) = delete; PaddlePredictor(const PaddlePredictor&) = delete;
PaddlePredictor& operator=(const PaddlePredictor&) = delete; PaddlePredictor& operator=(const PaddlePredictor&) = delete;
// Predict an record. /** Predict an record.
// The caller should be responsible for allocating and releasing the memory of * The caller should be responsible for allocating and releasing the memory of
// `inputs`. `inputs` should be available until Run returns. Caller should be * `inputs`. `inputs` should be available until Run returns. Caller should be
// responsible for the output tensor's buffer, either allocated or passed from * responsible for the output tensor's buffer, either allocated or passed from
// outside. * outside.
*/
virtual bool Run(const std::vector<PaddleTensor>& inputs, virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data, std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0; int batch_size = -1) = 0;
// Zero copy input and output optimization. /** Zero copy input and output optimization.
// Get the input or output tensors, and operate on their memory directly, * Get the input or output tensors, and operate on their memory directly,
// without copy. * without copy.
*/
virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor( virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string& name) { const std::string& name) {
return nullptr; return nullptr;
...@@ -160,16 +181,19 @@ class PaddlePredictor { ...@@ -160,16 +181,19 @@ class PaddlePredictor {
} }
virtual bool ZeroCopyRun() { return false; } virtual bool ZeroCopyRun() { return false; }
// Clone a predictor that share the model weights, the Cloned predictor should /** Clone a predictor that share the model weights, the Cloned predictor
// be thread-safe. * should be thread-safe.
*/
virtual std::unique_ptr<PaddlePredictor> Clone() = 0; virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
// Destroy the Predictor. /** Destroy the Predictor.
*/
virtual ~PaddlePredictor() = default; virtual ~PaddlePredictor() = default;
// The common configs for all the predictors. /** The common configs for all the predictors.
*/
struct Config { struct Config {
std::string model_dir; // path to the model directory. std::string model_dir; /*!< path to the model directory. */
}; };
}; };
...@@ -177,17 +201,21 @@ struct NativeConfig : public PaddlePredictor::Config { ...@@ -177,17 +201,21 @@ struct NativeConfig : public PaddlePredictor::Config {
// GPU related fields. // GPU related fields.
bool use_gpu{false}; bool use_gpu{false};
int device{0}; int device{0};
float fraction_of_gpu_memory{-1.f}; // Change to a float in (0,1] if needed. float fraction_of_gpu_memory{
-1.f}; /*!< Change to a float in (0,1] if needed. */
// Specify the exact path of program and parameter files. // Specify the exact path of program and parameter files.
std::string prog_file; std::string prog_file;
std::string param_file; std::string param_file;
// Specify the variable's name of each input if input tensors don't follow the /** Specify the variable's name of each input if input tensors don't follow
// `feeds` and `fetches` of the phase `save_inference_model`. * the
* `feeds` and `fetches` of the phase `save_inference_model`.
*/
bool specify_input_name{false}; bool specify_input_name{false};
// Set and get the number of cpu math library threads. /** Set and get the number of cpu math library threads.
*/
void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) { void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
cpu_math_library_num_threads_ = cpu_math_library_num_threads; cpu_math_library_num_threads_ = cpu_math_library_num_threads;
} }
...@@ -201,28 +229,33 @@ struct NativeConfig : public PaddlePredictor::Config { ...@@ -201,28 +229,33 @@ struct NativeConfig : public PaddlePredictor::Config {
int cpu_math_library_num_threads_{1}; int cpu_math_library_num_threads_{1};
}; };
// A factory to help create different predictors. /*! \fn std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&
// * config);
// Usage: *
// * \brief A factory to help create different predictors.
// NativeConfig config; *
// ... // change the configs. * Usage:
// auto native_predictor = CreatePaddlePredictor(config); *
// * NativeConfig config;
// FOR EXTENSION DEVELOPER: * ... // change the configs.
// Different predictors are designated by config type. Similar configs can be * auto native_predictor = CreatePaddlePredictor(config);
// merged, but there shouldn't be a huge config containing different fields for *
// more than one kind of predictors. * FOR EXTENSION DEVELOPER:
* Different predictors are designated by config type. Similar configs can be
* merged, but there shouldn't be a huge config containing different fields for
* more than one kind of predictors.
*/
template <typename ConfigT> template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
// NOTE The following APIs are too trivial, we will discard it in the following /** NOTE The following APIs are too trivial, we will discard it in the following
// versions. * versions.
*/
enum class PaddleEngineKind { enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility. kNative = 0, /*!< Use the native Fluid facility. */
kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. kAutoMixedTensorRT, /*!< Automatically mix Fluid with TensorRT. */
kAnalysis, // More optimization. kAnalysis, /*!< More optimization. */
kAnakin // Use Anakin for inference, not mature yet. kAnakin /*!< Use Anakin for inference, not mature yet. */
}; };
template <typename ConfigT, PaddleEngineKind engine> template <typename ConfigT, PaddleEngineKind engine>
......
...@@ -18,30 +18,39 @@ ...@@ -18,30 +18,39 @@
#include <string> #include <string>
#include <vector> #include <vector>
/*! \file */
/*! \namespace paddle */
namespace paddle { namespace paddle {
/*
* This is a pass builder based on string. It is part of inference API. /** This is a pass builder based on string. It is part of inference API.
*/ */
class PaddlePassBuilder { class PaddlePassBuilder {
public: public:
explicit PaddlePassBuilder(const std::vector<std::string> &passes) explicit PaddlePassBuilder(const std::vector<std::string> &passes)
: passes_(passes) {} : passes_(passes) {}
/** Append a pass to the end of the passes. */
void AppendPass(const std::string &pass_type); void AppendPass(const std::string &pass_type);
/** Insert a pass to a specific position.
* @param idx the position to insert.
* @param pass_type the pass key.
*/
void InsertPass(size_t idx, const std::string &pass_type); void InsertPass(size_t idx, const std::string &pass_type);
// Delete the `idx`-th pass. /** Delete the `idx`-th pass. */
void DeletePass(size_t idx); void DeletePass(size_t idx);
// Delete all the passes that has type `pass_type`. /** Delete all the passes that has type `pass_type`. */
void DeletePass(const std::string &pass_type); void DeletePass(const std::string &pass_type);
// Visualize the computation graph after each pass by generating a DOT /** Visualize the computation graph after each pass by generating a DOT
// language file, one can draw them with the Graphviz toolkit. * language file, one can draw them with the Graphviz toolkit.
*/
void TurnOnDebug(); void TurnOnDebug();
// Human-readible information. /** Human-readible information. */
std::string DebugString(); std::string DebugString();
const std::vector<std::string> &AllPasses() const { return passes_; } const std::vector<std::string> &AllPasses() const { return passes_; }
...@@ -50,16 +59,16 @@ class PaddlePassBuilder { ...@@ -50,16 +59,16 @@ class PaddlePassBuilder {
std::vector<std::string> passes_; std::vector<std::string> passes_;
}; };
/* /**Pass strategy to help control the IR passes.
* Pass strategy to help control the IR passes.
*/ */
class PassStrategy : public PaddlePassBuilder { class PassStrategy : public PaddlePassBuilder {
public: public:
explicit PassStrategy(const std::vector<std::string> &passes) explicit PassStrategy(const std::vector<std::string> &passes)
: PaddlePassBuilder(passes) {} : PaddlePassBuilder(passes) {}
// The MKLDNN control exists in both CPU and GPU mode, because there can be /** The MKLDNN control exists in both CPU and GPU mode, because there can be
// still some CPU kernels running in CPU mode. * still some CPU kernels running in CPU mode.
*/
virtual void EnableMKLDNN() = 0; virtual void EnableMKLDNN() = 0;
bool use_gpu() const { return use_gpu_; } bool use_gpu() const { return use_gpu_; }
...@@ -70,8 +79,7 @@ class PassStrategy : public PaddlePassBuilder { ...@@ -70,8 +79,7 @@ class PassStrategy : public PaddlePassBuilder {
bool use_gpu_{false}; bool use_gpu_{false};
}; };
/* /** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
* The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
*/ */
class CpuPassStrategy : public PassStrategy { class CpuPassStrategy : public PassStrategy {
public: public:
...@@ -81,6 +89,7 @@ class CpuPassStrategy : public PassStrategy { ...@@ -81,6 +89,7 @@ class CpuPassStrategy : public PassStrategy {
passes_.assign({ passes_.assign({
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
"attention_lstm_fuse_pass", // "attention_lstm_fuse_pass", //
"seqpool_concat_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", // "seqconv_eltadd_relu_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", // // "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", // "fc_lstm_fuse_pass", //
...@@ -117,8 +126,7 @@ class CpuPassStrategy : public PassStrategy { ...@@ -117,8 +126,7 @@ class CpuPassStrategy : public PassStrategy {
CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {} CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {}
}; };
/* /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
* The GPU passes strategy, it is used in
*/ */
class GpuPassStrategy : public PassStrategy { class GpuPassStrategy : public PassStrategy {
public: public:
...@@ -133,6 +141,10 @@ class GpuPassStrategy : public PassStrategy { ...@@ -133,6 +141,10 @@ class GpuPassStrategy : public PassStrategy {
"conv_elementwise_add_fuse_pass", // "conv_elementwise_add_fuse_pass", //
}); });
for (int i = 6; i >= 3; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
use_gpu_ = true; use_gpu_ = true;
} }
......
nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
add_subdirectory(plugin) add_subdirectory(plugin)
......
...@@ -39,6 +39,7 @@ class ElementwiseWeightOpConverter : public OpConverter { ...@@ -39,6 +39,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
const framework::Scope& scope, bool test_mode) override { const framework::Scope& scope, bool test_mode) override {
// Here the two nullptr looks strange, that's because the // Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange. // framework::OpDesc's constructor is strange.
nvinfer1::ILayer* layer = nullptr;
framework::OpDesc op_desc(op, nullptr); framework::OpDesc op_desc(op, nullptr);
VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
...@@ -98,13 +99,21 @@ class ElementwiseWeightOpConverter : public OpConverter { ...@@ -98,13 +99,21 @@ class ElementwiseWeightOpConverter : public OpConverter {
0}; 0};
TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
0}; 0};
if (op_type_ == "add") {
nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *X, scale_mode, shift_weights.get(),
scale_weights.get(), power_weights.get());
layer = scale_layer;
} else if (op_type_ == "mul") {
nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *X, scale_mode, scale_weights.get(),
shift_weights.get(), power_weights.get());
layer = scale_layer;
}
nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
shift_weights.get(), scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
layer->setName(
layer->setName(("elementwise_add (Output: " + output_name + ")").c_str()); ("elementwise_" + op_type_ + "(Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str()); layer->getOutput(0)->setName(output_name.c_str());
engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
...@@ -113,6 +122,9 @@ class ElementwiseWeightOpConverter : public OpConverter { ...@@ -113,6 +122,9 @@ class ElementwiseWeightOpConverter : public OpConverter {
engine_->DeclareOutput(output_name); engine_->DeclareOutput(output_name);
} }
} }
protected:
std::string op_type_;
}; };
class ElementwiseTensorOpConverter : public OpConverter { class ElementwiseTensorOpConverter : public OpConverter {
...@@ -188,6 +200,16 @@ const std::unordered_map<std::string, nvinfer1::ElementWiseOperation> ...@@ -188,6 +200,16 @@ const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
{"max", nvinfer1::ElementWiseOperation::kMAX}, {"max", nvinfer1::ElementWiseOperation::kMAX},
}; };
class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter {
public:
ElementwiseWeightAddOpConverter() { op_type_ = "add"; }
};
class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter {
public:
ElementwiseWeightMulOpConverter() { op_type_ = "mul"; }
};
class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter { class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
public: public:
ElementwiseTensorAddOpConverter() { op_type_ = "add"; } ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
...@@ -227,7 +249,10 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter { ...@@ -227,7 +249,10 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter); REGISTER_TRT_OP_CONVERTER(elementwise_add_weight,
ElementwiseWeightAddOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight,
ElementwiseWeightMulOpConverter);
REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor, REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
ElementwiseTensorAddOpConverter); ElementwiseTensorAddOpConverter);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/op_teller.h"
namespace paddle {
namespace inference {
namespace tensorrt {
// Just tell by the op_types.
struct SimpleOpTypeSetTeller : public Teller {
SimpleOpTypeSetTeller() {}
bool operator()(const std::string& op_type,
const framework::OpDesc& desc) override {
return teller_set.count(op_type);
}
private:
std::unordered_set<std::string> teller_set{
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"conv2d_transpose", "leaky_relu"}};
};
bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
for (auto& teller : tellers_) {
if ((*teller)(op_type, desc)) return true;
}
return false;
}
OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); }
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* Single Op teller definition.
* One can override this and define a more complex tell logic, considerring more
* issues such as op_desc.
*/
struct Teller {
virtual bool operator()(const std::string& op_type,
const framework::OpDesc& desc) = 0;
virtual ~Teller() = default;
};
/*
* A real example:
*
* struct SomeTeller : public Teller {
* bool operator()(const std::string& op_type,
* const framework::OpDesc& desc) override {
* return op_type == "fc" && desc.Inputs().size() == 2;
* }
*};
*/
/*
* class OpTeller helps to tell whether a fluid
* operator can be transformed to a TensorRT layer.
*/
class OpTeller {
public:
static OpTeller& Global() {
static std::unique_ptr<OpTeller> x(new OpTeller);
return *x;
}
bool Tell(const std::string& op_type, const framework::OpDesc& desc);
private:
OpTeller();
private:
std::vector<std::unique_ptr<Teller>> tellers_;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
...@@ -41,7 +41,7 @@ endfunction() ...@@ -41,7 +41,7 @@ endfunction()
if(NOT APPLE AND WITH_MKLML) if(NOT APPLE AND WITH_MKLML)
set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc SERIAL)
else() else()
# TODO: fix this test on MACOS and OPENBLAS, the reason is that # TODO: fix this test on MACOS and OPENBLAS, the reason is that
# fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
...@@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 ...@@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
# normal DAM # normal DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc) inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL)
# small DAM # small DAM
set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1) ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1 SERIAL)
# chinese_ner # chinese_ner
set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
...@@ -111,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ...@@ -111,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose
# resnet50 # resnet50
inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
"${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL)
# mobilenet with depthwise_conv op # mobilenet with depthwise_conv op
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)
# anakin # anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
......
...@@ -283,7 +283,7 @@ TEST(Analyzer_rnn1, multi_thread) { ...@@ -283,7 +283,7 @@ TEST(Analyzer_rnn1, multi_thread) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg), TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, 4 /* multi_thread */); input_slots_all, &outputs, 2 /* multi_thread */);
} }
// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing // Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing
...@@ -351,10 +351,10 @@ TEST(Analyzer_rnn1, ZeroCopy) { ...@@ -351,10 +351,10 @@ TEST(Analyzer_rnn1, ZeroCopy) {
ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs));
LOG(INFO) << "native output " << DescribeTensor(native_outputs.front()); LOG(INFO) << "native output " << DescribeTensor(native_outputs.front());
int output_size{0}; int output_size{0}; // this is the number of elements not memory size
auto *zero_copy_data = output_tensor->data<float>(&place, &output_size); auto *zero_copy_data = output_tensor->data<float>(&place, &output_size);
auto *native_data = static_cast<float *>(native_outputs.front().data.data()); auto *native_data = static_cast<float *>(native_outputs.front().data.data());
for (size_t i = 0; i < output_size / sizeof(float); i++) { for (int i = 0; i < output_size; i++) {
EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3); EXPECT_NEAR(zero_copy_data[i], native_data[i], 1e-3);
} }
} }
......
...@@ -121,14 +121,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) { ...@@ -121,14 +121,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data) {
} }
} }
void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->pass_builder()->TurnOnDebug();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size); DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<PaddleTensor> input_slots; std::vector<PaddleTensor> input_slots;
...@@ -141,15 +133,22 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -141,15 +133,22 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
} }
} }
void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->DisableGpu();
cfg->SwitchSpecifyInputNames();
cfg->pass_builder()->TurnOnDebug();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
if (use_mkldnn) {
cfg->EnableMKLDNN();
}
}
void profile(bool use_mkldnn = false) { void profile(bool use_mkldnn = false) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg, use_mkldnn);
if (use_mkldnn) {
cfg.EnableMKLDNN();
}
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg), TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
...@@ -169,16 +168,110 @@ TEST(Analyzer_seq_pool1, compare) { ...@@ -169,16 +168,110 @@ TEST(Analyzer_seq_pool1, compare) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all); reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
// Check the fuse status // Compare Deterministic result
TEST(Analyzer_seq_pool1, fuse_statis) { TEST(Analyzer_seq_pool1, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
void analysis_fuse_statis(bool use_zerocopy) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
cfg.SwitchUseFeedFetchOps(!use_zerocopy);
int num_ops; int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis( auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_EQ(fuse_statis.at("fc_fuse"), 10);
ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
LOG(INFO) << "num_ops: " << num_ops; LOG(INFO) << "num_ops: " << num_ops;
EXPECT_EQ(num_ops, 349); EXPECT_EQ(num_ops, 195);
}
// Check the fuse status
TEST(Analyzer_seq_pool1, fuse_statis) { analysis_fuse_statis(false); }
void PrepareZeroCopyInputs(
const std::unique_ptr<PaddlePredictor> &predictor,
std::vector<std::unique_ptr<ZeroCopyTensor>> *inputs) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
// only feed one batch
const auto &one_batch = data.NextBatch();
inputs->clear();
for (size_t i = 0; i < one_batch.size(); ++i) {
auto &slot = one_batch[i];
auto tensor = predictor->GetInputTensor(slot.name + "_embed");
tensor->Reshape(slot.shape);
tensor->SetLoD({slot.lod});
ZeroCopyTensorAssignData<float>(tensor.get(), slot.data);
inputs->emplace_back(std::move(tensor));
}
}
// return the output values
std::vector<float> zerocopy_profile(int repeat_times) {
AnalysisConfig config;
SetConfig(&config);
config.SwitchUseFeedFetchOps(false);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
PrepareZeroCopyInputs(predictor, &inputs);
auto output_tensor = predictor->GetOutputTensor("reduce_sum_0.tmp_0");
Timer timer;
LOG(INFO) << "Warm up run...";
timer.tic();
predictor->ZeroCopyRun();
PrintTime(FLAGS_batch_size, 1, 1, 0, timer.toc(), 1);
if (FLAGS_profile) {
paddle::platform::ResetProfiler();
}
LOG(INFO) << "Run " << repeat_times << " times...";
timer.tic();
for (int i = 0; i < repeat_times; i++) {
predictor->ZeroCopyRun();
}
PrintTime(FLAGS_batch_size, repeat_times, 1, 0, timer.toc() / repeat_times,
1);
VLOG(3) << "ZeroCopy output: " << DescribeZeroCopyTensor(*output_tensor);
PaddlePlace place;
int output_size{0};
auto *pdata = output_tensor->data<float>(&place, &output_size);
std::vector<float> res(output_size);
for (int i = 0; i < output_size; ++i) {
res[i] = pdata[i];
}
return res;
}
TEST(Analyzer_seq_pool1, zerocopy_profile) { zerocopy_profile(FLAGS_repeat); }
TEST(Analyzer_seq_pool1, zerocopy_fuse_statis) { analysis_fuse_statis(true); }
TEST(Analyzer_seq_pool1, zerocopy_compare_native) {
AnalysisConfig config;
SetConfig(&config);
config.SwitchUseFeedFetchOps(true);
auto predictor = CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
std::vector<PaddleTensor> native_outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
ASSERT_TRUE(predictor->Run(input_slots_all[0], &native_outputs));
EXPECT_EQ(native_outputs.size(), 1UL);
auto zerocopy_output = zerocopy_profile(1);
EXPECT_EQ(zerocopy_output.size() * sizeof(float),
native_outputs.front().data.length());
auto *native_data = static_cast<float *>(native_outputs.front().data.data());
for (size_t i = 0; i < zerocopy_output.size(); ++i) {
EXPECT_NEAR(zerocopy_output[i], native_data[i], 1e-3);
}
} }
} // namespace analysis } // namespace analysis
......
...@@ -62,7 +62,7 @@ std::ostream &operator<<(std::ostream &os, ...@@ -62,7 +62,7 @@ std::ostream &operator<<(std::ostream &os,
const contrib::AnalysisConfig &config) { const contrib::AnalysisConfig &config) {
os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n"; os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
num_spaces++; num_spaces++;
os << *reinterpret_cast<const NativeConfig *>(&config); os << config.ToNativeConfig();
if (!config.model_from_memory()) { if (!config.model_from_memory()) {
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n"; os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.params_file() os << GenSpaces(num_spaces) << "param_file: " << config.params_file()
......
...@@ -54,11 +54,13 @@ namespace paddle { ...@@ -54,11 +54,13 @@ namespace paddle {
namespace inference { namespace inference {
void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
const auto *analysis_config =
reinterpret_cast<const contrib::AnalysisConfig *>(config);
if (use_analysis) { if (use_analysis) {
LOG(INFO) << *reinterpret_cast<const contrib::AnalysisConfig *>(config); LOG(INFO) << *analysis_config;
return; return;
} }
LOG(INFO) << *reinterpret_cast<const NativeConfig *>(config); LOG(INFO) << analysis_config->ToNativeConfig();
} }
void CompareResult(const std::vector<PaddleTensor> &outputs, void CompareResult(const std::vector<PaddleTensor> &outputs,
...@@ -96,12 +98,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs, ...@@ -96,12 +98,13 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
std::unique_ptr<PaddlePredictor> CreateTestPredictor( std::unique_ptr<PaddlePredictor> CreateTestPredictor(
const PaddlePredictor::Config *config, bool use_analysis = true) { const PaddlePredictor::Config *config, bool use_analysis = true) {
const auto *analysis_config =
reinterpret_cast<const contrib::AnalysisConfig *>(config);
if (use_analysis) { if (use_analysis) {
return CreatePaddlePredictor<contrib::AnalysisConfig>( return CreatePaddlePredictor<contrib::AnalysisConfig>(*analysis_config);
*(reinterpret_cast<const contrib::AnalysisConfig *>(config)));
} }
return CreatePaddlePredictor<NativeConfig>( auto native_config = analysis_config->ToNativeConfig();
*(reinterpret_cast<const NativeConfig *>(config))); return CreatePaddlePredictor<NativeConfig>(native_config);
} }
size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); }
...@@ -328,10 +331,7 @@ void CompareNativeAndAnalysis( ...@@ -328,10 +331,7 @@ void CompareNativeAndAnalysis(
const std::vector<std::vector<PaddleTensor>> &inputs) { const std::vector<std::vector<PaddleTensor>> &inputs) {
PrintConfig(config, true); PrintConfig(config, true);
std::vector<PaddleTensor> native_outputs, analysis_outputs; std::vector<PaddleTensor> native_outputs, analysis_outputs;
const auto *analysis_config = TestOneThreadPrediction(config, inputs, &native_outputs, false);
reinterpret_cast<const contrib::AnalysisConfig *>(config);
auto native_config = analysis_config->ToNativeConfig();
TestOneThreadPrediction(&native_config, inputs, &native_outputs, false);
TestOneThreadPrediction(config, inputs, &analysis_outputs, true); TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
CompareResult(analysis_outputs, native_outputs); CompareResult(analysis_outputs, native_outputs);
} }
......
...@@ -99,24 +99,12 @@ void compare(std::string model_dir, bool use_tensorrt) { ...@@ -99,24 +99,12 @@ void compare(std::string model_dir, bool use_tensorrt) {
SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
} }
std::vector<PaddleTensor> native_outputs;
NativeConfig native_config;
SetConfig<NativeConfig>(&native_config, model_dir, true, false,
FLAGS_batch_size);
TestOneThreadPrediction(
reinterpret_cast<PaddlePredictor::Config*>(&native_config), inputs_all,
&native_outputs, false);
std::vector<PaddleTensor> analysis_outputs;
contrib::AnalysisConfig analysis_config; contrib::AnalysisConfig analysis_config;
analysis_config.EnableUseGpu(50, 0);
SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true, SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
use_tensorrt, FLAGS_batch_size); use_tensorrt, FLAGS_batch_size);
TestOneThreadPrediction( CompareNativeAndAnalysis(
reinterpret_cast<PaddlePredictor::Config*>(&analysis_config), inputs_all, reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config),
&analysis_outputs, true); inputs_all);
CompareResult(native_outputs, analysis_outputs);
} }
TEST(TensorRT_mobilenet, compare) { TEST(TensorRT_mobilenet, compare) {
......
...@@ -2,6 +2,3 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce) ...@@ -2,6 +2,3 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce)
cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
cc_binary(visualizer SRCS visualizer.cc DEPS analysis cc_binary(visualizer SRCS visualizer.cc DEPS analysis
paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes) paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
if(WIN32)
target_link_libraries(visualizer shlwapi)
endif(WIN32)
...@@ -297,6 +297,21 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -297,6 +297,21 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>( cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()), groups); layout, framework::vectorize2int(filter->dims()), groups);
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Enable Tensor Core for cudnn backward
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
VLOG(5) << "use cudnn_tensor_op_math for backward";
} else {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
VLOG(5) << "NOT use cudnn_tensor_op_math for backward";
}
#endif
int input_channels = input->dims()[1]; int input_channels = input->dims()[1];
int input_height, input_width, input_depth; int input_height, input_width, input_depth;
if (input->dims().size() == 5) { if (input->dims().size() == 5) {
......
...@@ -318,8 +318,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -318,8 +318,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations"); std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups"); int groups = ctx.Attr<int>("groups");
bool fuse_relu = ctx.Attr<bool>("fuse_relu");
bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
bool force_fp32_output = ctx.Attr<bool>("force_fp32_output"); bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
if (fuse_residual_conn) {
PADDLE_ENFORCE(force_fp32_output != true,
"residual fusion does not support force output with fp32");
}
bool is_conv3d = strides.size() == 3U; bool is_conv3d = strides.size() == 3U;
// TODO(tpatejko): add support for dilation // TODO(tpatejko): add support for dilation
...@@ -329,6 +335,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -329,6 +335,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dilations[2] == 1 dilations[2] == 1
: dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet"); "dilation in convolution is not implemented yet");
PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently"); PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently");
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
...@@ -340,17 +347,35 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -340,17 +347,35 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
GetWeightsTz(weights_tz, g, is_conv3d); GetWeightsTz(weights_tz, g, is_conv3d);
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims()); std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
mkldnn::memory::data_type src_dt =
paddle::framework::ToMKLDNNDataType(input->type());
auto dst_dt = fuse_relu ? paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<uint8_t>::DataType)
: paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<int8_t>::DataType);
if (force_fp32_output) {
dst_dt = paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<float>::DataType);
}
if (fuse_residual_conn) {
auto residual = ctx.Input<Tensor>("ResidualData");
auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type());
if (dst_dt != residual_dt) dst_dt = residual_dt;
}
// Get unique name for storing MKLDNN primitives // Get unique name for storing MKLDNN primitives
std::string key; std::string key;
key.reserve(MaxKeyLength); key.reserve(MaxKeyLength);
mkldnn::memory::data_type src_dt =
paddle::framework::ToMKLDNNDataType(input->type());
platform::ConvMKLDNNHandler::AppendKey( platform::ConvMKLDNNHandler::AppendKey(
&key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt,
input->format(), ctx.op().Output("Output")); input->format(), fuse_relu, fuse_residual_conn,
ctx.op().Output("Output"));
const std::string key_conv_pd = key + "@conv_pd"; const std::string key_conv_pd = key + "@conv_pd";
bool need_s8_to_u8 = false;
std::shared_ptr<mkldnn::convolution_forward> conv_p = nullptr; std::shared_ptr<mkldnn::convolution_forward> conv_p = nullptr;
std::shared_ptr<mkldnn::memory> src_memory_p = nullptr; std::shared_ptr<mkldnn::memory> src_memory_p = nullptr;
std::shared_ptr<mkldnn::memory> user_src_memory_p = nullptr; std::shared_ptr<mkldnn::memory> user_src_memory_p = nullptr;
...@@ -365,14 +390,20 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -365,14 +390,20 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto src_key = key + "@src_mem_p"; auto src_key = key + "@src_mem_p";
auto user_src_key = key + "@user_src_mem_p"; auto user_src_key = key + "@user_src_mem_p";
auto src_reorder_key = key + "@src_mem_preorder_p"; auto src_reorder_key = key + "@src_mem_preorder_p";
auto residual_reorder_key = key + "@residual_data_mem_preorder_p";
conv_p = std::static_pointer_cast<mkldnn::convolution_forward>( conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
dev_ctx.GetBlob(prim_key)); dev_ctx.GetBlob(prim_key));
if (conv_p == nullptr || !is_test) { if (conv_p == nullptr || !is_test) {
const K* filter_data = filter->data<K>(); const K* filter_data = filter->data<K>();
auto scale_in_data = ctx.Attr<float>("Scale_in"); auto scale_in_data = ctx.Attr<float>("Scale_in");
auto scale_in_eltwise_data = ctx.Attr<float>("Scale_in_eltwise");
auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights"); auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
auto scale_out_data = auto scale_out_data =
force_fp32_output ? 1.0f : ctx.Attr<float>("Scale_out"); force_fp32_output ? 1.0f : ctx.Attr<float>("Scale_out");
float sum_scale =
fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f;
bool is_multi_channel = scale_weights_data.size() > 1; bool is_multi_channel = scale_weights_data.size() > 1;
...@@ -413,15 +444,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -413,15 +444,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc( auto weights_md = platform::MKLDNNMemDesc(
weights_tz, memory::data_type::s8, chosen_memory_format); weights_tz, memory::data_type::s8, chosen_memory_format);
auto dst_dt = force_fp32_output
? paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<float>::DataType)
: paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<int8_t>::DataType);
auto dst_md = auto dst_md =
platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format);
// create a conv primitive descriptor and save it for usage in backward // create a conv primitive descriptor and save it for usage in backward
if (bias) { if (bias) {
bias_tz = paddle::framework::vectorize2int(bias->dims()); bias_tz = paddle::framework::vectorize2int(bias->dims());
...@@ -429,11 +454,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -429,11 +454,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
memory::format::x); memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine, strides, paddings, mkldnn_engine,
output_shift_scale, is_test); fuse_relu, fuse_residual_conn,
output_shift_scale, sum_scale, is_test);
} else { } else {
conv_pd = conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine, output_shift_scale, is_test); mkldnn_engine, fuse_relu, fuse_residual_conn,
output_shift_scale, sum_scale, is_test);
} }
// Save conv_pd/src_memory/weights_memory for backward pass // Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd); dev_ctx.SetBlob(key_conv_pd, conv_pd);
...@@ -458,8 +485,46 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -458,8 +485,46 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
user_weights_memory_p, pipeline, is_test, true, scale_weights_data, user_weights_memory_p, pipeline, is_test, true, scale_weights_data,
mask_reorder); mask_reorder);
if (!force_fp32_output) { if (fuse_residual_conn) {
dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler); auto residual_param = ctx.Input<Tensor>("ResidualData");
PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
"Output and elementwise parameter need to have the "
"same dimension sizes");
auto residual_dt =
paddle::framework::ToMKLDNNDataType(residual_param->type());
if (residual_param->format() != handler->GetDstFormat()) {
auto residual_data_tz =
paddle::framework::vectorize2int(residual_param->dims());
auto user_residual_md = platform::MKLDNNMemDesc(
residual_data_tz, residual_dt, residual_param->format());
if (residual_dt == mkldnn::memory::data_type::u8) {
dst_memory_p = platform::SetDstMemory<uint8_t>(
ctx, output, residual_param, user_residual_md, handler,
&pipeline);
} else {
need_s8_to_u8 = fuse_relu;
dst_memory_p = platform::SetDstMemory<int8_t>(
ctx, output, residual_param, user_residual_md, handler,
&pipeline);
}
} else {
output->ShareDataWith(*residual_param);
if (residual_dt == mkldnn::memory::data_type::u8) {
dst_memory_p =
platform::SetDstMemory<uint8_t>(ctx, output, handler);
} else {
need_s8_to_u8 = fuse_relu;
dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
}
}
} else if (!force_fp32_output) {
if (fuse_relu) {
dst_memory_p = platform::SetDstMemory<uint8_t>(ctx, output, handler);
} else {
dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
}
} else { } else {
dst_memory_p = platform::SetDstMemory<float>(ctx, output, handler); dst_memory_p = platform::SetDstMemory<float>(ctx, output, handler);
} }
...@@ -467,11 +532,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -467,11 +532,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// create convolution op primitive // create convolution op primitive
auto scale_bias_key = key + "@scale_bias"; auto scale_bias_key = key + "@scale_bias";
if (bias) { if (bias) {
const float* bias_data = bias->data<float>(); const K* bias_data = bias->data<K>();
auto user_bias_md = platform::MKLDNNMemDesc( auto user_bias_md = platform::MKLDNNMemDesc(
{bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x); {bias_tz}, platform::MKLDNNGetDataType<K>(), memory::format::x);
auto user_bias_memory_p = handler->AcquireBiasMemory( auto user_bias_memory_p = handler->AcquireBiasMemory(
user_bias_md, to_void_cast<float>(bias_data)); user_bias_md, to_void_cast<K>(bias_data));
std::shared_ptr<mkldnn::memory> bias_memory_p; std::shared_ptr<mkldnn::memory> bias_memory_p;
int mask_reorder = is_multi_channel ? 1 << 0 : 1; int mask_reorder = is_multi_channel ? 1 << 0 : 1;
int count = int count =
...@@ -517,21 +582,51 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -517,21 +582,51 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx,
mkldnn_engine, key)); mkldnn_engine, key));
} }
if (!force_fp32_output) {
dst_memory_p = if (fuse_residual_conn) {
platform::SetDstMemoryHandler<int8_t>(ctx, output, handler); auto residual_param = ctx.Input<Tensor>("ResidualData");
auto residual_dt =
paddle::framework::ToMKLDNNDataType(residual_param->type());
output->ShareDataWith(*residual_param);
if (residual_dt == mkldnn::memory::data_type::u8) {
platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
&dst_memory_p);
} else {
platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
&dst_memory_p);
}
} else if (!force_fp32_output) {
if (fuse_relu) {
platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
&dst_memory_p);
} else {
platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
&dst_memory_p);
}
} else { } else {
dst_memory_p = platform::SetDstMemoryHandler<float>(ctx, output, handler,
platform::SetDstMemoryHandler<float>(ctx, output, handler); &dst_memory_p);
} }
if (src_memory_reorder_p) { if (src_memory_reorder_p) {
pipeline.push_back(*src_memory_reorder_p); pipeline.push_back(*src_memory_reorder_p);
} }
auto residual_reorder_p = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(residual_reorder_key));
if (residual_reorder_p) {
pipeline.push_back(*residual_reorder_p);
}
pipeline.push_back(*conv_p); pipeline.push_back(*conv_p);
} }
// push primitive to stream and wait until it's executed // push primitive to stream and wait until it's executed
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
if (need_s8_to_u8) {
output->mutable_data<uint8_t>(ctx.GetPlace());
}
output->set_layout(DataLayout::kMKLDNN); output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory_p)); output->set_format(GetMKLDNNFormat(*dst_memory_p));
} }
...@@ -563,11 +658,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -563,11 +658,22 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
} }
mkldnn::primitive_attr CreatePostOps( mkldnn::primitive_attr CreatePostOps(
const std::vector<float> output_shift_scale) const { bool fuse_relu, bool fuse_residual_conn,
const std::vector<float> output_shift_scale, float sum_scale) const {
mkldnn::primitive_attr conv_attr; mkldnn::primitive_attr conv_attr;
mkldnn::post_ops post_operations; mkldnn::post_ops post_operations;
int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;
conv_attr.set_output_scales(mask, output_shift_scale); conv_attr.set_output_scales(mask, output_shift_scale);
if (fuse_residual_conn) {
post_operations.append_sum(sum_scale);
}
if (fuse_relu) {
constexpr float scale = 1.0f;
constexpr float negative_slope = 0.0f;
constexpr float placeholder = 1.0f; // beta
post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
negative_slope, placeholder);
}
conv_attr.set_post_ops(post_operations); conv_attr.set_post_ops(post_operations);
return conv_attr; return conv_attr;
} }
...@@ -600,9 +706,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -600,9 +706,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides, const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn,
const std::vector<float> output_shift_scale, const std::vector<float> output_shift_scale,
bool is_test) const { const float sum_scale, bool is_test) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
...@@ -613,7 +720,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -613,7 +720,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims, propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims,
padding_dims, padding_dims, mkldnn::padding_kind::zero); padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale); mkldnn::primitive_attr conv_attr = CreatePostOps(
fuse_relu, fuse_residual_conn, output_shift_scale, sum_scale);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine); conv_desc, conv_attr, engine);
...@@ -652,9 +760,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -652,9 +760,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const memory::desc& bias, const memory::desc& dst, const memory::desc& bias, const memory::desc& dst,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn,
const std::vector<float> output_shift_scale, const std::vector<float> output_shift_scale,
bool is_test) const { const float sum_scale, bool is_test) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
...@@ -665,7 +774,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -665,7 +774,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
propagation, mkldnn::convolution_direct, src, weights, bias, dst, propagation, mkldnn::convolution_direct, src, weights, bias, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale); mkldnn::primitive_attr conv_attr = CreatePostOps(
fuse_relu, fuse_residual_conn, output_shift_scale, sum_scale);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine); conv_desc, conv_attr, engine);
...@@ -868,7 +978,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -868,7 +978,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p));
} }
stream(stream::kind::eager).submit(pipeline).wait(); stream(stream::kind::eager).submit(pipeline).wait();
} // Compute() }
}; };
} // namespace operators } // namespace operators
......
...@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_sub, elementwise_sub,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>); ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_sub_grad, elementwise_sub_grad,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h"
#include "paddle/fluid/framework/var_type_inference.h"
namespace paddle {
namespace operators {
class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("W"),
"Input W of FusedEmbeddingSeqPoolOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Ids"),
"Input Ids of FusedEmbeddingSeqPoolOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output of FusedEmbeddingSeqPoolOp should not be null.");
auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids");
const std::string& combiner = ctx->Attrs().Get<std::string>("combiner");
PADDLE_ENFORCE_EQ(table_dims.size(), 2);
PADDLE_ENFORCE_GE(ids_dims.size(), 1,
"The dim size of the 'Ids' tensor must greater than 1.");
PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1,
"The last dimension of the 'Ids' tensor must be 1.");
// we only support sum now
PADDLE_ENFORCE_EQ(combiner, "sum");
int64_t last_dim = table_dims[1];
for (int i = 1; i != ids_dims.size(); ++i) {
last_dim *= ids_dims[i];
}
if (ctx->IsRuntime()) {
framework::Variable* ids_var =
boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Ids")[0]);
const auto& ids_lod = ids_var->Get<LoDTensor>().lod();
// in run time, the LoD of ids must be 1
PADDLE_ENFORCE(ids_lod.size(), 1u,
"The LoD level of Input(Ids) must be 1");
PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");
int64_t batch_size = ids_lod[0].size() - 1;
// in run time, the shape from Ids -> output
// should be [seq_length, 1] -> [batch_size, embedding_size]
ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim}));
} else {
// in compile time, the lod level of ids must be 1
framework::VarDesc* ids_desc =
boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Ids")[0]);
PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1);
// in compile time, the shape from Ids -> output
// should be [-1, 1] -> [-1, embedding_size]
ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim}));
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("W",
"(Tensor) The input represents embedding tensors, "
"which is a learnable parameter.");
AddInput("Ids",
"An input with type int32 or int64 "
"contains the ids to be looked up in W. "
"The last dimension size must be 1.");
AddOutput("Out", "The lookup results, which have the same type as W.");
AddAttr<std::string>("combiner",
"(string, default sum) "
"A string specifying the reduction op. Currently sum "
"are supported, sum computes the weighted sum of the "
"embedding results for each row.")
.SetDefault("sum");
// NOTE(minqiyang): grad_inplace is an temporal attribute,
// please do NOT set this attribute in python layer.
AddAttr<bool>("grad_inplace",
"(boolean, default false) "
"If the grad op reuse the input's variable.")
.SetDefault(false);
AddAttr<bool>("is_sparse",
"(boolean, default false) "
"Sparse update.")
.SetDefault(false);
AddComment(R"DOC(
FusedEmbeddingSeqPool Operator.
Computes embeddings for the given ids and weights.
This operator is used to perform lookups on the parameter W,
then computes the weighted sum of the lookups results for each row
and concatenated into a dense tensor.
The input Ids should carry the LoD (Level of Details) information.
And the output will change the LoD information with input Ids.
)DOC");
}
};
class FusedEmbeddingSeqPoolOpGradDescMaker
: public framework::DefaultGradOpDescMaker<true> {
using ::paddle::framework::DefaultGradOpDescMaker<
true>::DefaultGradOpDescMaker;
protected:
virtual std::string GradOpType() const {
return "fused_embedding_seq_pool_grad";
}
};
class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto table_dims = ctx->GetInputDim("W");
ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
class FusedEmbeddingSeqPoolOpGradVarTypeInference
: public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
auto attr = op_desc.GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr);
if (is_sparse) {
VLOG(3) << "fused_embedding_seq_pool_grad op "
<< framework::GradVarName("W") << " is set to SelectedRows";
block->Var(out_var_name)
->SetType(framework::proto::VarType::SELECTED_ROWS);
} else {
VLOG(3) << "fused_embedding_seq_pool_grad op "
<< framework::GradVarName("W") << " is set to LoDTensor";
block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
}
block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp,
ops::FusedEmbeddingSeqPoolOpGradDescMaker,
ops::FusedEmbeddingSeqPoolOpMaker);
REGISTER_OPERATOR(fused_embedding_seq_pool_grad,
ops::FusedEmbeddingSeqPoolOpGrad,
ops::FusedEmbeddingSeqPoolOpGradVarTypeInference);
REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool,
ops::FusedEmbeddingSeqPoolKernel<float>,
ops::FusedEmbeddingSeqPoolKernel<double>);
REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad,
ops::FusedEmbeddingSeqPoolGradKernel<float>,
ops::FusedEmbeddingSeqPoolGradKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim;
template <typename T>
struct EmbeddingVSumFunctor {
void operator()(const framework::ExecutionContext &context,
const LoDTensor *table_t, const LoDTensor *ids_t,
LoDTensor *output_t) {
auto *table = table_t->data<T>();
int64_t row_number = table_t->dims()[0];
int64_t row_width = table_t->dims()[1];
int64_t last_dim = output_t->dims()[1];
const int64_t *ids = ids_t->data<int64_t>();
auto ids_lod = ids_t->lod()[0];
int64_t ids_count = ids_t->numel() / ids_lod.back();
auto *output = output_t->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int64_t i = 0; i != ids_lod.size() - 1; ++i) {
size_t begin = ids_lod[i] * ids_count;
for (int64_t j = 0; j != ids_count; ++j) {
PADDLE_ENFORCE_LT(ids[begin], row_number);
PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i);
blas.VCOPY(row_width, table + ids[begin + j] * row_width,
output + i * last_dim + j * row_width);
}
for (int64_t r = (ids_lod[i] + 1) * ids_count;
r < ids_lod[i + 1] * ids_count; ++r) {
PADDLE_ENFORCE_LT(ids[r], row_number);
PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i);
blas.AXPY(row_width, 1., table + ids[r] * row_width,
output + i * last_dim + (r % ids_count) * row_width);
}
}
}
};
template <typename T>
class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
const LoDTensor *ids_t = context.Input<LoDTensor>("Ids"); // int tensor
LoDTensor *output_t = context.Output<LoDTensor>("Out"); // float tensor
const LoDTensor *table_var = context.Input<LoDTensor>("W");
const std::string &combiner_type = context.Attr<std::string>("combiner");
if (combiner_type == "sum") {
EmbeddingVSumFunctor<T> functor;
functor(context, table_var, ids_t, output_t);
}
}
};
template <typename T>
class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *table_var = context.InputVar("W");
DDim table_dim;
if (table_var->IsType<LoDTensor>()) {
table_dim = context.Input<LoDTensor>("W")->dims();
} else if (table_var->IsType<SelectedRows>()) {
auto *table_t = context.Input<SelectedRows>("W");
table_dim = table_t->value().dims();
} else {
PADDLE_THROW(
"The parameter W of a LookupTable "
"must be either LoDTensor or SelectedRows");
}
bool is_sparse = context.Attr<bool>("is_sparse");
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
if (is_sparse) {
auto *ids = context.Input<LoDTensor>("Ids");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel();
auto lod = ids->lod()[0];
int64_t row_width = d_output->dims()[1];
framework::Vector<int64_t> *new_rows = d_table->mutable_rows();
new_rows->resize(ids_num);
std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t));
auto *d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_num, table_dim[1]});
T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
const T *d_output_data = d_output->data<T>();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
int64_t in_offset = lod[i] * row_width;
const T *out_pos = d_output_data + i * row_width;
T *in_pos = d_table_data + in_offset;
for (int r = 0; r != h; ++r) {
blas.VCOPY(row_width, out_pos, in_pos + r * row_width);
}
}
} else {
LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
}
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_seqpool_concat_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/operators/jit/kernels.h"
namespace paddle {
namespace operators {
void FusionSeqPoolConcatOp::InferShape(
framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
"Inputs(X) of FusionSeqPoolConcatOp should not be empty.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FusionSeqPoolConcatOp should not be null.");
int axis = ctx->Attrs().Get<int>("axis");
PADDLE_ENFORCE_EQ(axis, 1,
"FusionSeqPoolConcatOp only supports concat axis=1 yet.");
auto ins_dims = ctx->GetInputsDim("X");
const size_t n = ins_dims.size();
PADDLE_ENFORCE_GT(n, 0UL, "Input tensors count should > 0.");
if (n == 1) {
LOG(WARNING) << "Only have one input, may waste memory";
}
// The output height should be confirmed in Compute,
// since input lod is not accessible here.
PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2UL,
"The dims size of first input should be 2.");
ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
}
framework::OpKernelType FusionSeqPoolConcatOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
return framework::OpKernelType(
framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]), ctx.GetPlace());
}
void FusionSeqPoolConcatOpMaker::Make() {
AddInput("X", "(LoDTensor) Input tensors of this operator.").AsDuplicable();
AddOutput("Out", "(LoDTensor) Output tensor of concat operator.");
AddAttr<std::string>("pooltype",
"(string, default 'SUM') some of the pooling "
"pooltype of SequencePoolOp.")
.SetDefault("SUM")
.InEnum({"AVERAGE", "SUM", "SQRT"});
AddAttr<int>("axis",
"The axis along which the input tensors will be concatenated. "
"Only supports concat axis=1 yet.")
.SetDefault(1);
AddComment(R"DOC(
Fusion Sequence Pool of pooltype(sum, average and sqrt) and Concat Operator.
)DOC");
}
template <typename T>
class FusionSeqPoolConcatKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<LoDTensor>("X");
auto* out = ctx.Output<LoDTensor>("Out");
std::string pooltype = ctx.Attr<std::string>("pooltype");
auto x0_lod = ins[0]->lod();
auto x0_dims = ins[0]->dims();
auto y_dims = out->dims();
size_t bs = x0_lod[0].size() - 1;
out->Resize({static_cast<int64_t>(bs), y_dims[1]});
framework::LoD y_lod(1);
y_lod[0].resize(bs + 1);
for (size_t i = 0; i <= bs; ++i) {
y_lod[0][i] = i;
}
out->set_lod(y_lod);
auto place = ctx.GetPlace();
T* y_data = out->mutable_data<T>(place);
int w = ins[0]->numel() / x0_dims[0];
PADDLE_ENFORCE_EQ(y_dims[1] % w, 0,
"The output of dims[1] should be dividable of w");
jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum);
if (pooltype == "AVERAGE") {
attr.type = jit::SeqPoolType::kAvg;
} else if (pooltype == "SQRT") {
attr.type = jit::SeqPoolType::kSqrt;
}
auto seqpool =
jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
attr);
size_t n = ins.size();
size_t dst_step_size = n * w;
for (size_t i = 0; i < n; ++i) {
auto x_dims = ins[i]->dims();
auto x_lod = ins[i]->lod()[0];
const T* src = ins[i]->data<T>();
T* dst = y_data + i * w;
PADDLE_ENFORCE_EQ(static_cast<int>(ins[i]->numel() / x_dims[0]), w,
"Width of all inputs should be equal.");
PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1,
"Batchsize of all inputs should be equal.");
for (size_t j = 0; j < bs; ++j) {
attr.h = static_cast<int>(x_lod[j + 1] - x_lod[j]);
seqpool(src, dst, &attr);
dst += dst_step_size;
src += attr.h * attr.w;
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_seqpool_concat, ops::FusionSeqPoolConcatOp,
ops::FusionSeqPoolConcatOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(fusion_seqpool_concat,
ops::FusionSeqPoolConcatKernel<float>,
ops::FusionSeqPoolConcatKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
class FusionSeqPoolConcatOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FusionSeqPoolConcatOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
} // namespace operators
} // namespace paddle
...@@ -52,11 +52,11 @@ struct BenchFunc { ...@@ -52,11 +52,11 @@ struct BenchFunc {
for (int i = 0; i < FLAGS_burning; ++i) { for (int i = 0; i < FLAGS_burning; ++i) {
tgt(args...); tgt(args...);
} }
auto start = paddle::platform::PosixInNsec() / 1e-3; auto start = paddle::platform::PosixInNsec() * 1e-3;
for (int i = 0; i < FLAGS_repeat; ++i) { for (int i = 0; i < FLAGS_repeat; ++i) {
tgt(args...); tgt(args...);
} }
auto end = paddle::platform::PosixInNsec() / 1e-3; auto end = paddle::platform::PosixInNsec() * 1e-3;
return static_cast<double>(end - start) / FLAGS_repeat; return static_cast<double>(end - start) / FLAGS_repeat;
} }
}; };
...@@ -190,6 +190,26 @@ void BenchGRUKernel() { ...@@ -190,6 +190,26 @@ void BenchGRUKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchSeqPoolKernel() {
std::vector<jit::SeqPoolType> pool_types = {
jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
for (auto type : pool_types) {
for (int w : TestSizes()) {
jit::seq_pool_attr_t attr(w, type);
for (int h : TestSizes()) {
attr.h = h;
std::vector<T> x(h * w), y(w);
RandomVec<T>(h * w, x.data(), -2.f, 2.f);
const T* x_data = x.data();
T* y_data = y.data();
BenchAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType>(attr, x_data,
y_data, &attr);
}
}
}
}
// Benchmark all jit kernels including jitcode, mkl and refer. // Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...] // To use this tool, run command: ./benchmark [options...]
// Options: // Options:
...@@ -228,4 +248,7 @@ int main(int argc, char* argv[]) { ...@@ -228,4 +248,7 @@ int main(int argc, char* argv[]) {
BenchGRUKernel<jit::kGRUH1, T, PlaceType>(); BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>(); BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>(); BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
// seq pool function
BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>();
} }
...@@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1) ...@@ -26,3 +26,4 @@ USE_JITKERNEL_GEN(kGRUH1)
USE_JITKERNEL_GEN(kGRUHtPart1) USE_JITKERNEL_GEN(kGRUHtPart1)
USE_JITKERNEL_GEN(kGRUHtPart2) USE_JITKERNEL_GEN(kGRUHtPart2)
USE_JITKERNEL_GEN(kNCHW16CMulNC) USE_JITKERNEL_GEN(kNCHW16CMulNC)
USE_JITKERNEL_GEN(kSeqPool)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/seqpool.h"
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
void SeqPoolJitCode::genCode() {
constexpr int block = YMM_FLOAT_BLOCK;
constexpr int max_num_regs = 8;
const int num_block = w_ / block;
const int num_groups = num_block / max_num_regs;
int rest_num_regs = num_block % max_num_regs;
mov(reg32_int_h, dword[param_attr]);
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
mov(reg_tmp, reinterpret_cast<size_t>(exp_float_consts));
vmovups(xmm_t(1), ptr[reg_tmp + OFFSET_EXP_ONE]);
mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
fild(dword[param_attr]);
fstp(dword[reg_tmp]);
vmovss(xmm_t(0), ptr[reg_tmp]);
if (type_ == SeqPoolType::kSqrt) {
vsqrtps(xmm_t(0), xmm_t(0));
}
vdivps(xmm_t(1), xmm_t(1), xmm_t(0));
vmovss(ptr[reg_tmp], xmm_t(1));
}
const int group_len = max_num_regs * block * sizeof(float);
for (int g = 0; g < num_groups; ++g) {
pool_height<ymm_t>(g * group_len, block, max_num_regs);
}
if (rest_num_regs > 0) {
pool_height<ymm_t>(num_groups * group_len, block, rest_num_regs);
}
// part of rest_w * height
const int rest = w_ % block;
pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs);
ret();
}
class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
public:
bool UseMe(const seq_pool_attr_t& attr) const override {
return platform::MayIUse(platform::avx);
}
size_t CodeSize(const seq_pool_attr_t& attr) const override {
return 96 +
((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
4 /* load, mul and save */ +
256) *
8;
}
std::unique_ptr<GenBase> CreateJitCode(
const seq_pool_attr_t& attr) const override {
PADDLE_ENFORCE_GT(attr.w, 0);
PADDLE_ENFORCE_GT(attr.h, 0);
return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
}
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
class SeqPoolJitCode : public JitCode {
public:
explicit SeqPoolJitCode(const seq_pool_attr_t& attr,
size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) {
if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg ||
type_ == SeqPoolType::kSqrt)) {
LOG(FATAL) << "Only support sum pool yet ";
}
fp_h_[0] = 1.f;
this->genCode();
}
virtual const char* name() const {
std::string base = "SeqPoolJitCode";
if (type_ == SeqPoolType::kSum) {
base += "_Sum";
} else if (type_ == SeqPoolType::kAvg) {
base += "_Avg";
} else if (type_ == SeqPoolType::kSqrt) {
base += "_Sqrt";
}
base += ("_W" + std::to_string(w_));
return base.c_str();
}
void genCode() override;
protected:
template <typename JMM>
void pool_height(int w_offset, int block, int max_num_regs) {
int offset = w_offset;
for (int i = 0; i < max_num_regs; ++i) {
vmovups(JMM(i), ptr[param_src + offset]);
offset += sizeof(float) * block;
}
cmp(reg32_int_h, 1);
Label l_next_h, l_h_done;
jle(l_h_done, T_NEAR);
mov(reg_h_i, 1);
mov(reg_tmp, param_src);
add(reg_tmp, w_ * sizeof(float) + w_offset);
L(l_next_h);
{
mov(reg_ptr_src_i, reg_tmp);
for (int i = 0; i < max_num_regs; ++i) {
vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]);
// sum anyway
vaddps(JMM(i), JMM(i), JMM(i + max_num_regs));
add(reg_ptr_src_i, sizeof(float) * block);
}
inc(reg_h_i);
add(reg_tmp, w_ * sizeof(float));
cmp(reg_h_i, reg32_int_h);
jl(l_next_h, T_NEAR);
}
L(l_h_done);
// save right now
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
vbroadcastss(JMM(max_num_regs), ptr[reg_tmp]);
}
offset = w_offset;
for (int i = 0; i < max_num_regs; ++i) {
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
vmulps(JMM(i), JMM(i), JMM(max_num_regs));
}
vmovups(ptr[param_dst + offset], JMM(i));
offset += sizeof(float) * block;
}
}
void pool_height_of_rest_width(int rest, int w_offset, int max_num_regs) {
const int rest_used_num_regs = load_rest(rest, w_offset, 0);
const bool has_block4 = rest / 4 > 0;
const bool has_block2 = (rest % 4) / 2 > 0;
const bool has_block1 = (rest % 2) == 1;
cmp(reg32_int_h, 1);
Label l_next_h, l_h_done;
jle(l_h_done, T_NEAR);
mov(reg_h_i, 1);
mov(reg_tmp, param_src);
add(reg_tmp, w_ * sizeof(float) + w_offset);
L(l_next_h);
{
int reg_idx = 0;
mov(reg_ptr_src_i, reg_tmp);
if (has_block4) {
vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
add(reg_ptr_src_i, sizeof(float) * 4);
reg_idx++;
}
if (has_block2) {
vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
add(reg_ptr_src_i, sizeof(float) * 2);
reg_idx++;
}
if (has_block1) {
vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
reg_idx++;
}
PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs,
"All heights should use same regs");
for (int i = 0; i < reg_idx; ++i) {
vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
}
inc(reg_h_i);
add(reg_tmp, w_ * sizeof(float));
cmp(reg_h_i, reg32_int_h);
jl(l_next_h, T_NEAR);
}
L(l_h_done);
// save right now
if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) {
mov(reg_tmp, reinterpret_cast<size_t>(fp_h_));
vbroadcastss(xmm_t(max_num_regs), ptr[reg_tmp]);
for (int i = 0; i < rest_used_num_regs; ++i) {
vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs));
}
}
save_rest(rest, w_offset);
}
// return the number of used regs, use start from reg 0
int load_rest(int rest, int w_offset, const int num_shift_regs,
const int reg_start = 0) {
const bool has_block4 = rest / 4 > 0;
const bool has_block2 = (rest % 4) / 2 > 0;
const bool has_block1 = (rest % 2) == 1;
int reg_idx = reg_start;
if (has_block4) {
vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
w_offset += sizeof(float) * 4;
reg_idx++;
}
if (has_block2) {
vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
w_offset += sizeof(float) * 2;
reg_idx++;
}
if (has_block1) {
vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]);
reg_idx++;
}
return reg_idx;
}
// use reg start from 0
void save_rest(int rest, int w_offset, int reg_start = 0) {
const bool has_block4 = rest / 4 > 0;
const bool has_block2 = (rest % 4) / 2 > 0;
const bool has_block1 = (rest % 2) == 1;
int reg_idx = reg_start;
if (has_block4) {
vmovups(ptr[param_dst + w_offset], xmm_t(reg_idx));
w_offset += sizeof(float) * 4;
reg_idx++;
}
if (has_block2) {
vmovq(ptr[param_dst + w_offset], xmm_t(reg_idx));
w_offset += sizeof(float) * 2;
reg_idx++;
}
if (has_block1) {
vmovss(ptr[param_dst + w_offset], xmm_t(reg_idx));
}
}
private:
float ALIGN32_BEG fp_h_[1] ALIGN32_END;
int w_;
SeqPoolType type_;
reg64_t param_src{abi_param1};
reg64_t param_dst{abi_param2};
reg64_t param_attr{abi_param3};
reg64_t reg_tmp{rax};
reg32_t reg32_int_h{r8d};
reg32_t reg32_fp_h{r9d};
reg64_t reg_h_i{r10};
reg64_t reg_ptr_src_i{r11};
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
...@@ -26,6 +26,7 @@ namespace jit { ...@@ -26,6 +26,7 @@ namespace jit {
const char* to_string(KernelType kt) { const char* to_string(KernelType kt) {
switch (kt) { switch (kt) {
ONE_CASE(kNone);
ONE_CASE(kVMul); ONE_CASE(kVMul);
ONE_CASE(kVAdd); ONE_CASE(kVAdd);
ONE_CASE(kVAddRelu); ONE_CASE(kVAddRelu);
...@@ -45,12 +46,26 @@ const char* to_string(KernelType kt) { ...@@ -45,12 +46,26 @@ const char* to_string(KernelType kt) {
ONE_CASE(kCRFDecoding); ONE_CASE(kCRFDecoding);
ONE_CASE(kLayerNorm); ONE_CASE(kLayerNorm);
ONE_CASE(kNCHW16CMulNC); ONE_CASE(kNCHW16CMulNC);
ONE_CASE(kSeqPool);
default: default:
PADDLE_THROW("Not support type: %d, or forget to add it.", kt); PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
return "NOT JITKernel"; return "NOT JITKernel";
} }
return nullptr; return nullptr;
} }
const char* to_string(SeqPoolType tp) {
switch (tp) {
ONE_CASE(kNonePoolType);
ONE_CASE(kSum);
ONE_CASE(kAvg);
ONE_CASE(kSqrt);
default:
PADDLE_THROW("Not support type: %d, or forget to add it.", tp);
return "NOT PoolType";
}
return nullptr;
}
#undef ONE_CASE #undef ONE_CASE
KernelType to_kerneltype(const std::string& act) { KernelType to_kerneltype(const std::string& act) {
......
...@@ -119,6 +119,7 @@ typename KernelTuples::func_type Get( ...@@ -119,6 +119,7 @@ typename KernelTuples::func_type Get(
} }
const char* to_string(KernelType kt); const char* to_string(KernelType kt);
const char* to_string(SeqPoolType kt);
KernelType to_kerneltype(const std::string& act); KernelType to_kerneltype(const std::string& act);
...@@ -134,6 +135,11 @@ inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { ...@@ -134,6 +135,11 @@ inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) {
<< "],act_cand[" << to_string(attr.act_cand) << "]"; << "],act_cand[" << to_string(attr.act_cand) << "]";
return os; return os;
} }
inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) {
os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type["
<< to_string(attr.type) << "]";
return os;
}
} // namespace jit } // namespace jit
} // namespace operators } // namespace operators
......
...@@ -41,8 +41,16 @@ typedef enum { ...@@ -41,8 +41,16 @@ typedef enum {
kCRFDecoding, kCRFDecoding,
kLayerNorm, kLayerNorm,
kNCHW16CMulNC, kNCHW16CMulNC,
kSeqPool,
} KernelType; } KernelType;
typedef enum {
kNonePoolType = 0,
kSum = 1,
kAvg,
kSqrt,
} SeqPoolType;
template <typename T> template <typename T>
struct XYZNTuples { struct XYZNTuples {
typedef T data_type; typedef T data_type;
...@@ -112,6 +120,21 @@ struct GRUTuples { ...@@ -112,6 +120,21 @@ struct GRUTuples {
typedef void (*func_type)(gru_t*, const gru_attr_t*); typedef void (*func_type)(gru_t*, const gru_attr_t*);
}; };
typedef struct seq_pool_attr_s {
int h, w; // h should always be the first one
SeqPoolType type;
seq_pool_attr_s() = default;
explicit seq_pool_attr_s(int width, SeqPoolType pool_type, int height = 1)
: h(height), w(width), type(pool_type) {}
} seq_pool_attr_t;
template <typename T>
struct SeqPoolTuples {
typedef T data_type;
typedef seq_pool_attr_t attr_type;
typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*);
};
template <typename T> template <typename T>
struct CRFDecodingTuples { struct CRFDecodingTuples {
typedef T data_type; typedef T data_type;
......
...@@ -42,6 +42,13 @@ size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) { ...@@ -42,6 +42,13 @@ size_t JitCodeKey<gru_attr_t>(const gru_attr_t& attr) {
(static_cast<int>(attr.act_cand) << act_type_shift); (static_cast<int>(attr.act_cand) << act_type_shift);
} }
template <>
size_t JitCodeKey<seq_pool_attr_t>(const seq_pool_attr_t& attr) {
size_t key = attr.w;
constexpr int pool_type_shift = 3;
return (key << pool_type_shift) + static_cast<int>(attr.type);
}
} // namespace jit } // namespace jit
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -9,3 +9,4 @@ USE_JITKERNEL_MORE(kVScal, mkl) ...@@ -9,3 +9,4 @@ USE_JITKERNEL_MORE(kVScal, mkl)
USE_JITKERNEL_MORE(kVExp, mkl) USE_JITKERNEL_MORE(kVExp, mkl)
USE_JITKERNEL_MORE(kVSigmoid, mkl) USE_JITKERNEL_MORE(kVSigmoid, mkl)
USE_JITKERNEL_MORE(kVTanh, mkl) USE_JITKERNEL_MORE(kVTanh, mkl)
USE_JITKERNEL_MORE(kSeqPool, mkl)
...@@ -72,6 +72,26 @@ void VExp<double>(const double* x, double* y, int n) { ...@@ -72,6 +72,26 @@ void VExp<double>(const double* x, double* y, int n) {
platform::dynload::vdExp(n, x, y); platform::dynload::vdExp(n, x, y);
} }
template <>
void VCopy<float>(const float* x, float* y, int n) {
platform::dynload::cblas_scopy(n, x, 1, y, 1);
}
template <>
void VCopy<double>(const double* x, double* y, int n) {
platform::dynload::cblas_dcopy(n, x, 1, y, 1);
}
template <>
void VAXPY<float>(float a, const float* x, float* y, int n) {
platform::dynload::cblas_saxpy(n, a, x, 1, y, 1);
}
template <>
void VAXPY<double>(double a, const double* x, double* y, int n) {
platform::dynload::cblas_daxpy(n, a, x, 1, y, 1);
}
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template <> template <>
bool VMulKernel<float>::UseMe(const int& d) const { bool VMulKernel<float>::UseMe(const int& d) const {
...@@ -103,6 +123,16 @@ bool VTanhKernel<float>::UseMe(const int& d) const { ...@@ -103,6 +123,16 @@ bool VTanhKernel<float>::UseMe(const int& d) const {
return d > 7; return d > 7;
} }
template <>
bool SeqPoolKernel<float>::UseMe(const seq_pool_attr_t& attr) const {
return true;
}
template <>
bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
return true;
}
#define AWALYS_USE_ME_WITH_DOUBLE(func) \ #define AWALYS_USE_ME_WITH_DOUBLE(func) \
template <> \ template <> \
bool func##Kernel<double>::UseMe(const int& d) const { \ bool func##Kernel<double>::UseMe(const int& d) const { \
...@@ -135,5 +165,6 @@ REGISTER_MKL_KERNEL(kVScal, VScal); ...@@ -135,5 +165,6 @@ REGISTER_MKL_KERNEL(kVScal, VScal);
REGISTER_MKL_KERNEL(kVExp, VExp); REGISTER_MKL_KERNEL(kVExp, VExp);
REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
REGISTER_MKL_KERNEL(kVTanh, VTanh); REGISTER_MKL_KERNEL(kVTanh, VTanh);
REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
#undef REGISTER_MKL_KERNEL #undef REGISTER_MKL_KERNEL
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <cmath>
#include <type_traits> #include <type_traits>
#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_base.h"
...@@ -35,6 +36,12 @@ void VScal(const T* a, const T* x, T* y, int n); ...@@ -35,6 +36,12 @@ void VScal(const T* a, const T* x, T* y, int n);
template <typename T> template <typename T>
void VExp(const T* x, T* y, int n); void VExp(const T* x, T* y, int n);
template <typename T>
void VCopy(const T* x, T* y, int n);
template <typename T>
void VAXPY(T a, const T* x, T* y, int n);
template <typename T> template <typename T>
void VSigmoid(const T* x, T* y, int n) { void VSigmoid(const T* x, T* y, int n) {
const T min = SIGMOID_THRESHOLD_MIN; const T min = SIGMOID_THRESHOLD_MIN;
...@@ -60,6 +67,23 @@ void VTanh(const T* x, T* y, int n) { ...@@ -60,6 +67,23 @@ void VTanh(const T* x, T* y, int n) {
} }
} }
template <typename T>
void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
VCopy<T>(x, y, attr->w);
for (int h = 1; h != attr->h; ++h) {
VAXPY<T>(static_cast<T>(1), x + h * attr->w, y, attr->w);
}
if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) {
T scalar = static_cast<T>(1);
if (attr->type == SeqPoolType::kAvg) {
scalar = scalar / static_cast<T>(attr->h);
} else {
scalar = scalar / std::sqrt(static_cast<T>(attr->h));
}
VScal<T>(&scalar, y, y, attr->w);
}
}
#define DECLARE_MKL_KERNEL(name, tuples) \ #define DECLARE_MKL_KERNEL(name, tuples) \
template <typename T> \ template <typename T> \
class name##Kernel : public KernelMore<tuples<T>> { \ class name##Kernel : public KernelMore<tuples<T>> { \
...@@ -81,6 +105,8 @@ DECLARE_MKL_KERNEL(VExp, XYNTuples); ...@@ -81,6 +105,8 @@ DECLARE_MKL_KERNEL(VExp, XYNTuples);
DECLARE_MKL_KERNEL(VSigmoid, XYNTuples); DECLARE_MKL_KERNEL(VSigmoid, XYNTuples);
DECLARE_MKL_KERNEL(VTanh, XYNTuples); DECLARE_MKL_KERNEL(VTanh, XYNTuples);
DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
#undef DECLARE_MKL_KERNEL #undef DECLARE_MKL_KERNEL
} // namespace mkl } // namespace mkl
......
...@@ -26,3 +26,4 @@ USE_JITKERNEL_REFER(kGRUHtPart2) ...@@ -26,3 +26,4 @@ USE_JITKERNEL_REFER(kGRUHtPart2)
USE_JITKERNEL_REFER(kCRFDecoding) USE_JITKERNEL_REFER(kCRFDecoding)
USE_JITKERNEL_REFER(kLayerNorm) USE_JITKERNEL_REFER(kLayerNorm)
USE_JITKERNEL_REFER(kNCHW16CMulNC) USE_JITKERNEL_REFER(kNCHW16CMulNC)
USE_JITKERNEL_REFER(kSeqPool)
...@@ -47,4 +47,6 @@ REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm); ...@@ -47,4 +47,6 @@ REGISTER_REFER_KERNEL(kLayerNorm, LayerNorm);
REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC); REGISTER_REFER_KERNEL(kNCHW16CMulNC, NCHW16CMulNC);
REGISTER_REFER_KERNEL(kSeqPool, SeqPool);
#undef REGISTER_REFER_KERNEL #undef REGISTER_REFER_KERNEL
...@@ -332,6 +332,28 @@ void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) { ...@@ -332,6 +332,28 @@ void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) {
} }
} }
template <typename T>
void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
for (int w = 0; w < attr->w; ++w) {
const T* src = x + w;
T* dst = y + w;
*dst = static_cast<T>(0);
for (int h = 0; h < attr->h; ++h) {
*dst = *dst + *src;
src += attr->w;
}
}
if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) {
T scalar = static_cast<T>(1);
if (attr->type == SeqPoolType::kAvg) {
scalar = scalar / static_cast<T>(attr->h);
} else {
scalar = scalar / std::sqrt(static_cast<T>(attr->h));
}
VScal<T>(&scalar, y, y, attr->w);
}
}
#define DECLARE_REFER_KERNEL(name, tuples) \ #define DECLARE_REFER_KERNEL(name, tuples) \
template <typename T> \ template <typename T> \
class name##Kernel : public ReferKernel<tuples<T>> { \ class name##Kernel : public ReferKernel<tuples<T>> { \
...@@ -370,6 +392,8 @@ DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples); ...@@ -370,6 +392,8 @@ DECLARE_REFER_KERNEL(LayerNorm, LayerNormTuples);
DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples); DECLARE_REFER_KERNEL(NCHW16CMulNC, NCHW16CMulNCTuples);
DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples);
#undef DECLARE_REFER_KERNEL #undef DECLARE_REFER_KERNEL
} // namespace refer } // namespace refer
......
...@@ -211,6 +211,24 @@ struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>, ...@@ -211,6 +211,24 @@ struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
} }
}; };
template <typename T>
struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>,
std::vector<T>> {
void operator()(const typename jit::SeqPoolTuples<T>::func_type tgt,
const std::vector<T>& x, const std::vector<T>& yref,
const typename jit::SeqPoolTuples<T>::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(x.size() % yref.size(), 0);
int w = yref.size();
std::vector<T> y(w);
const T* x_data = x.data();
const T* yref_data = yref.data();
T* y_data = y.data();
tgt(x_data, y_data, &attr);
ExpectEQ<T>(y_data, yref_data, w);
}
};
template <paddle::operators::jit::KernelType KT, typename KernelTuples, template <paddle::operators::jit::KernelType KT, typename KernelTuples,
typename PlaceType, typename... Args> typename PlaceType, typename... Args>
void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
...@@ -415,6 +433,31 @@ void TestGRUKernel() { ...@@ -415,6 +433,31 @@ void TestGRUKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestSeqPoolKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
std::vector<jit::SeqPoolType> pool_types = {
jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
for (auto type : pool_types) {
for (int w : TestSizes()) {
jit::seq_pool_attr_t attr(w, type);
for (int h : TestSizes()) {
attr.h = h;
auto ref = jit::GetRefer<KT, jit::SeqPoolTuples<T>>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> x(h * w), yref(w);
RandomVec<T>(h * w, x.data(), -2.f, 2.f);
const T* x_data = x.data();
T* yref_data = yref.data();
ref(x_data, yref_data, &attr);
VLOG(10) << attr;
TestAllImpls<KT, jit::SeqPoolTuples<T>, PlaceType, std::vector<T>,
std::vector<T>>(attr, x, yref, attr);
}
}
}
}
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestNCHW16CMulNCKernel() { void TestNCHW16CMulNCKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
...@@ -569,6 +612,12 @@ TEST(JITKernel, kGRUHtPart2) { ...@@ -569,6 +612,12 @@ TEST(JITKernel, kGRUHtPart2) {
TestGRUKernel<jit::kGRUHtPart2, double, paddle::platform::CPUPlace>(); TestGRUKernel<jit::kGRUHtPart2, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kSeqPool) {
namespace jit = paddle::operators::jit;
TestSeqPoolKernel<jit::kSeqPool, float, paddle::platform::CPUPlace>();
TestSeqPoolKernel<jit::kSeqPool, double, paddle::platform::CPUPlace>();
}
TEST(JITKernel, kNCHW16CMulNC) { TEST(JITKernel, kNCHW16CMulNC) {
namespace jit = paddle::operators::jit; namespace jit = paddle::operators::jit;
TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float, TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float,
......
...@@ -51,7 +51,7 @@ math_library(pooling) ...@@ -51,7 +51,7 @@ math_library(pooling)
math_library(selected_rows_functor DEPS selected_rows math_function blas) math_library(selected_rows_functor DEPS selected_rows math_function blas)
math_library(sequence2batch) math_library(sequence2batch)
math_library(sequence_padding) math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function) math_library(sequence_pooling DEPS math_function jit_kernel_helper)
math_library(sequence_scale) math_library(sequence_scale)
math_library(softmax DEPS math_function) math_library(softmax DEPS math_function)
......
...@@ -195,6 +195,10 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> { ...@@ -195,6 +195,10 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input1, const framework::SelectedRows& input1,
framework::Tensor* input2) { framework::Tensor* input2) {
if (UNLIKELY(input1.rows().size() == 0)) {
LOG(WARNING) << "input selected rows is empty!";
return;
}
auto in1_height = input1.height(); auto in1_height = input1.height();
auto in2_dims = input2->dims(); auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/fluid/operators/math/sequence_pooling.h"
...@@ -239,15 +240,33 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> { ...@@ -239,15 +240,33 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
last_pool(context, input, output); last_pool(context, input, output);
return; return;
} }
if (pooltype == "FIRST") { if (pooltype == "FIRST") {
math::FirstSeqPoolFunctor<T> first_pool; math::FirstSeqPoolFunctor<T> first_pool;
first_pool(context, input, output); first_pool(context, input, output);
return; return;
} }
auto lod = input.lod()[0]; auto lod = input.lod()[0];
if (pooltype == "SUM") {
auto place = context.GetPlace();
PADDLE_ENFORCE(platform::is_cpu_place(place));
const T* src = input.data<T>();
T* dst = output->mutable_data<T>(place);
jit::seq_pool_attr_t attr(
static_cast<int>(input.numel() / input.dims()[0]),
jit::SeqPoolType::kSum);
auto seqpool =
jit::Get<jit::kSeqPool, jit::SeqPoolTuples<T>, platform::CPUPlace>(
attr);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
attr.h = static_cast<int>(lod[i + 1] - lod[i]);
seqpool(src, dst, &attr);
dst += attr.w;
src += attr.h * attr.w;
}
return;
}
auto& place = *context.eigen_device(); auto& place = *context.eigen_device();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
Tensor in_t = Tensor in_t =
input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1])); input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
...@@ -258,15 +277,6 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> { ...@@ -258,15 +277,6 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
auto out_e = EigenVector<T>::Flatten(out_t); auto out_e = EigenVector<T>::Flatten(out_t);
if (pooltype == "AVERAGE") { if (pooltype == "AVERAGE") {
out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}})); out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
} else if (pooltype == "SUM") {
if (h > 0) {
const T* in_data = in_t.data<T>();
T* out_data = out_t.mutable_data<T>(context.GetPlace());
blas.VCOPY(w, in_data, out_data);
for (int64_t r = 1; r != h; ++r) {
blas.AXPY(w, 1., in_data + r * w, out_data);
}
}
} else if (pooltype == "SQRT") { } else if (pooltype == "SQRT") {
out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) / out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
std::sqrt(static_cast<T>(h)); std::sqrt(static_cast<T>(h));
......
...@@ -49,6 +49,7 @@ class SoftmaxGradCUDNNFunctor { ...@@ -49,6 +49,7 @@ class SoftmaxGradCUDNNFunctor {
const framework::Tensor* Y, const framework::Tensor* y_grad, const framework::Tensor* Y, const framework::Tensor* y_grad,
framework::Tensor* x_grad); framework::Tensor* x_grad);
}; };
#endif #endif
} // namespace math } // namespace math
......
...@@ -23,5 +23,7 @@ limitations under the License. */ ...@@ -23,5 +23,7 @@ limitations under the License. */
#include "ops/binary_unnary_op.h" #include "ops/binary_unnary_op.h"
#include "ops/fill_constant_op.h" #include "ops/fill_constant_op.h"
#include "ops/mean_op.h"
#include "ops/mul_op.h" #include "ops/mul_op.h"
#include "ops/scale_op.h"
#include "ops/top_k_op.h" #include "ops/top_k_op.h"
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once #pragma once
#include <string> #include <string>
...@@ -48,4 +47,3 @@ static void BuildUnaryNode( ...@@ -48,4 +47,3 @@ static void BuildUnaryNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#endif
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
template <typename T>
std::shared_ptr<ngraph::Node> ElementwiseScalar(
float scale, std::shared_ptr<ngraph::Node> node) {
auto node_shape = node->get_shape();
auto scale_const = ngraph::op::Constant::create(node->get_element_type(),
node_shape, {scale});
return std::make_shared<T>(scale_const, node);
}
template <typename T>
std::shared_ptr<ngraph::Node> ElementwiseScalar(
std::shared_ptr<ngraph::Node> scale_1d,
std::shared_ptr<ngraph::Node> node) {
auto scale_shape = scale_1d->get_shape();
PADDLE_ENFORCE_EQ(scale_shape.size(), 1, "Supporting 1d scale node");
PADDLE_ENFORCE_EQ(scale_shape.at(0), 1, "scale 1d in in shape {1}");
auto node_shape = node->get_shape();
ngraph::AxisSet axis_set;
for (size_t i = 0; i < node_shape.size(); ++i) {
axis_set.insert(i);
}
node_shape.push_back(1);
auto scale_bcast =
std::make_shared<ngraph::op::Broadcast>(scale_1d, node_shape, axis_set);
auto scale_reshape =
paddle::platform::NgReshaper(scale_bcast, node->get_shape());
return std::make_shared<T>(scale_reshape, node);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once #pragma once
#include <string> #include <string>
...@@ -58,4 +57,3 @@ void BuildFillConstantNode( ...@@ -58,4 +57,3 @@ void BuildFillConstantNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#endif
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildMeanNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
ngraph::AxisSet axes;
for (size_t i = 0; i < input->get_shape().size(); ++i) {
axes.insert(i);
}
auto mean = ngraph::builder::mean(input, axes);
auto mean_1d = std::make_shared<ngraph::op::Reshape>(
mean, ngraph::AxisVector{}, ngraph::Shape{1});
paddle::platform::SetOutputNode(op, "Out", mean_1d, ngb_node_map);
}
void BuildMeanGradNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto og = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
auto x_shape = x->get_shape();
float x_size = std::accumulate(std::begin(x_shape), std::end(x_shape), 1,
std::multiplies<float>());
auto node_const = ngraph::op::Constant::create(og->get_element_type(),
ngraph::Shape{1}, {x_size});
auto node_div = std::make_shared<ngraph::op::Divide>(og, node_const);
auto result = ElementwiseScalar<ngraph::op::Add>(
og / node_const,
ngraph::op::Constant::create(og->get_element_type(), x_shape, {0}));
paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once #pragma once
#include <string> #include <string>
...@@ -131,4 +130,3 @@ static void BuildMulGradNode( ...@@ -131,4 +130,3 @@ static void BuildMulGradNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#endif
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildScaleNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
float scale = op_attrs.Get<float>("scale");
auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
auto out = ElementwiseScalar<ngraph::op::Multiply>(scale, x);
paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#pragma once #pragma once
#include <string> #include <string>
...@@ -48,4 +47,3 @@ void BuildTopKNode( ...@@ -48,4 +47,3 @@ void BuildTopKNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -58,12 +55,24 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad, ...@@ -58,12 +55,24 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
} // namespace } // namespace
static __device__ __forceinline__ float real_exp(float x) { return expf(x); } static __device__ __forceinline__ platform::float16 exp_on_device(
static __device__ __forceinline__ double real_exp(double x) { return exp(x); } platform::float16 x) {
static __device__ __forceinline__ float real_log(float x) { return ::Eigen::numext::exp(x);
}
static __device__ __forceinline__ float exp_on_device(float x) {
return expf(x);
}
static __device__ __forceinline__ double exp_on_device(double x) {
return exp(x);
}
static __device__ __forceinline__ platform::float16 log_on_device(
platform::float16 x) {
return math::TolerableValue<platform::float16>()(::Eigen::numext::log(x));
}
static __device__ __forceinline__ float log_on_device(float x) {
return math::TolerableValue<float>()(logf(x)); return math::TolerableValue<float>()(logf(x));
} }
static __device__ __forceinline__ double real_log(double x) { static __device__ __forceinline__ double log_on_device(double x) {
return math::TolerableValue<double>()(log(x)); return math::TolerableValue<double>()(log(x));
} }
...@@ -72,25 +81,20 @@ static __device__ __forceinline__ double real_log(double x) { ...@@ -72,25 +81,20 @@ static __device__ __forceinline__ double real_log(double x) {
/* /*
Supposing the x is `logits` and y is `labels`, the equations are as Supposing the x is `logits` and y is `labels`, the equations are as
followings: followings:
cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})] cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})]
= \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})] = \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})]
= \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})] = \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})]
= \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)] = \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)]
= \sum_{j}(-y_i_j * tmp_i_j) = \sum_{j}(-y_i_j * tmp_i_j)
softmax_i_j = e^{tmp_i_j} softmax_i_j = e^{tmp_i_j}
where: where:
max_i = \max_{j}{x_i_j} max_i = \max_{j}{x_i_j}
logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i} logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i}
tmp_i_j = x_i_j - max_i - logDiffMaxSum_i tmp_i_j = x_i_j - max_i - logDiffMaxSum_i
Therefore, the calculation can be separated into 3 steps: Therefore, the calculation can be separated into 3 steps:
Step 1: row-wise operation to calculate max_i Step 1: row-wise operation to calculate max_i
Step 2: row-wise operation to calculate logDiffMaxSum_i Step 2: row-wise operation to calculate logDiffMaxSum_i
Step 3: caculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i Step 3: caculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i
To save memory, we can share memory among max_i, logDiffMaxSum_i and To save memory, we can share memory among max_i, logDiffMaxSum_i and
cross\_entropy_i. cross\_entropy_i.
In this way, the 3 steps should be changed to: In this way, the 3 steps should be changed to:
...@@ -134,7 +138,8 @@ static __global__ void RowReductionForMax(const T* logits_data, T* max_data, ...@@ -134,7 +138,8 @@ static __global__ void RowReductionForMax(const T* logits_data, T* max_data,
cur_max = BlockReduce<T, BlockDim>(temp_storage).Reduce(cur_max, cub::Max()); cur_max = BlockReduce<T, BlockDim>(temp_storage).Reduce(cur_max, cub::Max());
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
max_data[blockIdx.x] = cur_max < -64 ? -64 : cur_max; max_data[blockIdx.x] =
cur_max < static_cast<T>(-64) ? static_cast<T>(-64) : cur_max;
} }
} }
...@@ -151,17 +156,17 @@ static __global__ void RowReductionForDiffMaxSum(const T* logits_data, ...@@ -151,17 +156,17 @@ static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
auto block_max = max_data[blockIdx.x]; auto block_max = max_data[blockIdx.x];
softmax[beg_idx] = logits_data[beg_idx] - block_max; softmax[beg_idx] = logits_data[beg_idx] - block_max;
T diff_max_sum = real_exp(softmax[beg_idx]); T diff_max_sum = exp_on_device(softmax[beg_idx]);
auto idx = beg_idx + BlockDim; auto idx = beg_idx + BlockDim;
while (idx < end_idx) { while (idx < end_idx) {
softmax[idx] = logits_data[idx] - block_max; softmax[idx] = logits_data[idx] - block_max;
diff_max_sum += real_exp(softmax[idx]); diff_max_sum += exp_on_device(softmax[idx]);
idx += BlockDim; idx += BlockDim;
} }
diff_max_sum = diff_max_sum =
BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum()); BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum); if (threadIdx.x == 0) max_data[blockIdx.x] = log_on_device(diff_max_sum);
if (!CalculateLogSoftmax) return; if (!CalculateLogSoftmax) return;
__syncthreads(); __syncthreads();
...@@ -188,12 +193,12 @@ static __global__ void RowReductionForSoftmaxAndCrossEntropy( ...@@ -188,12 +193,12 @@ static __global__ void RowReductionForSoftmaxAndCrossEntropy(
// log_diff_max_sum shares memory with loss // log_diff_max_sum shares memory with loss
auto block_log_diff_max_sum = loss_data[blockIdx.x]; auto block_log_diff_max_sum = loss_data[blockIdx.x];
auto tmp = softmax[beg_idx] - block_log_diff_max_sum; auto tmp = softmax[beg_idx] - block_log_diff_max_sum;
softmax[beg_idx] = real_exp(tmp); softmax[beg_idx] = exp_on_device(tmp);
auto loss = -labels_data[beg_idx] * tmp; auto loss = -labels_data[beg_idx] * tmp;
beg_idx += BlockDim; beg_idx += BlockDim;
while (beg_idx < end_idx) { while (beg_idx < end_idx) {
tmp = softmax[beg_idx] - block_log_diff_max_sum; tmp = softmax[beg_idx] - block_log_diff_max_sum;
softmax[beg_idx] = real_exp(tmp); softmax[beg_idx] = exp_on_device(tmp);
loss -= (labels_data[beg_idx] * tmp); loss -= (labels_data[beg_idx] * tmp);
beg_idx += BlockDim; beg_idx += BlockDim;
} }
...@@ -218,10 +223,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor { ...@@ -218,10 +223,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor {
auto row_idx = idx / feature_size_; auto row_idx = idx / feature_size_;
auto col_idx = idx % feature_size_; auto col_idx = idx % feature_size_;
if (col_idx != labels_[row_idx]) { if (col_idx != labels_[row_idx]) {
log_softmax_[idx] = real_exp(log_softmax_[idx]); log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
} else { } else {
auto softmax = log_softmax_[idx]; auto softmax = log_softmax_[idx];
log_softmax_[idx] = real_exp(softmax); log_softmax_[idx] = exp_on_device(softmax);
loss_[row_idx] = -softmax; loss_[row_idx] = -softmax;
} }
} }
...@@ -253,10 +258,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx { ...@@ -253,10 +258,10 @@ struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx {
auto row_idx = idx / feature_size_; auto row_idx = idx / feature_size_;
auto col_idx = idx % feature_size_; auto col_idx = idx % feature_size_;
if (col_idx != labels_[row_idx] || col_idx == ignore_idx_) { if (col_idx != labels_[row_idx] || col_idx == ignore_idx_) {
log_softmax_[idx] = real_exp(log_softmax_[idx]); log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
} else { } else {
auto softmax = log_softmax_[idx]; auto softmax = log_softmax_[idx];
log_softmax_[idx] = real_exp(softmax); log_softmax_[idx] = exp_on_device(softmax);
loss_[row_idx] = -softmax; loss_[row_idx] = -softmax;
} }
} }
...@@ -464,9 +469,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> { ...@@ -464,9 +469,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy, REGISTER_OP_CUDA_KERNEL(
ops::SoftmaxWithCrossEntropyCUDAKernel<float>, softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
ops::SoftmaxWithCrossEntropyCUDAKernel<double>); ops::SoftmaxWithCrossEntropyCUDAKernel<paddle::platform::float16>,
REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad, ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>, REGISTER_OP_CUDA_KERNEL(
ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>); softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<paddle::platform::float16>,
ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
...@@ -41,7 +41,9 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -41,7 +41,9 @@ class SumOp : public framework::OperatorWithKernel {
return; // skip runtime infershape when is tensor array; return; // skip runtime infershape when is tensor array;
} }
auto x_var_types = ctx->GetInputsVarType("X");
auto x_dims = ctx->GetInputsDim("X"); auto x_dims = ctx->GetInputsDim("X");
size_t N = x_dims.size(); size_t N = x_dims.size();
PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0."); PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
if (N == 1) { if (N == 1) {
...@@ -49,7 +51,13 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -49,7 +51,13 @@ class SumOp : public framework::OperatorWithKernel {
} }
framework::DDim in_dim({0}); framework::DDim in_dim({0});
for (auto& x_dim : x_dims) { for (size_t i = 0; i < x_dims.size(); ++i) {
auto& x_dim = x_dims[i];
// x_dim.size() == 1 means the real dim of selected rows is [0]
if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS &&
x_dim.size() == 1) {
continue;
}
if (framework::product(x_dim) == 0) { if (framework::product(x_dim) == 0) {
continue; continue;
} }
......
...@@ -15,6 +15,9 @@ ...@@ -15,6 +15,9 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <algorithm> #include <algorithm>
#include <iostream> #include <iostream>
#ifdef _WIN32
#include <numeric>
#endif
#include <random> #include <random>
#define PADDLE_CUDA_FP16 #define PADDLE_CUDA_FP16
......
...@@ -59,7 +59,7 @@ limitations under the License. */ ...@@ -59,7 +59,7 @@ limitations under the License. */
#if !defined(_WIN32) #if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x))) #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else #else
#define PADDLE_ALIGN(x) /*do nothing*/ #define PADDLE_ALIGN(x) __declspec(align(x))
#endif #endif
namespace paddle { namespace paddle {
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册