Commit 3d769b97, authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into all_new_design_exec

@@ -284,6 +284,27 @@ if(WITH_GPU)
   endif()
 endif()

+if(WITH_ROCM)
+  include(hip)
+  include(miopen) # set miopen libraries, must before configure
+endif(WITH_ROCM)
+
+if (NOT WITH_ROCM AND WITH_RCCL)
+  MESSAGE(WARNING
+      "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
+  set(WITH_RCCL OFF CACHE STRING
+      "Disable RCCL when compiling without ROCM" FORCE)
+endif()
+
+if(WITH_RCCL)
+  add_definitions("-DPADDLE_WITH_RCCL")
+  include(rccl)
+else()
+  if(WITH_ROCM)
+    MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.")
+  endif()
+endif()
+
 include(third_party)  # download, build, install third_party, Contains about 20+ dependencies
 include(flags)        # set paddle compile flags
@@ -308,26 +329,6 @@ include(configure)  # add paddle env configuration

 include_directories("${PADDLE_SOURCE_DIR}")

-if(WITH_ROCM)
-  include(hip)
-endif(WITH_ROCM)
-
-if (NOT WITH_ROCM AND WITH_RCCL)
-  MESSAGE(WARNING
-      "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
-  set(WITH_RCCL OFF CACHE STRING
-      "Disable RCCL when compiling without ROCM" FORCE)
-endif()
-
-if(WITH_RCCL)
-  add_definitions("-DPADDLE_WITH_RCCL")
-  include(rccl)
-else()
-  if(WITH_ROCM)
-    MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.")
-  endif()
-endif()
-
 if(WITH_NV_JETSON)
   set(WITH_ARM ON CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE)
 endif()
......
@@ -143,6 +143,14 @@ elseif(WITH_ROCM)
   add_definitions(-DPADDLE_WITH_HIP)
   add_definitions(-DEIGEN_USE_GPU)
   add_definitions(-DEIGEN_USE_HIP)
+
+  if(NOT MIOPEN_FOUND)
+    message(FATAL_ERROR "Paddle needs MIOpen to compile")
+  endif()
+
+  if(${MIOPEN_VERSION} VERSION_LESS 2090)
+    message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile")
+  endif()
 else()
   add_definitions(-DHPPL_STUB_FUNC)
   list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
......
@@ -95,11 +95,23 @@ function(select_nvcc_arch_flags out_variable)
   if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
     set(cuda_arch_bin "30 35")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
-    set(cuda_arch_bin "50")
+    if (WITH_NV_JETSON)
+      set(cuda_arch_bin "53")
+    else()
+      set(cuda_arch_bin "50")
+    endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
-    set(cuda_arch_bin "60 61")
+    if (WITH_NV_JETSON)
+      set(cuda_arch_bin "62")
+    else()
+      set(cuda_arch_bin "60 61")
+    endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
-    set(cuda_arch_bin "70")
+    if (WITH_NV_JETSON)
+      set(cuda_arch_bin "72")
+    else()
+      set(cuda_arch_bin "70")
+    endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
     set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
......
@@ -46,6 +46,7 @@ ExternalProject_Add(
   ${BOOST_PROJECT}
   ${EXTERNAL_PROJECT_LOG_ARGS}
   "${BOOST_DOWNLOAD_CMD}"
+  URL_MD5 f891e8c2c9424f0565f0129ad9ab4aff
   PREFIX ${BOOST_PREFIX_DIR}
   DOWNLOAD_DIR ${BOOST_SOURCE_DIR}
   SOURCE_DIR ${BOOST_SOURCE_DIR}
......
@@ -110,7 +110,7 @@ if(WIN32)
   add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM
     COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def)
   add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM
-    COMMAND for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def)
+    COMMAND echo off && (for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) && echo on)
   add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM
     COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib /machine:x64)
 else(WIN32)
......
@@ -24,6 +24,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
 IF(WIN32)
   SET(MKLML_VER "mklml_win_2019.0.5.20190502" CACHE STRING "" FORCE)
   SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+  SET(MKLML_URL_MD5 ff8c5237570f03eea37377ccfc95a08a)
   SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
   SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
   SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
@@ -33,6 +34,7 @@ ELSE()
   # Now enable csrmm function in mklml library temporarily; it will be updated to the official version later.
   SET(MKLML_VER "csrmm_mklml_lnx_2019.0.5" CACHE STRING "" FORCE)
   SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+  SET(MKLML_URL_MD5 bc6a7faea6a2a9ad31752386f3ae87da)
   SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
   SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
   SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
@@ -52,6 +54,7 @@ ExternalProject_Add(
   ${MKLML_PROJECT}
   ${EXTERNAL_PROJECT_LOG_ARGS}
   "${MKLML_DOWNLOAD_CMD}"
+  URL_MD5 ${MKLML_URL_MD5}
   PREFIX ${MKLML_PREFIX_DIR}
   DOWNLOAD_DIR ${MKLML_SOURCE_DIR}
   SOURCE_DIR ${MKLML_SOURCE_DIR}
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
SET(ROCKSDB_SOURCES_DIR ${THIRD_PARTY_PATH}/rocksdb)
SET(ROCKSDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocksdb)
SET(ROCKSDB_INCLUDE_DIR "${ROCKSDB_INSTALL_DIR}/include" CACHE PATH "rocksdb include directory." FORCE)
SET(ROCKSDB_LIBRARIES "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a" CACHE FILEPATH "rocksdb library." FORCE)
SET(ROCKSDB_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
INCLUDE_DIRECTORIES(${ROCKSDB_INCLUDE_DIR})
ExternalProject_Add(
extern_rocksdb
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${ROCKSDB_SOURCES_DIR}
GIT_REPOSITORY "https://github.com/facebook/rocksdb"
GIT_TAG v6.10.1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DWITH_BZ2=OFF
-DWITH_GFLAGS=OFF
-DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
# BUILD_BYPRODUCTS ${ROCKSDB_SOURCES_DIR}/src/extern_rocksdb/librocksdb.a
INSTALL_COMMAND mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/
&& cp ${ROCKSDB_SOURCES_DIR}/src/extern_rocksdb/librocksdb.a ${ROCKSDB_LIBRARIES}
&& cp -r ${ROCKSDB_SOURCES_DIR}/src/extern_rocksdb/include ${ROCKSDB_INSTALL_DIR}/
BUILD_IN_SOURCE 1
)
ADD_DEPENDENCIES(extern_rocksdb snappy)
ADD_LIBRARY(rocksdb STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET rocksdb PROPERTY IMPORTED_LOCATION ${ROCKSDB_LIBRARIES})
ADD_DEPENDENCIES(rocksdb extern_rocksdb)
LIST(APPEND external_project_dependencies rocksdb)
@@ -24,7 +24,7 @@ SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 # in case of low internet speed
 #set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
 set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1)
+set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)

 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
......
@@ -146,12 +146,12 @@ copy(inference_lib_dist
         SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
         DSTS ${dst_dir})

-# Only GPU need cudaErrorMessage.pb
+# GPU must copy externalErrorMsg.pb
 IF(WITH_GPU)
-    set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
+    set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/externalError/data")
     copy(inference_lib_dist
-            SRCS ${cudaerror_INCLUDE_DIR}
+            SRCS ${externalError_INCLUDE_DIR}
             DSTS ${dst_dir})
 ENDIF()

 # CMakeCache Info
@@ -193,10 +193,7 @@ copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/*
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
 copy(inference_lib_dist
-        SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex64.h
-        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
-copy(inference_lib_dist
-        SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex.h
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
 copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h
@@ -259,7 +256,7 @@ copy(fluid_lib_dist
 set(module "platform")
 set(platform_lib_deps profiler_proto error_codes_proto)
 if(WITH_GPU)
-    set(platform_lib_deps ${platform_lib_deps} cuda_error_proto)
+    set(platform_lib_deps ${platform_lib_deps} external_error_proto)
 endif(WITH_GPU)
 add_dependencies(fluid_lib_dist ${platform_lib_deps})
@@ -323,12 +320,18 @@ function(version version_file)
       "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
       "WITH_MKL: ${WITH_MKL}\n"
       "WITH_MKLDNN: ${WITH_MKLDNN}\n"
-      "WITH_GPU: ${WITH_GPU}\n")
+      "WITH_GPU: ${WITH_GPU}\n"
+      "WITH_ROCM: ${WITH_ROCM}\n")
   if(WITH_GPU)
     file(APPEND ${version_file}
          "CUDA version: ${CUDA_VERSION}\n"
          "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n")
   endif()
+  if(WITH_ROCM)
+    file(APPEND ${version_file}
+         "HIP version: ${HIP_VERSION}\n"
+         "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n")
+  endif()
   file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n")
   if(TENSORRT_FOUND)
     file(APPEND ${version_file}
......
if(NOT WITH_ROCM)
return()
endif()
# ROCm is not supported on Windows for now
if(WIN32)
return()
endif()
set(MIOPEN_ROOT ${ROCM_PATH}/miopen CACHE PATH "MIOPEN ROOT")
find_path(MIOPEN_INCLUDE_DIR "miopen/miopen.h"
PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/include ${MIOPEN_ROOT}/local/include
$ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/include $ENV{MIOPEN_ROOT}/local/include
NO_DEFAULT_PATH
)
get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
find_library(MIOPEN_LIBRARY NAMES "libMIOpen.so"
PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/lib ${MIOPEN_ROOT}/lib64 ${__libpath_hist}
$ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/lib $ENV{MIOPEN_ROOT}/lib64
NO_DEFAULT_PATH
DOC "Path to MIOpen library.")
if(MIOPEN_INCLUDE_DIR AND MIOPEN_LIBRARY)
set(MIOPEN_FOUND ON)
else()
set(MIOPEN_FOUND OFF)
endif()
macro(find_miopen_version miopen_header_file)
file(READ ${miopen_header_file} MIOPEN_VERSION_FILE_CONTENTS)
get_filename_component(MIOPEN_LIB_PATH ${MIOPEN_LIBRARY} DIRECTORY)
string(REGEX MATCH "define MIOPEN_VERSION_MAJOR +([0-9]+)" MIOPEN_MAJOR_VERSION
"${MIOPEN_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR +([0-9]+)" "\\1"
MIOPEN_MAJOR_VERSION "${MIOPEN_MAJOR_VERSION}")
string(REGEX MATCH "define MIOPEN_VERSION_MINOR +([0-9]+)" MIOPEN_MINOR_VERSION
"${MIOPEN_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_MINOR +([0-9]+)" "\\1"
MIOPEN_MINOR_VERSION "${MIOPEN_MINOR_VERSION}")
string(REGEX MATCH "define MIOPEN_VERSION_PATCH +([0-9]+)" MIOPEN_PATCH_VERSION
"${MIOPEN_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_PATCH +([0-9]+)" "\\1"
MIOPEN_PATCH_VERSION "${MIOPEN_PATCH_VERSION}")
string(REGEX MATCH "define MIOPEN_VERSION_TWEAK +([0-9]+)" MIOPEN_TWEAK_VERSION
"${MIOPEN_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_TWEAK +([0-9]+)" "\\1"
MIOPEN_TWEAK_VERSION "${MIOPEN_TWEAK_VERSION}")
if(NOT MIOPEN_MAJOR_VERSION)
set(MIOPEN_VERSION "???")
else()
add_definitions("-DMIOPEN_MAJOR_VERSION=\"${MIOPEN_MAJOR_VERSION}\"")
math(EXPR MIOPEN_VERSION
"${MIOPEN_MAJOR_VERSION} * 1000 +
${MIOPEN_MINOR_VERSION} * 10 + ${MIOPEN_PATCH_VERSION}")
message(STATUS "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h "
"Current MIOpen version is v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. ")
endif()
endmacro()
if(MIOPEN_FOUND)
find_miopen_version(${MIOPEN_INCLUDE_DIR}/miopen/version.h)
endif()
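
Taken together, find_miopen_version() packs major/minor/patch into one integer, which is what the VERSION_LESS 2090 guard in the configure step compares against. A standalone C++ sketch of the same encoding (the macro names mirror miopen/version.h):

    #include <cstdio>

    // Same packing as find_miopen_version() above:
    // version = major * 1000 + minor * 10 + patch, so MIOpen 2.9.0 -> 2090.
    constexpr int PackMiopenVersion(int major, int minor, int patch) {
      return major * 1000 + minor * 10 + patch;
    }

    int main() {
      // 2090 is exactly the minimum enforced by "VERSION_LESS 2090" above.
      std::printf("MIOpen 2.9.0 -> %d\n", PackMiopenVersion(2, 9, 0));
      return 0;
    }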
@@ -108,13 +108,19 @@ ENDMACRO()
 # 2. NAME: The name of the file, which determines the dirname
 #
 FUNCTION(file_download_and_uncompress URL NAME)
-  MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}")
+  set(options "")
+  set(oneValueArgs MD5)
+  set(multiValueArgs "")
+  cmake_parse_arguments(URL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}, MD5: ${URL_MD5}")
   SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data PARENT_SCOPE)
   ExternalProject_Add(
-      extern_download_${NAME}
+      download_${NAME}
       ${EXTERNAL_PROJECT_LOG_ARGS}
       PREFIX                ${THIRD_PARTY_PATH}/${NAME}
       URL                   ${URL}
+      URL_MD5               ${URL_MD5}
+      TIMEOUT               120
       DOWNLOAD_DIR          ${THIRD_PARTY_PATH}/${NAME}/data/
       SOURCE_DIR            ${THIRD_PARTY_PATH}/${NAME}/data/
       DOWNLOAD_NO_PROGRESS  1

@@ -123,7 +129,7 @@ FUNCTION(file_download_and_uncompress URL NAME)
       UPDATE_COMMAND        ""
       INSTALL_COMMAND       ""
   )
-  set(third_party_deps ${third_party_deps} extern_download_${NAME} PARENT_SCOPE)
+  set(third_party_deps ${third_party_deps} download_${NAME} PARENT_SCOPE)
 ENDFUNCTION()
@@ -242,8 +248,20 @@ if(WITH_GPU)
     include(external/cub)  # download cub
     list(APPEND third_party_deps extern_cub)
   endif()
-  set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
-  file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
+  set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE)
+  file_download_and_uncompress(${URL} "externalError" MD5 c0749523ebb536eb7382487d645d9cd4) # download file externalErrorMsg.tar.gz
+  if(WITH_TESTING)
+    # copy externalErrorMsg.pb for unittest 'enforce_test'
+    set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data)
+    if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja"))
+      set(DST_DIR ${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data)
+    else()
+      set(DST_DIR ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data)
+    endif()
+    add_custom_command(TARGET download_externalError POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR}
+      COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR}")
+  endif()
 endif(WITH_GPU)

 if(WITH_XPU)
@@ -304,6 +322,11 @@ if (WITH_PSCORE)
     include(external/libmct)  # download, build, install libmct
     list(APPEND third_party_deps extern_libmct)

+    if (WITH_HETERPS)
+        include(external/rocksdb)  # download, build, install rocksdb
+        list(APPEND third_party_deps extern_rocksdb)
+    endif()
+
 endif()

 if(WITH_XBYAK)
......
@@ -417,8 +417,10 @@ void FleetWrapper::PushSparseFromTensorWithLabelAsync(
   return;
 }

-void FleetWrapper::LoadModel(const std::string& path, const int mode) {
-  auto ret = pserver_ptr_->_worker_ptr->load(path, std::to_string(mode));
+void FleetWrapper::LoadModel(const std::string& path, const std::string& mode) {
+  auto* communicator = Communicator::GetInstance();
+  auto ret = communicator->_worker_ptr->load(path, mode);
+  // auto ret = pserver_ptr_->_worker_ptr->load(path, std::to_string(mode));
   ret.wait();
   if (ret.get() != 0) {
     LOG(ERROR) << "load model from path:" << path << " failed";

@@ -429,8 +431,11 @@ void FleetWrapper::LoadModel(const std::string& path, const int mode) {
 void FleetWrapper::LoadModelOneTable(const uint64_t table_id,
                                      const std::string& path, const int mode) {
+  auto* communicator = Communicator::GetInstance();
   auto ret =
-      pserver_ptr_->_worker_ptr->load(table_id, path, std::to_string(mode));
+      communicator->_worker_ptr->load(table_id, path, std::to_string(mode));
+  // auto ret =
+  //     pserver_ptr_->_worker_ptr->load(table_id, path, std::to_string(mode));
   ret.wait();
   if (ret.get() != 0) {
     LOG(ERROR) << "load model of table id: " << table_id
......
@@ -200,7 +200,7 @@ class FleetWrapper {
   void PrintTableStat(const uint64_t table_id);
   // mode = 0, load all feature
   // mode = 1, load delta feature, which means load diff
-  void LoadModel(const std::string& path, const int mode);
+  void LoadModel(const std::string& path, const std::string& mode);
   // mode = 0, load all feature
   // mode = 1, load delta feature, which means load diff
   void LoadModelOneTable(const uint64_t table_id, const std::string& path,
......
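
With this change LoadModel takes the mode as a string instead of an int (per the comments above: 0 loads all features, 1 loads the delta). A hypothetical call site, assuming a configured FleetWrapper instance:

    #include <string>

    // Hypothetical call site for the updated API; `fleet` is assumed to be a
    // configured paddle::distributed::FleetWrapper*.
    void LoadCheckpoint(paddle::distributed::FleetWrapper* fleet,
                        const std::string& dirname) {
      fleet->LoadModel(dirname, "0");           // mode is now passed as a string
      fleet->LoadModelOneTable(/*table_id=*/0, dirname,
                               /*mode=*/0);     // per-table load still takes an int
    }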
@@ -42,17 +42,17 @@ int32_t PsLocalClient::initialize() {
 ::std::future<int32_t> PsLocalClient::load(const std::string& epoch,
                                            const std::string& mode) {
   // TODO
-  // for (auto& it : _table_map) {
-  //   load(it.first, epoch, mode);
-  //}
+  for (auto& it : _table_map) {
+    load(it.first, epoch, mode);
+  }
   return done();
 }

 ::std::future<int32_t> PsLocalClient::load(uint32_t table_id,
                                            const std::string& epoch,
                                            const std::string& mode) {
   // TODO
-  // auto* table_ptr = table(table_id);
-  // table_ptr->load(epoch, mode);
+  auto* table_ptr = table(table_id);
+  table_ptr->load(epoch, mode);
   return done();
 }
@@ -245,7 +245,6 @@ int32_t PsLocalClient::initialize() {
 ::std::future<int32_t> PsLocalClient::push_sparse_raw_gradient(
     size_t table_id, const uint64_t* keys, const float** update_values,
     size_t num, void* callback) {
-  VLOG(1) << "wxx push_sparse_raw_gradient";
   PSClientClosure* closure = reinterpret_cast<PSClientClosure*>(callback);
   auto* accessor = table_accessor(table_id);
   auto* table_ptr = table(table_id);
......
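
With the TODO bodies filled in, a local-mode load is no longer a no-op: the table-wise overload loads one table, and the epoch-wise overload now walks _table_map and loads every table before the done() future resolves. A hypothetical caller:

    // Hypothetical caller; `client` is assumed to be an initialized
    // paddle::distributed::PsLocalClient.
    void LoadAllTables(paddle::distributed::PsLocalClient* client,
                       const std::string& epoch_dir) {
      auto fut = client->load(epoch_dir, /*mode=*/"0");  // loads every table now
      fut.wait();                                        // done() future
    }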
@@ -26,9 +26,14 @@ class PsLocalServer : public PSServer {
   PsLocalServer() {}
   virtual ~PsLocalServer() {}
   virtual uint64_t start() { return 0; }
-  virtual uint64_t start(const std::string& ip, uint32_t port) { return 0; }
+  virtual uint64_t start(const std::string &ip, uint32_t port) { return 0; }
   virtual int32_t stop() { return 0; }
   virtual int32_t port() { return 0; }
+  virtual int32_t configure(
+      const PSParameter &config, PSEnvironment &env, size_t server_rank,
+      const std::vector<framework::ProgramDesc> &server_sub_program = {}) {
+    return 0;
+  }

  private:
   virtual int32_t initialize() { return 0; }
......
@@ -70,7 +70,7 @@ class PSServer {
   virtual int32_t configure(
       const PSParameter &config, PSEnvironment &env, size_t server_rank,
-      const std::vector<framework::ProgramDesc> &server_sub_program = {}) final;
+      const std::vector<framework::ProgramDesc> &server_sub_program = {});

   // return server_ip
   virtual std::string ip() { return butil::my_ip_cstr(); }
......
@@ -9,15 +9,24 @@ set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler)
 set_source_files_properties(common_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(common_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(ssd_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

 get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)

-cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc
-        sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS}
-        ${RPC_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator)
+set(EXTERN_DEP "")
+if(WITH_HETERPS)
+    set(TABLE_SRC common_sparse_table.cc ssd_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc)
+    set(EXTERN_DEP rocksdb)
+else()
+    set(TABLE_SRC common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc)
+endif()
+
+cc_library(common_table SRCS ${TABLE_SRC} DEPS ${TABLE_DEPS}
+        ${RPC_DEPS} graph_edge graph_node device_context string_helper
+        simple_threadpool xxhash generator ${EXTERN_DEP})

 set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
......
@@ -25,83 +25,12 @@ class ValueBlock;
 }  // namespace distributed
 }  // namespace paddle

-#define PSERVER_SAVE_SUFFIX ".shard"
-using boost::lexical_cast;
-
 namespace paddle {
 namespace distributed {

-enum SaveMode { all, base, delta };
-
-struct Meta {
-  std::string param;
-  int shard_id;
-  std::vector<std::string> names;
-  std::vector<int> dims;
-  uint64_t count;
-  std::unordered_map<std::string, int> dims_map;
-
-  explicit Meta(const std::string& metapath) {
-    std::ifstream file(metapath);
-    std::string line;
-    int num_lines = 0;
-    while (std::getline(file, line)) {
-      if (StartWith(line, "#")) {
-        continue;
-      }
-      auto pairs = paddle::string::split_string<std::string>(line, "=");
-      PADDLE_ENFORCE_EQ(
-          pairs.size(), 2,
-          paddle::platform::errors::InvalidArgument(
-              "info in %s except k=v, but got %s", metapath, line));
-      if (pairs[0] == "param") {
-        param = pairs[1];
-      }
-      if (pairs[0] == "shard_id") {
-        shard_id = std::stoi(pairs[1]);
-      }
-      if (pairs[0] == "row_names") {
-        names = paddle::string::split_string<std::string>(pairs[1], ",");
-      }
-      if (pairs[0] == "row_dims") {
-        auto dims_strs =
-            paddle::string::split_string<std::string>(pairs[1], ",");
-        for (auto& str : dims_strs) {
-          dims.push_back(std::stoi(str));
-        }
-      }
-      if (pairs[0] == "count") {
-        count = std::stoull(pairs[1]);
-      }
-    }
-    for (int x = 0; x < names.size(); ++x) {
-      dims_map[names[x]] = dims[x];
-    }
-  }
-
-  Meta(std::string param, int shard_id, std::vector<std::string> row_names,
-       std::vector<int> dims, uint64_t count) {
-    this->param = param;
-    this->shard_id = shard_id;
-    this->names = row_names;
-    this->dims = dims;
-    this->count = count;
-  }
-
-  std::string ToString() {
-    std::stringstream ss;
-    ss << "param=" << param << "\n";
-    ss << "shard_id=" << shard_id << "\n";
-    ss << "row_names=" << paddle::string::join_strings(names, ',') << "\n";
-    ss << "row_dims=" << paddle::string::join_strings(dims, ',') << "\n";
-    ss << "count=" << count << "\n";
-    return ss.str();
-  }
-};
-
-void ProcessALine(const std::vector<std::string>& columns, const Meta& meta,
-                  const int64_t id, std::vector<std::vector<float>>* values) {
+void CommonSparseTable::ProcessALine(const std::vector<std::string>& columns,
+                                     const Meta& meta, const int64_t id,
+                                     std::vector<std::vector<float>>* values) {
   auto colunmn_size = columns.size();
   auto load_values =
       paddle::string::split_string<std::string>(columns[colunmn_size - 1], ",");

@@ -134,8 +63,10 @@ void ProcessALine(const std::vector<std::string>& columns, const Meta& meta,
   }
 }

-void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common,
-                    const size_t shard_idx, const int64_t total) {
+void CommonSparseTable::SaveMetaToText(std::ostream* os,
+                                       const CommonAccessorParameter& common,
+                                       const size_t shard_idx,
+                                       const int64_t total) {
   // save meta
   std::stringstream stream;
   stream << "param=" << common.table_name() << "\n";

@@ -148,8 +79,10 @@ void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common,
   os->write(stream.str().c_str(), sizeof(char) * stream.str().size());
 }

-int64_t SaveValueToText(std::ostream* os, std::shared_ptr<ValueBlock> block,
-                        std::shared_ptr<::ThreadPool> pool, const int mode) {
+int64_t CommonSparseTable::SaveValueToText(std::ostream* os,
+                                           std::shared_ptr<ValueBlock> block,
+                                           std::shared_ptr<::ThreadPool> pool,
+                                           const int mode, int shard_id) {
   int64_t save_num = 0;
   for (auto& table : block->values_) {
     for (auto& value : table) {

@@ -186,10 +119,10 @@ int64_t SaveValueToText(std::ostream* os, std::shared_ptr<ValueBlock> block,
   return save_num;
 }

-int64_t LoadFromText(const std::string& valuepath, const std::string& metapath,
-                     const int pserver_id, const int pserver_num,
-                     const int local_shard_num,
-                     std::vector<std::shared_ptr<ValueBlock>>* blocks) {
+int64_t CommonSparseTable::LoadFromText(
+    const std::string& valuepath, const std::string& metapath,
+    const int pserver_id, const int pserver_num, const int local_shard_num,
+    std::vector<std::shared_ptr<ValueBlock>>* blocks) {
   Meta meta = Meta(metapath);

   int num_lines = 0;

@@ -198,7 +131,7 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath,
   while (std::getline(file, line)) {
     auto values = paddle::string::split_string<std::string>(line, "\t");
-    auto id = lexical_cast<int64_t>(values[0]);
+    auto id = lexical_cast<uint64_t>(values[0]);

     if (id % pserver_num != pserver_id) {
       VLOG(3) << "will not load " << values[0] << " from " << valuepath

@@ -388,8 +321,9 @@ int32_t CommonSparseTable::save(const std::string& dirname,
   int64_t total_ins = 0;
   for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) {
     // save values
-    auto shard_save_num = SaveValueToText(vs.get(), shard_values_[shard_id],
-                                          _shards_task_pool[shard_id], mode);
+    auto shard_save_num =
+        SaveValueToText(vs.get(), shard_values_[shard_id],
+                        _shards_task_pool[shard_id], mode, shard_id);
     total_ins += shard_save_num;
   }
   vs->close();
......
@@ -32,11 +32,83 @@
 #include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/string/string_helper.h"

+#define PSERVER_SAVE_SUFFIX ".shard"
+using boost::lexical_cast;
+
 namespace paddle {
 namespace distributed {

 class SparseOptimizer;

+enum SaveMode { all, base, delta };
+
+struct Meta {
+  std::string param;
+  int shard_id;
+  std::vector<std::string> names;
+  std::vector<int> dims;
+  uint64_t count;
+  std::unordered_map<std::string, int> dims_map;
+
+  explicit Meta(const std::string& metapath) {
+    std::ifstream file(metapath);
+    std::string line;
+    int num_lines = 0;
+    while (std::getline(file, line)) {
+      if (StartWith(line, "#")) {
+        continue;
+      }
+      auto pairs = paddle::string::split_string<std::string>(line, "=");
+      PADDLE_ENFORCE_EQ(
+          pairs.size(), 2,
+          paddle::platform::errors::InvalidArgument(
+              "info in %s except k=v, but got %s", metapath, line));
+      if (pairs[0] == "param") {
+        param = pairs[1];
+      }
+      if (pairs[0] == "shard_id") {
+        shard_id = std::stoi(pairs[1]);
+      }
+      if (pairs[0] == "row_names") {
+        names = paddle::string::split_string<std::string>(pairs[1], ",");
+      }
+      if (pairs[0] == "row_dims") {
+        auto dims_strs =
+            paddle::string::split_string<std::string>(pairs[1], ",");
+        for (auto& str : dims_strs) {
+          dims.push_back(std::stoi(str));
+        }
+      }
+      if (pairs[0] == "count") {
+        count = std::stoull(pairs[1]);
+      }
+    }
+    for (int x = 0; x < names.size(); ++x) {
+      dims_map[names[x]] = dims[x];
+    }
+  }
+
+  Meta(std::string param, int shard_id, std::vector<std::string> row_names,
+       std::vector<int> dims, uint64_t count) {
+    this->param = param;
+    this->shard_id = shard_id;
+    this->names = row_names;
+    this->dims = dims;
+    this->count = count;
+  }
+
+  std::string ToString() {
+    std::stringstream ss;
+    ss << "param=" << param << "\n";
+    ss << "shard_id=" << shard_id << "\n";
+    ss << "row_names=" << paddle::string::join_strings(names, ',') << "\n";
+    ss << "row_dims=" << paddle::string::join_strings(dims, ',') << "\n";
+    ss << "count=" << count << "\n";
+    return ss.str();
+  }
+};
+
 class CommonSparseTable : public SparseTable {
  public:
   CommonSparseTable() { rwlock_.reset(new framework::RWLock); }
@@ -56,9 +128,25 @@ class CommonSparseTable : public SparseTable {
   virtual int32_t initialize_optimizer();
   virtual int32_t initialize_recorder();

-  int32_t load(const std::string& path, const std::string& param);
-
-  int32_t save(const std::string& path, const std::string& param);
+  virtual int32_t load(const std::string& path, const std::string& param);
+
+  virtual int32_t save(const std::string& path, const std::string& param);
+
+  void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common,
+                      const size_t shard_idx, const int64_t total);
+
+  int64_t SaveValueToText(std::ostream* os, std::shared_ptr<ValueBlock> block,
+                          std::shared_ptr<::ThreadPool> pool, const int mode,
+                          int shard_id);
+
+  virtual void ProcessALine(const std::vector<std::string>& columns,
+                            const Meta& meta, const int64_t id,
+                            std::vector<std::vector<float>>* values);
+
+  virtual int64_t LoadFromText(
+      const std::string& valuepath, const std::string& metapath,
+      const int pserver_id, const int pserver_num, const int local_shard_num,
+      std::vector<std::shared_ptr<ValueBlock>>* blocks);

   virtual std::pair<int64_t, int64_t> print_table_stat();
   virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value);

@@ -89,7 +177,7 @@ class CommonSparseTable : public SparseTable {
   virtual int32_t _push_sparse(const uint64_t* keys, const float** values,
                                size_t num);

- private:
+ protected:
   const int task_pool_size_ = 11;
   std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool;
......
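
Since Meta now lives in the header, other tables (the SSD table below) can reuse it. A small sketch of the round trip between ToString() and the parsing constructor, with illustrative row names and dims:

    #include <iostream>

    #include "paddle/fluid/distributed/table/common_sparse_table.h"

    int main() {
      // Illustrative values; real row names/dims come from the table config.
      paddle::distributed::Meta meta("embedding", /*shard_id=*/0,
                                     {"Param", "Moment1", "Moment2"},
                                     {8, 8, 8}, /*count=*/1024);
      // Emits the key=value text that Meta(metapath) parses back in:
      //   param=embedding
      //   shard_id=0
      //   row_names=Param,Moment1,Moment2
      //   row_dims=8,8,8
      //   count=1024
      std::cout << meta.ToString();
      return 0;
    }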
@@ -83,6 +83,7 @@ inline bool probility_entry(VALUE *value, float threshold) {
 class ValueBlock {
  public:
+  typedef typename robin_hood::unordered_map<uint64_t, VALUE *> map_type;
   explicit ValueBlock(const std::vector<std::string> &value_names,
                       const std::vector<int> &value_dims,
                       const std::vector<int> &value_offsets,

@@ -261,6 +262,18 @@ class ValueBlock {
     value->is_entry_ = state;
   }

+  void erase(uint64_t feasign) {
+    size_t hash = _hasher(feasign);
+    size_t bucket = compute_bucket(hash);
+    auto &table = values_[bucket];
+
+    auto iter = table.find(feasign);
+    if (iter != table.end()) {
+      butil::return_object(iter->second);
+      iter = table.erase(iter);
+    }
+  }
+
   void Shrink(const int threshold) {
     for (auto &table : values_) {
       for (auto iter = table.begin(); iter != table.end();) {

@@ -289,6 +302,23 @@ class ValueBlock {
     }
   }

+  map_type::iterator end() {
+    return values_[SPARSE_SHARD_BUCKET_NUM - 1].end();
+  }
+
+  map_type::iterator Find(uint64_t id) {
+    size_t hash = _hasher(id);
+    size_t bucket = compute_bucket(hash);
+    auto &table = values_[bucket];
+
+    auto got = table.find(id);
+    if (got == table.end()) {
+      return end();
+    } else {
+      return got;
+    }
+  }
+
  private:
   bool Has(const uint64_t id) {
     size_t hash = _hasher(id);

@@ -304,7 +334,7 @@ class ValueBlock {
   }

  public:
-  robin_hood::unordered_map<uint64_t, VALUE *> values_[SPARSE_SHARD_BUCKET_NUM];
+  map_type values_[SPARSE_SHARD_BUCKET_NUM];
   size_t value_length_ = 0;
   std::hash<uint64_t> _hasher;
......
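
The new Find/end/erase members give callers bucket-transparent lookups over the SPARSE_SHARD_BUCKET_NUM shard maps; a miss in any bucket is normalized to the last bucket's end(). A hypothetical caller, sketched against the interface above:

    // Hypothetical helper; `block` is assumed to be a
    // paddle::distributed::ValueBlock* as declared above.
    void EvictIfStale(paddle::distributed::ValueBlock* block, uint64_t feasign) {
      auto iter = block->Find(feasign);
      if (iter == block->end()) {
        return;  // not resident in memory
      }
      if (iter->second->unseen_days_ >= 1) {
        block->erase(feasign);  // returns the VALUE to butil's object pool
      }
    }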
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_HETERPS
#include <glog/logging.h>
#include <rocksdb/db.h>
#include <rocksdb/filter_policy.h>
#include <rocksdb/options.h>
#include <rocksdb/slice.h>
#include <rocksdb/table.h>
#include <rocksdb/write_batch.h>
#include <iostream>
#include <string>
namespace paddle {
namespace distributed {
class RocksDBHandler {
public:
RocksDBHandler() {}
~RocksDBHandler() {}
static RocksDBHandler* GetInstance() {
static RocksDBHandler handler;
return &handler;
}
int initialize(const std::string& db_path, const int colnum) {
VLOG(3) << "db path: " << db_path << " colnum: " << colnum;
rocksdb::Options options;
rocksdb::BlockBasedTableOptions bbto;
bbto.block_size = 4 * 1024;
bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024);
bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024);
bbto.cache_index_and_filter_blocks = false;
bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(20, false));
bbto.whole_key_filtering = true;
options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
options.keep_log_file_num = 100;
options.max_log_file_size = 50 * 1024 * 1024; // 50MB
options.create_if_missing = true;
options.use_direct_reads = true;
options.max_background_flushes = 5;
options.max_background_compactions = 5;
options.base_background_compactions = 10;
options.write_buffer_size = 256 * 1024 * 1024; // 256MB
options.max_write_buffer_number = 8;
options.max_bytes_for_level_base =
options.max_write_buffer_number * options.write_buffer_size;
options.min_write_buffer_number_to_merge = 1;
options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB
options.memtable_prefix_bloom_size_ratio = 0.02;
options.num_levels = 4;
options.max_open_files = -1;
options.compression = rocksdb::kNoCompression;
options.level0_file_num_compaction_trigger = 8;
options.level0_slowdown_writes_trigger =
1.8 * options.level0_file_num_compaction_trigger;
options.level0_stop_writes_trigger =
3.6 * options.level0_file_num_compaction_trigger;
if (!db_path.empty()) {
std::string rm_cmd = "rm -rf " + db_path;
system(rm_cmd.c_str());
}
rocksdb::Status s = rocksdb::DB::Open(options, db_path, &_db);
assert(s.ok());
_handles.resize(colnum);
for (int i = 0; i < colnum; i++) {
s = _db->CreateColumnFamily(options, "shard_" + std::to_string(i),
&_handles[i]);
assert(s.ok());
}
LOG(INFO) << "DB initialize success, colnum:" << colnum;
return 0;
}
int put(int id, const char* key, int key_len, const char* value,
int value_len) {
rocksdb::WriteOptions options;
options.disableWAL = true;
rocksdb::Status s =
_db->Put(options, _handles[id], rocksdb::Slice(key, key_len),
rocksdb::Slice(value, value_len));
assert(s.ok());
return 0;
}
int put_batch(int id, std::vector<std::pair<char*, int>>& ssd_keys,
std::vector<std::pair<char*, int>>& ssd_values, int n) {
rocksdb::WriteOptions options;
options.disableWAL = true;
rocksdb::WriteBatch batch(n * 128);
for (int i = 0; i < n; i++) {
batch.Put(_handles[id],
rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second),
rocksdb::Slice(ssd_values[i].first, ssd_values[i].second));
}
rocksdb::Status s = _db->Write(options, &batch);
assert(s.ok());
return 0;
}
int get(int id, const char* key, int key_len, std::string& value) {
rocksdb::Status s = _db->Get(rocksdb::ReadOptions(), _handles[id],
rocksdb::Slice(key, key_len), &value);
if (s.IsNotFound()) {
return 1;
}
assert(s.ok());
return 0;
}
int del_data(int id, const char* key, int key_len) {
rocksdb::WriteOptions options;
options.disableWAL = true;
rocksdb::Status s =
_db->Delete(options, _handles[id], rocksdb::Slice(key, key_len));
assert(s.ok());
return 0;
}
int flush(int id) {
rocksdb::Status s = _db->Flush(rocksdb::FlushOptions(), _handles[id]);
assert(s.ok());
return 0;
}
rocksdb::Iterator* get_iterator(int id) {
return _db->NewIterator(rocksdb::ReadOptions(), _handles[id]);
}
int get_estimate_key_num(uint64_t& num_keys) {
_db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys);
return 0;
}
private:
std::vector<rocksdb::ColumnFamilyHandle*> _handles;
rocksdb::DB* _db;
};
}  // namespace distributed
}  // namespace paddle
#endif
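
A minimal sketch of driving this handler (the same sequence SSDSparseTable uses below: one column family per shard, raw byte slices for keys and values; the path is illustrative):

    #include <cstdint>
    #include <string>

    int DemoRocksDBRoundTrip() {
      auto* db = paddle::distributed::RocksDBHandler::GetInstance();
      db->initialize("/tmp/demo_rocksdb", /*colnum=*/4);  // one CF per shard

      uint64_t key = 42;
      float values[3] = {0.1f, 0.2f, 0.3f};
      db->put(/*id=*/0, reinterpret_cast<char*>(&key), sizeof(key),
              reinterpret_cast<char*>(values), sizeof(values));

      std::string out;
      if (db->get(/*id=*/0, reinterpret_cast<char*>(&key), sizeof(key), out) != 0) {
        return 1;  // get() returns 1 when the key is absent
      }
      return db->flush(/*id=*/0);
    }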
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/distributed/table/ssd_sparse_table.h"
DEFINE_string(rocksdb_path, "database", "path of sparse table rocksdb file");
namespace paddle {
namespace distributed {
int32_t SSDSparseTable::initialize() {
_shards_task_pool.resize(task_pool_size_);
for (int i = 0; i < _shards_task_pool.size(); ++i) {
_shards_task_pool[i].reset(new ::ThreadPool(1));
}
sync = _config.common().sync();
VLOG(1) << "table " << _config.common().table_name() << " is sync: " << sync;
_global_lr = new float(1.0);
auto common = _config.common();
int size = static_cast<int>(common.params().size());
size_t offset = 0;
for (int x = 0; x < size; ++x) {
auto& varname = common.params()[x];
auto& dim = common.dims()[x];
value_idx_[varname] = x;
value_names_.push_back(varname);
value_dims_.push_back(dim);
value_offsets_.push_back(offset);
initializer_attrs_.push_back(common.initializers()[x]);
if (varname == "Param") {
param_dim_ = dim;
param_offset_ = offset;
}
offset += dim;
}
initialize_value();
initialize_optimizer();
initialize_recorder();
_db = paddle::distributed::RocksDBHandler::GetInstance();
_db->initialize(FLAGS_rocksdb_path, task_pool_size_);
return 0;
}
int32_t SSDSparseTable::pull_sparse(float* pull_values,
const PullSparseValue& pull_value) {
auto shard_num = task_pool_size_;
std::vector<std::future<int>> tasks(shard_num);
for (int shard_id = 0; shard_id < shard_num; ++shard_id) {
tasks[shard_id] = _shards_task_pool[shard_id]->enqueue(
[this, shard_id, shard_num, &pull_value, &pull_values]() -> int {
auto& block = shard_values_[shard_id];
std::vector<int> offsets;
pull_value.Fission(shard_id, shard_num, &offsets);
for (auto& offset : offsets) {
auto feasign = pull_value.feasigns_[offset];
auto frequencie = pull_value.frequencies_[offset];
float* embedding = nullptr;
auto iter = block->Find(feasign);
// in mem
if (iter != block->end()) {
embedding = iter->second->data_.data();
if (pull_value.is_training_) {
block->AttrUpdate(iter->second, frequencie);
}
} else {
// need create
std::string tmp_str("");
if (_db->get(shard_id, (char*)&feasign, sizeof(uint64_t),
tmp_str) > 0) {
embedding = block->Init(feasign, true, frequencie);
} else {
// in db
int data_size = tmp_str.size() / sizeof(float);
int value_size = block->value_length_;
float* db_value = (float*)const_cast<char*>(tmp_str.c_str());
VALUE* value = block->InitGet(feasign);
// copy to mem
memcpy(value->data_.data(), db_value,
value_size * sizeof(float));
embedding = value->data_.data();  // use the in-memory copy; db_value aliases tmp_str
// param, count, unseen_day
value->count_ = db_value[value_size];
value->unseen_days_ = db_value[value_size + 1];
value->is_entry_ = db_value[value_size + 2];
if (pull_value.is_training_) {
block->AttrUpdate(value, frequencie);
}
}
}
std::copy_n(embedding + param_offset_, param_dim_,
pull_values + param_dim_ * offset);
}
return 0;
});
}
for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
tasks[shard_id].wait();
}
return 0;
}
int32_t SSDSparseTable::pull_sparse_ptr(char** pull_values,
const uint64_t* keys, size_t num) {
auto shard_num = task_pool_size_;
std::vector<std::future<int>> tasks(shard_num);
std::vector<std::vector<uint64_t>> offset_bucket;
offset_bucket.resize(task_pool_size_);
for (int x = 0; x < num; ++x) {
auto y = keys[x] % task_pool_size_;
offset_bucket[y].push_back(x);
}
for (int shard_id = 0; shard_id < shard_num; ++shard_id) {
tasks[shard_id] = _shards_task_pool[shard_id]->enqueue(
[this, shard_id, &keys, &pull_values, &offset_bucket]() -> int {
auto& block = shard_values_[shard_id];
auto& offsets = offset_bucket[shard_id];
for (auto& offset : offsets) {
auto feasign = keys[offset];
auto iter = block->Find(feasign);
VALUE* value = nullptr;
// in mem
if (iter != block->end()) {
value = iter->second;
} else {
// need create
std::string tmp_str("");
if (_db->get(shard_id, (char*)&feasign, sizeof(uint64_t),
tmp_str) > 0) {
value = block->InitGet(feasign);
} else {
// in db
int data_size = tmp_str.size() / sizeof(float);
int value_size = block->value_length_;
float* db_value = (float*)const_cast<char*>(tmp_str.c_str());
value = block->InitGet(feasign);
// copy to mem
memcpy(value->data_.data(), db_value,
value_size * sizeof(float));
// param, count, unseen_day
value->count_ = db_value[value_size];
value->unseen_days_ = db_value[value_size + 1];
value->is_entry_ = db_value[value_size + 2];
}
}
pull_values[offset] = (char*)value;
}
return 0;
});
}
for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
tasks[shard_id].wait();
}
return 0;
}
int32_t SSDSparseTable::shrink(const std::string& param) { return 0; }
int32_t SSDSparseTable::update_table() {
int count = 0;
int value_size = shard_values_[0]->value_length_;
int db_size = 3 + value_size;
float tmp_value[db_size];
for (size_t i = 0; i < task_pool_size_; ++i) {
auto& block = shard_values_[i];
for (auto& table : block->values_) {
for (auto iter = table.begin(); iter != table.end();) {
VALUE* value = iter->second;
if (value->unseen_days_ >= 1) {
tmp_value[value_size] = value->count_;
tmp_value[value_size + 1] = value->unseen_days_;
tmp_value[value_size + 2] = value->is_entry_;
memcpy(tmp_value, value->data_.data(), sizeof(float) * value_size);
_db->put(i, (char*)&(iter->first), sizeof(uint64_t), (char*)tmp_value,
db_size * sizeof(float));
count++;
butil::return_object(iter->second);
iter = table.erase(iter);
} else {
++iter;
}
}
}
_db->flush(i);
}
VLOG(1) << "Table>> update count: " << count;
return 0;
}
int64_t SSDSparseTable::SaveValueToText(std::ostream* os,
std::shared_ptr<ValueBlock> block,
std::shared_ptr<::ThreadPool> pool,
const int mode, int shard_id) {
int64_t save_num = 0;
for (auto& table : block->values_) {
for (auto& value : table) {
if (mode == SaveMode::delta && !value.second->need_save_) {
continue;
}
++save_num;
std::stringstream ss;
auto* vs = value.second->data_.data();
auto id = value.first;
ss << id << "\t" << value.second->count_ << "\t"
<< value.second->unseen_days_ << "\t" << value.second->is_entry_
<< "\t";
for (int i = 0; i < block->value_length_ - 1; i++) {
ss << std::to_string(vs[i]) << ",";
}
ss << std::to_string(vs[block->value_length_ - 1]);
ss << "\n";
os->write(ss.str().c_str(), sizeof(char) * ss.str().size());
if (mode == SaveMode::base || mode == SaveMode::delta) {
value.second->need_save_ = false;
}
}
}
if (mode != 1) {
int value_size = block->value_length_;
auto* it = _db->get_iterator(shard_id);
for (it->SeekToFirst(); it->Valid(); it->Next()) {
float* value = (float*)const_cast<char*>(it->value().data());
std::stringstream ss;
ss << *((uint64_t*)const_cast<char*>(it->key().data())) << "\t"
<< value[value_size] << "\t" << value[value_size + 1] << "\t"
<< value[value_size + 2] << "\t";
for (int i = 0; i < block->value_length_ - 1; i++) {
ss << std::to_string(value[i]) << ",";
}
ss << std::to_string(value[block->value_length_ - 1]);
ss << "\n";
os->write(ss.str().c_str(), sizeof(char) * ss.str().size());
}
}
return save_num;
}
int32_t SSDSparseTable::load(const std::string& path,
const std::string& param) {
rwlock_->WRLock();
VLOG(3) << "ssd sparse table load with " << path << " with meta " << param;
LoadFromText(path, param, _shard_idx, _shard_num, task_pool_size_,
&shard_values_);
rwlock_->UNLock();
return 0;
}
int64_t SSDSparseTable::LoadFromText(
const std::string& valuepath, const std::string& metapath,
const int pserver_id, const int pserver_num, const int local_shard_num,
std::vector<std::shared_ptr<ValueBlock>>* blocks) {
Meta meta = Meta(metapath);
int num_lines = 0;
std::ifstream file(valuepath);
std::string line;
int value_size = shard_values_[0]->value_length_;
int db_size = 3 + value_size;
float tmp_value[db_size];
while (std::getline(file, line)) {
auto values = paddle::string::split_string<std::string>(line, "\t");
auto id = lexical_cast<uint64_t>(values[0]);
if (id % pserver_num != pserver_id) {
VLOG(3) << "will not load " << values[0] << " from " << valuepath
<< ", please check id distribution";
continue;
}
auto shard_id = id % local_shard_num;
auto block = blocks->at(shard_id);
std::vector<std::vector<float>> kvalues;
ProcessALine(values, meta, id, &kvalues);
block->Init(id, false);
VALUE* value_instant = block->GetValue(id);
if (values.size() == 5) {
value_instant->count_ = lexical_cast<int>(values[1]);
value_instant->unseen_days_ = lexical_cast<int>(values[2]);
value_instant->is_entry_ =
static_cast<bool>(lexical_cast<int>(values[3]));
}
std::vector<float*> block_values = block->Get(id, meta.names, meta.dims);
auto blas = GetBlas<float>();
for (int x = 0; x < meta.names.size(); ++x) {
blas.VCOPY(meta.dims[x], kvalues[x].data(), block_values[x]);
}
VLOG(3) << "loading: " << id
        << " unseen day: " << value_instant->unseen_days_;
if (value_instant->unseen_days_ >= 1) {
tmp_value[value_size] = value_instant->count_;
tmp_value[value_size + 1] = value_instant->unseen_days_;
tmp_value[value_size + 2] = value_instant->is_entry_;
memcpy(tmp_value, value_instant->data_.data(),
sizeof(float) * value_size);
_db->put(shard_id, (char*)&(id), sizeof(uint64_t), (char*)tmp_value,
db_size * sizeof(float));
block->erase(id);
}
}
return 0;
}
}  // namespace distributed
} // namespace paddle
#endif
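
update_table() and LoadFromText() above both persist a feature as value_size embedding floats followed by three bookkeeping floats (count, unseen_days, is_entry). A small sketch of that flat encoding:

    #include <cstring>
    #include <vector>

    // Layout written to RocksDB by the code above:
    // [ data_[0 .. value_size) | count | unseen_days | is_entry ]
    std::vector<float> EncodeForDB(const std::vector<float>& data, float count,
                                   float unseen_days, float is_entry) {
      const int value_size = static_cast<int>(data.size());
      std::vector<float> buf(value_size + 3);
      std::memcpy(buf.data(), data.data(), sizeof(float) * value_size);
      buf[value_size] = count;
      buf[value_size + 1] = unseen_days;
      buf[value_size + 2] = is_entry;
      return buf;
    }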
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/distributed/table/common_sparse_table.h"
#include "paddle/fluid/distributed/table/depends/rocksdb_warpper.h"
#ifdef PADDLE_WITH_HETERPS
namespace paddle {
namespace distributed {
class SSDSparseTable : public CommonSparseTable {
public:
SSDSparseTable() {}
virtual ~SSDSparseTable() {}
virtual int32_t initialize() override;
void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common,
const size_t shard_idx, const int64_t total);
int64_t SaveValueToText(std::ostream* os, std::shared_ptr<ValueBlock> block,
std::shared_ptr<::ThreadPool> pool, const int mode,
int shard_id);
virtual int64_t LoadFromText(
const std::string& valuepath, const std::string& metapath,
const int pserver_id, const int pserver_num, const int local_shard_num,
std::vector<std::shared_ptr<ValueBlock>>* blocks);
virtual int32_t load(const std::string& path, const std::string& param);
// exchange data
virtual int32_t update_table();
virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value);
virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys,
size_t num);
virtual int32_t flush() override { return 0; }
virtual int32_t shrink(const std::string& param) override;
virtual void clear() override {}
private:
RocksDBHandler* _db;
int64_t _cache_tk_size;
};
}  // namespace distributed
} // namespace paddle
#endif
@@ -21,6 +21,9 @@
 #include "paddle/fluid/distributed/table/common_graph_table.h"
 #include "paddle/fluid/distributed/table/common_sparse_table.h"
 #include "paddle/fluid/distributed/table/sparse_geo_table.h"
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/distributed/table/ssd_sparse_table.h"
+#endif
 #include "paddle/fluid/distributed/table/tensor_accessor.h"
 #include "paddle/fluid/distributed/table/tensor_table.h"

@@ -29,6 +32,9 @@ namespace distributed {
 REGISTER_PSCORE_CLASS(Table, GraphTable);
 REGISTER_PSCORE_CLASS(Table, CommonDenseTable);
 REGISTER_PSCORE_CLASS(Table, CommonSparseTable);
+#ifdef PADDLE_WITH_HETERPS
+REGISTER_PSCORE_CLASS(Table, SSDSparseTable);
+#endif
 REGISTER_PSCORE_CLASS(Table, SparseGeoTable);
 REGISTER_PSCORE_CLASS(Table, BarrierTable);
 REGISTER_PSCORE_CLASS(Table, TensorTable);
......
...@@ -16,15 +16,14 @@ limitations under the License. */ ...@@ -16,15 +16,14 @@ limitations under the License. */
#include <cstdint> #include <cstdint>
#include <string> #include <string>
#include "complex128.h" // NOLINT #include "complex.h" // NOLINT
#include "complex64.h" // NOLINT
#include "ext_exception.h" // NOLINT #include "ext_exception.h" // NOLINT
#include "float16.h" // NOLINT #include "float16.h" // NOLINT
namespace paddle { namespace paddle {
using complex64 = paddle::platform::complex64; using complex64 = paddle::platform::complex<float>;
using complex128 = paddle::platform::complex128; using complex128 = paddle::platform::complex<double>;
using float16 = paddle::platform::float16; using float16 = paddle::platform::float16;
enum class DataType { enum class DataType {
......
...@@ -19,8 +19,7 @@ limitations under the License. */ ...@@ -19,8 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/custom_tensor_utils.h" #include "paddle/fluid/framework/custom_tensor_utils.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
...@@ -238,9 +237,9 @@ template PD_DLL_DECL Tensor ...@@ -238,9 +237,9 @@ template PD_DLL_DECL Tensor
Tensor::copy_to<int16_t>(const PlaceType &target_place) const; Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
template PD_DLL_DECL Tensor template PD_DLL_DECL Tensor
Tensor::copy_to<bool>(const PlaceType &target_place) const; Tensor::copy_to<bool>(const PlaceType &target_place) const;
template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex64>( template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex<float>>(
const PlaceType &target_place) const; const PlaceType &target_place) const;
template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex128>( template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex<double>>(
const PlaceType &target_place) const; const PlaceType &target_place) const;
template PD_DLL_DECL Tensor template PD_DLL_DECL Tensor
Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const; Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const;
...@@ -253,10 +252,10 @@ template PD_DLL_DECL uint8_t *Tensor::data<uint8_t>() const; ...@@ -253,10 +252,10 @@ template PD_DLL_DECL uint8_t *Tensor::data<uint8_t>() const;
template PD_DLL_DECL int8_t *Tensor::data<int8_t>() const; template PD_DLL_DECL int8_t *Tensor::data<int8_t>() const;
template PD_DLL_DECL int16_t *Tensor::data<int16_t>() const; template PD_DLL_DECL int16_t *Tensor::data<int16_t>() const;
template PD_DLL_DECL bool *Tensor::data<bool>() const; template PD_DLL_DECL bool *Tensor::data<bool>() const;
template PD_DLL_DECL paddle::platform::complex64 * template PD_DLL_DECL paddle::platform::complex<float>
Tensor::data<paddle::platform::complex64>() const; *Tensor::data<paddle::platform::complex<float>>() const;
template PD_DLL_DECL paddle::platform::complex128 * template PD_DLL_DECL paddle::platform::complex<double>
Tensor::data<paddle::platform::complex128>() const; *Tensor::data<paddle::platform::complex<double>>() const;
template PD_DLL_DECL paddle::platform::float16 * template PD_DLL_DECL paddle::platform::float16 *
Tensor::data<paddle::platform::float16>() const; Tensor::data<paddle::platform::float16>() const;
...@@ -268,10 +267,10 @@ template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>(); ...@@ -268,10 +267,10 @@ template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>();
template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>(); template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>();
template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>(); template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>();
template PD_DLL_DECL bool *Tensor::mutable_data<bool>(); template PD_DLL_DECL bool *Tensor::mutable_data<bool>();
template PD_DLL_DECL paddle::platform::complex64 * template PD_DLL_DECL paddle::platform::complex<float>
Tensor::mutable_data<paddle::platform::complex64>(); *Tensor::mutable_data<paddle::platform::complex<float>>();
template PD_DLL_DECL paddle::platform::complex128 * template PD_DLL_DECL paddle::platform::complex<double>
Tensor::mutable_data<paddle::platform::complex128>(); *Tensor::mutable_data<paddle::platform::complex<double>>();
template PD_DLL_DECL paddle::platform::float16 * template PD_DLL_DECL paddle::platform::float16 *
Tensor::mutable_data<paddle::platform::float16>(); Tensor::mutable_data<paddle::platform::float16>();
...@@ -289,10 +288,10 @@ template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>( ...@@ -289,10 +288,10 @@ template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>(
template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>( template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>(
const PlaceType &place); const PlaceType &place);
template PD_DLL_DECL bool *Tensor::mutable_data<bool>(const PlaceType &place); template PD_DLL_DECL bool *Tensor::mutable_data<bool>(const PlaceType &place);
template PD_DLL_DECL paddle::platform::complex64 * template PD_DLL_DECL paddle::platform::complex<float> *
Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place); Tensor::mutable_data<paddle::platform::complex<float>>(const PlaceType &place);
template PD_DLL_DECL paddle::platform::complex128 * template PD_DLL_DECL paddle::platform::complex<double> *
Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place); Tensor::mutable_data<paddle::platform::complex<double>>(const PlaceType &place);
template PD_DLL_DECL paddle::platform::float16 * template PD_DLL_DECL paddle::platform::float16 *
Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place); Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
...@@ -356,13 +355,13 @@ Tensor Tensor::cast(const DataType &target_type) const { ...@@ -356,13 +355,13 @@ Tensor Tensor::cast(const DataType &target_type) const {
dst_type, CastDataType<uint8_t>(*tensor, rlt_tensor_, ctx)); dst_type, CastDataType<uint8_t>(*tensor, rlt_tensor_, ctx));
break; break;
case framework::proto::VarType::COMPLEX64: case framework::proto::VarType::COMPLEX64:
framework::VisitDataType( framework::VisitDataType(dst_type,
dst_type, CastDataType<paddle::platform::complex<float>>(
CastDataType<paddle::platform::complex64>(*tensor, rlt_tensor_, ctx)); *tensor, rlt_tensor_, ctx));
break; break;
case framework::proto::VarType::COMPLEX128: case framework::proto::VarType::COMPLEX128:
framework::VisitDataType(dst_type, framework::VisitDataType(dst_type,
CastDataType<paddle::platform::complex128>( CastDataType<paddle::platform::complex<double>>(
*tensor, rlt_tensor_, ctx)); *tensor, rlt_tensor_, ctx));
break; break;
case framework::proto::VarType::FP16: case framework::proto::VarType::FP16:
......
...@@ -27,7 +27,22 @@ add_subdirectory(fleet) ...@@ -27,7 +27,22 @@ add_subdirectory(fleet)
add_subdirectory(io) add_subdirectory(io)
#ddim lib #ddim lib
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
proto_library(op_def_proto SRCS op_def.proto) proto_library(op_def_proto SRCS op_def.proto)
cc_library(op_def_api SRCS op_def_api.cc DEPS op_def_proto)
FILE(GLOB OP_DEF_FILES ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/compat/*.pbtxt)
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt
"namespace { \n"
"const std::unordered_map<std::string, std::string> op_def_map = { \n")
foreach(OP_DEF_FILE ${OP_DEF_FILES})
FILE(READ ${OP_DEF_FILE} OP_DEF_CONTENT)
get_filename_component(OP_NAME ${OP_DEF_FILE} NAME_WE)
FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt
"{\"${OP_NAME}\",R\"(${OP_DEF_CONTENT})\"},\n")
endforeach(OP_DEF_FILE)
FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}")
proto_library(heter_service_proto SRCS heter_service.proto) proto_library(heter_service_proto SRCS heter_service.proto)
proto_library(data_feed_proto SRCS data_feed.proto) proto_library(data_feed_proto SRCS data_feed.proto)
proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
......
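For clarity, the foreach loop above embeds every operators/compat/*.pbtxt file into a single generated C++ source as raw string literals, keyed by operator name. Assuming a lone compat file relu.pbtxt with hypothetical content, the generated op_def.pbtxt would look roughly like:

namespace {
const std::unordered_map<std::string, std::string> op_def_map = {
{"relu",R"(type: "relu"
def {
  inputs { name: "X" }
  outputs { name: "Out" }
})"},
{"",""}};
}

The trailing {"",""} sentinel comes from the final FILE(APPEND) and closes the initializer list.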
...@@ -109,9 +109,9 @@ void GroupTestCopy() { ...@@ -109,9 +109,9 @@ void GroupTestCopy() {
TestCopyTensor<int8_t>(); TestCopyTensor<int8_t>();
VLOG(2) << "uint8 cpu-cpu-gpu-gpu-cpu"; VLOG(2) << "uint8 cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<uint8_t>(); TestCopyTensor<uint8_t>();
VLOG(2) << "complex64 cpu-cpu-gpu-gpu-cpu"; VLOG(2) << "complex<float> cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<paddle::complex64>(); TestCopyTensor<paddle::complex64>();
VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu"; VLOG(2) << "complex<double> cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<paddle::complex128>(); TestCopyTensor<paddle::complex128>();
VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu"; VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<paddle::float16>(); TestCopyTensor<paddle::float16>();
...@@ -132,9 +132,9 @@ void GroupTestCast() { ...@@ -132,9 +132,9 @@ void GroupTestCast() {
TestCast<uint8_t>(paddle::DataType::FLOAT32); TestCast<uint8_t>(paddle::DataType::FLOAT32);
VLOG(2) << "float cast"; VLOG(2) << "float cast";
TestCast<float>(paddle::DataType::FLOAT32); TestCast<float>(paddle::DataType::FLOAT32);
VLOG(2) << "complex64 cast"; VLOG(2) << "complex<float> cast";
TestCast<paddle::complex64>(paddle::DataType::FLOAT32); TestCast<paddle::complex64>(paddle::DataType::FLOAT32);
VLOG(2) << "complex128 cast"; VLOG(2) << "complex<double> cast";
TestCast<paddle::complex128>(paddle::DataType::FLOAT32); TestCast<paddle::complex128>(paddle::DataType::FLOAT32);
VLOG(2) << "float16 cast"; VLOG(2) << "float16 cast";
TestCast<paddle::float16>(paddle::DataType::FLOAT16); TestCast<paddle::float16>(paddle::DataType::FLOAT16);
......
...@@ -26,6 +26,13 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, ...@@ -26,6 +26,13 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
platform::errors::Unavailable("Currently, model parallelism is only " platform::errors::Unavailable("Currently, model parallelism is only "
"supported between CPU and CUDA.")); "supported between CPU and CUDA."));
// NOTE(zhiqiu): Special case for CPU->NPU, avoid stream sync.
if (platform::is_cpu_place(in.place()) && platform::is_npu_place(dst_place)) {
TensorCopy(in, dst_place,
*platform::DeviceContextPool::Instance().Get(dst_place), out);
return;
}
// NOTE(yy): TransDataDevice should wait for computation of input. // NOTE(yy): TransDataDevice should wait for computation of input.
if (!platform::is_cuda_pinned_place(in.place())) { if (!platform::is_cuda_pinned_place(in.place())) {
platform::DeviceContextPool::Instance().Get(in.place())->Wait(); platform::DeviceContextPool::Instance().Get(in.place())->Wait();
......
...@@ -638,25 +638,34 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe( ...@@ -638,25 +638,34 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe(
const char* str = reader.get(); const char* str = reader.get();
std::string line = std::string(str); std::string line = std::string(str);
// VLOG(3) << line;
char* endptr = const_cast<char*>(str); char* endptr = const_cast<char*>(str);
int pos = 0; int pos = 0;
for (size_t i = 0; i < use_slots_index_.size(); ++i) { for (size_t i = 0; i < use_slots_index_.size(); ++i) {
int idx = use_slots_index_[i]; int idx = use_slots_index_[i];
int num = strtol(&str[pos], &endptr, 10); int num = strtol(&str[pos], &endptr, 10);
PADDLE_ENFORCE_NE(
num, 0, if (num <= 0) {
platform::errors::InvalidArgument( std::stringstream ss;
"The number of ids can not be zero, you need padding " ss << "\n\nGot unexpected input, maybe something wrong with it.\n";
"it in data generator; or if there is something wrong with " ss << "\n----------------------\n";
"the data, please check if the data contains unresolvable " ss << "The Origin Input Data:\n";
"characters.\nplease check this error line: %s, \n Specifically, " ss << "----------------------\n";
"something wrong happened(the length of this slot's feasign is 0)"
"when we parse the %d th slots." ss << line << "\n";
"Maybe something wrong around this slot"
"\nWe detect the feasign number of this slot is %d, " ss << "\n----------------------\n";
"which is illegal.", ss << "Some Possible Errors:\n";
str, i, num)); ss << "----------------------\n";
ss << "1. The number of ids can not be zero, you need padding.\n";
ss << "2. The input data contains unresolvable characters.\n";
ss << "3. We detect the slot " << i << "'s feasign number is " << num
<< " which is illegal.\n";
ss << "\n";
PADDLE_THROW(platform::errors::InvalidArgument(ss.str()));
}
if (idx != -1) { if (idx != -1) {
(*instance)[idx].Init(all_slots_type_[i]); (*instance)[idx].Init(all_slots_type_[i]);
if ((*instance)[idx].GetType()[0] == 'f') { // float if ((*instance)[idx].GetType()[0] == 'f') { // float
......
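For context on what the parser expects: in the MultiSlot format each line carries, for every used slot, a feasign count followed by that many values, which is why a count that parses to num <= 0 aborts with the report above. As an illustrative example (slot layout depends on the data generator configuration), a line "2 17 23 1 0.5" encodes slot 0 with two ids (17, 23) and slot 1 with one value (0.5).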
...@@ -19,8 +19,6 @@ limitations under the License. */ ...@@ -19,8 +19,6 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/eigen_ext.h" #include "paddle/fluid/platform/eigen_ext.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
...@@ -28,8 +26,8 @@ limitations under the License. */ ...@@ -28,8 +26,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
struct bfloat16; struct bfloat16;
struct complex128; template <typename T>
struct complex64; struct complex;
struct float16; struct float16;
template <typename T> template <typename T>
struct complex; struct complex;
...@@ -53,35 +51,31 @@ struct DataTypeTrait<void> { ...@@ -53,35 +51,31 @@ struct DataTypeTrait<void> {
#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ #define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \
callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); callback(cpp_type, ::paddle::framework::proto::VarType::proto_type);
#define _ForEachDataType_(callback) \ #define _ForEachDataType_(callback) \
_ForEachDataTypeHelper_(callback, float, FP32); \ _ForEachDataTypeHelper_(callback, float, FP32); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \
_ForEachDataTypeHelper_(callback, double, FP64); \ _ForEachDataTypeHelper_(callback, double, FP64); \
_ForEachDataTypeHelper_(callback, int, INT32); \ _ForEachDataTypeHelper_(callback, int, INT32); \
_ForEachDataTypeHelper_(callback, int64_t, INT64); \ _ForEachDataTypeHelper_(callback, int64_t, INT64); \
_ForEachDataTypeHelper_(callback, bool, BOOL); \ _ForEachDataTypeHelper_(callback, bool, BOOL); \
_ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \
_ForEachDataTypeHelper_(callback, int16_t, INT16); \ _ForEachDataTypeHelper_(callback, int16_t, INT16); \
_ForEachDataTypeHelper_(callback, int8_t, INT8); \ _ForEachDataTypeHelper_(callback, int8_t, INT8); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex<float>, \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex<float>, \
COMPLEX64); \ COMPLEX64); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex<double>, \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex<double>, \
COMPLEX128); \ COMPLEX128);
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128); #define _ForEachDataTypeSmall_(callback) \
_ForEachDataTypeHelper_(callback, float, FP32); \
#define _ForEachDataTypeSmall_(callback) \ _ForEachDataTypeHelper_(callback, double, FP64); \
_ForEachDataTypeHelper_(callback, float, FP32); \ _ForEachDataTypeHelper_(callback, int, INT32); \
_ForEachDataTypeHelper_(callback, double, FP64); \ _ForEachDataTypeHelper_(callback, int64_t, INT64); \
_ForEachDataTypeHelper_(callback, int, INT32); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex<float>, \
_ForEachDataTypeHelper_(callback, int64_t, INT64); \ COMPLEX64); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex<float>, \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex<double>, \
COMPLEX64); \ COMPLEX128);
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex<double>, \
COMPLEX128); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128);
// For the use of thrust, as index-type elements can be only integers. // For the use of thrust, as index-type elements can be only integers.
#define _ForEachDataTypeTiny_(callback) \ #define _ForEachDataTypeTiny_(callback) \
......
...@@ -119,12 +119,12 @@ void TransComplexToReal(const proto::VarType::Type& dst_type, ...@@ -119,12 +119,12 @@ void TransComplexToReal(const proto::VarType::Type& dst_type,
// complex -> real // complex -> real
switch (src_type) { switch (src_type) {
case proto::VarType::COMPLEX64: case proto::VarType::COMPLEX64:
framework::VisitDataType(dst_type, framework::VisitDataType(
CastDataType<platform::complex64>(in, out, ctx)); dst_type, CastDataType<platform::complex<float>>(in, out, ctx));
break; break;
case proto::VarType::COMPLEX128: case proto::VarType::COMPLEX128:
framework::VisitDataType( framework::VisitDataType(
dst_type, CastDataType<platform::complex128>(in, out, ctx)); dst_type, CastDataType<platform::complex<double>>(in, out, ctx));
break; break;
default: default:
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
......
...@@ -159,10 +159,6 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num, ...@@ -159,10 +159,6 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num,
#pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in) #pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in)
#pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \ #pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \
omp_in) omp_in)
#pragma omp declare reduction(+ : paddle::platform::complex64 : omp_out += \
omp_in)
#pragma omp declare reduction(+ : paddle::platform::complex128 : omp_out += \
omp_in)
#pragma omp declare reduction(+ : paddle::platform::complex < \ #pragma omp declare reduction(+ : paddle::platform::complex < \
float > : omp_out += omp_in) float > : omp_out += omp_in)
#pragma omp declare reduction(+ : paddle::platform::complex < \ #pragma omp declare reduction(+ : paddle::platform::complex < \
...@@ -222,58 +218,6 @@ void CheckNanInf<paddle::platform::bfloat16>( ...@@ -222,58 +218,6 @@ void CheckNanInf<paddle::platform::bfloat16>(
} }
} }
template <>
void CheckNanInf<paddle::platform::complex64>(
const paddle::platform::complex64* value, const size_t numel, int print_num,
const std::string& op_type, const std::string& var_name) {
float real_sum = 0.0f;
#pragma omp parallel for reduction(+ : real_sum)
for (size_t i = 0; i < numel; ++i) {
real_sum += (value[i].real - value[i].real);
}
float imag_sum = 0.0f;
#pragma omp parallel for reduction(+ : imag_sum)
for (size_t i = 0; i < numel; ++i) {
imag_sum += (value[i].imag - value[i].imag);
}
if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) ||
std::isinf(imag_sum)) {
// hot fix for compile failed in gcc4.8
// here also need print detail info of nan or inf later
PADDLE_THROW(platform::errors::PreconditionNotMet(
"There are `nan` or `inf` in tensor (%s) of operator (%s).", var_name,
op_type));
}
}
template <>
void CheckNanInf<paddle::platform::complex128>(
const paddle::platform::complex128* value, const size_t numel,
int print_num, const std::string& op_type, const std::string& var_name) {
double real_sum = 0.0;
#pragma omp parallel for reduction(+ : real_sum)
for (size_t i = 0; i < numel; ++i) {
real_sum += (value[i].real - value[i].real);
}
double imag_sum = 0.0;
#pragma omp parallel for reduction(+ : imag_sum)
for (size_t i = 0; i < numel; ++i) {
imag_sum += (value[i].imag - value[i].imag);
}
if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) ||
std::isinf(imag_sum)) {
// hot fix for compile failed in gcc4.8
// here also need print detail info of nan or inf later
PADDLE_THROW(platform::errors::PreconditionNotMet(
"There are `nan` or `inf` in tensor (%s) of operator (%s).", var_name,
op_type));
}
}
template <> template <>
void CheckNanInf<paddle::platform::complex<float>>( void CheckNanInf<paddle::platform::complex<float>>(
const paddle::platform::complex<float>* value, const size_t numel, const paddle::platform::complex<float>* value, const size_t numel,
......
...@@ -29,9 +29,7 @@ template <typename T> ...@@ -29,9 +29,7 @@ template <typename T>
static ::DLDataType GetDLDataTypeCode() { static ::DLDataType GetDLDataTypeCode() {
::DLDataType dtype; ::DLDataType dtype;
if (std::is_same<T, platform::complex<float>>::value || if (std::is_same<T, platform::complex<float>>::value ||
std::is_same<T, platform::complex<double>>::value || std::is_same<T, platform::complex<double>>::value) {
std::is_same<T, platform::complex64>::value ||
std::is_same<T, platform::complex128>::value) {
// The current dlpack library version is v0.2, and does not define // The current dlpack library version is v0.2, and does not define
// kDLComplex value. But kDLComplex is defined by 5U in v0.4, so we set // kDLComplex value. But kDLComplex is defined by 5U in v0.4, so we set
// dtype.code to 5U directly here. After the dlpack library version being // dtype.code to 5U directly here. After the dlpack library version being
......
...@@ -29,9 +29,7 @@ namespace { // NOLINT ...@@ -29,9 +29,7 @@ namespace { // NOLINT
template <typename T> template <typename T>
constexpr uint8_t GetDLDataTypeCode() { constexpr uint8_t GetDLDataTypeCode() {
if (std::is_same<T, platform::complex<float>>::value || if (std::is_same<T, platform::complex<float>>::value ||
std::is_same<T, platform::complex<double>>::value || std::is_same<T, platform::complex<double>>::value) {
std::is_same<T, platform::complex64>::value ||
std::is_same<T, platform::complex128>::value) {
return static_cast<uint8_t>(5); return static_cast<uint8_t>(5);
} }
......
...@@ -50,7 +50,7 @@ if (WITH_TESTING) ...@@ -50,7 +50,7 @@ if (WITH_TESTING)
endif(WITH_TESTING) endif(WITH_TESTING)
cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PATTERN_DETECTOR_DEPS}) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PATTERN_DETECTOR_DEPS})
cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector) cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector op_def_api)
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor)
cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass) cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass)
cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass)
...@@ -171,7 +171,7 @@ if (WITH_MKLDNN) ...@@ -171,7 +171,7 @@ if (WITH_MKLDNN)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util)
cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util) cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util)
cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util) cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util)
set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context) set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context eigen_function)
if (WITH_GPU OR WITH_ROCM) if (WITH_GPU OR WITH_ROCM)
set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv) set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv)
endif() endif()
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <string> #include <string>
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -23,6 +23,65 @@ namespace paddle { ...@@ -23,6 +23,65 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
FCFusePass::FCFusePass() {
AddOpCompat(OpCompat("mul"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("x_num_col_dims")
.IsNumGE(1)
.End()
.AddAttr("y_num_col_dims")
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("axis")
.End();
AddOpCompat(OpCompat("relu"))
.AddInput("X")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End();
AddOpCompat(OpCompat("fc"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("W")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("in_num_col_dims")
.IsNumGE(1)
.End()
.AddAttr("activation_type")
.IsStringIn({"relu", ""})
.End();
}
void FCFusePass::ApplyImpl(ir::Graph* graph) const { void FCFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
...@@ -52,6 +111,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { ...@@ -52,6 +111,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
LOG(WARNING) << "The subgraph is empty."; LOG(WARNING) << "The subgraph is empty.";
return; return;
} }
if (!IsCompat(subgraph, g)) {
LOG(WARNING) << "Pass in op compat failed.";
return;
}
VLOG(4) << "handle FC fuse"; VLOG(4) << "handle FC fuse";
GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
...@@ -159,6 +222,11 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { ...@@ -159,6 +222,11 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
} }
desc.Flush(); desc.Flush();
if (!IsCompat(desc)) {
LOG(WARNING) << "Fc fuse pass in out fc op compat failed.";
return;
}
auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied.
if (with_relu) { if (with_relu) {
GraphSafeRemoveNodes( GraphSafeRemoveNodes(
......
...@@ -30,6 +30,7 @@ class Graph; ...@@ -30,6 +30,7 @@ class Graph;
class FCFusePass : public FusePassBase { class FCFusePass : public FusePassBase {
public: public:
FCFusePass();
virtual ~FCFusePass() {} virtual ~FCFusePass() {}
protected: protected:
......
...@@ -15,4 +15,4 @@ cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_ ...@@ -15,4 +15,4 @@ cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_
cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass) cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass)
cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op) cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op eigen_function)
...@@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <memory>
#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" #include "paddle/fluid/framework/ir/op_compat_sensible_pass.h"
#include <memory>
#include <mutex>
#include <unordered_map>
#include "paddle/fluid/framework/op_def_api.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
...@@ -50,18 +53,17 @@ AttrCompat& AttrCompat::IsIntIn(const std::set<int>& candidates) { ...@@ -50,18 +53,17 @@ AttrCompat& AttrCompat::IsIntIn(const std::set<int>& candidates) {
return *this; return *this;
} }
//! Todo: append the definition.
AttrCompat& AttrCompat::IsLeftDefault() { AttrCompat& AttrCompat::IsLeftDefault() {
const std::string& op_name = op_compat_->Name(); const std::string& op_name = op_compat_->Name();
if (!OpInfoMap::Instance().Has(op_name)) { if (!OpInfoMap::Instance().Has(op_name)) {
VLOG(3) << "Op (" << op_name << ") is not registered!"; LOG(WARNING) << "Op (" << op_name << ") is not registered!";
conditions_.emplace_back([](const Attribute& attr) { return false; }); conditions_.emplace_back([](const Attribute& attr) { return false; });
return *this; return *this;
} }
const OpInfo& op_info = OpInfoMap::Instance().Get(op_name); const OpInfo& op_info = OpInfoMap::Instance().Get(op_name);
const AttributeMap attrs = op_info.Checker()->GetAttrsDefaultValuesMap(); const AttributeMap attrs = op_info.Checker()->GetAttrsDefaultValuesMap();
if (attrs.find(attr_name_) == attrs.end()) { if (attrs.find(attr_name_) == attrs.end()) {
VLOG(3) << "Op (" << op_name << ") has no default attr:" << attr_name_; LOG(WARNING) << "Op (" << op_name << ") has no default attr:" << attr_name_;
conditions_.emplace_back([](const Attribute& attr) { return false; }); conditions_.emplace_back([](const Attribute& attr) { return false; });
} else { } else {
Attribute default_attr = attrs.at(attr_name_); Attribute default_attr = attrs.at(attr_name_);
...@@ -77,6 +79,10 @@ bool AttrCompat::operator()(const OpDesc& op_desc) { ...@@ -77,6 +79,10 @@ bool AttrCompat::operator()(const OpDesc& op_desc) {
return true; return true;
} }
if (!op_desc.HasAttr(attr_name_)) { if (!op_desc.HasAttr(attr_name_)) {
if (!optional_) {
LOG(WARNING) << "The non-optional Attr(" << attr_name_ << ") of Op ("
<< op_compat_->Name() << ") was not found!";
}
return optional_; return optional_;
} }
const Attribute attr = op_desc.GetAttr(attr_name_); const Attribute attr = op_desc.GetAttr(attr_name_);
...@@ -149,19 +155,35 @@ InputOrOutputCompat& OpCompat::AddOutput(const std::string& name) { ...@@ -149,19 +155,35 @@ InputOrOutputCompat& OpCompat::AddOutput(const std::string& name) {
} }
bool OpCompat::Judge(const OpDesc& op_desc) { bool OpCompat::Judge(const OpDesc& op_desc) {
if (is_first_judge_) {
is_first_judge_ = false;
const proto::OpDef& op_def = GetOpDef(op_name_);
if (op_def.has_extra()) {
for (const proto::OpDef_AttrDef& attr : op_def.extra().attrs()) {
extra_attrs_.emplace(attr.name());
}
}
}
for (auto& attr_map : op_desc.GetAttrMap()) { for (auto& attr_map : op_desc.GetAttrMap()) {
if (attr_compats_.find(attr_map.first) == attr_compats_.end()) { if (attr_compats_.find(attr_map.first) == attr_compats_.end()) {
if (extra_attrs_.find(attr_map.first) != extra_attrs_.end()) {
continue;
}
if (!AttrCompat(attr_map.first, this).IsLeftDefault()(op_desc)) { if (!AttrCompat(attr_map.first, this).IsLeftDefault()(op_desc)) {
VLOG(3) << "The Attr(" << attr_map.first << ") of Op (" << op_name_ LOG(WARNING)
<< ") not reigistered in OpCompat, not equal to default value!"; << "The Attr(" << attr_map.first << ") of Op (" << op_name_
<< ") not reigistered in OpCompat, not in extra attribute, not "
"equal to default value!";
return false; return false;
} }
} }
} }
for (auto& attr_compat : attr_compats_) { for (auto& attr_compat : attr_compats_) {
if (!attr_compat.second(op_desc)) { if (!attr_compat.second(op_desc)) {
VLOG(3) << " Check the Attr(" << attr_compat.first << ") of Op(" LOG(WARNING) << " Check the Attr(" << attr_compat.first << ") of Op("
<< op_name_ << ") failed!"; << op_name_ << ") failed!";
return false; return false;
} }
} }
...@@ -170,8 +192,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) { ...@@ -170,8 +192,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) {
for (auto& input_desc : inputs_map) { for (auto& input_desc : inputs_map) {
if (input_compats_.find(input_desc.first) == input_compats_.end()) { if (input_compats_.find(input_desc.first) == input_compats_.end()) {
if (!input_desc.second.empty()) { if (!input_desc.second.empty()) {
VLOG(3) << "The Input (" << input_desc.first << ") of Operator (" LOG(WARNING) << "The Input (" << input_desc.first << ") of Operator ("
<< op_name_ << ") not reigistered in OpCompat!"; << op_name_ << ") not reigistered in OpCompat!";
return false; return false;
} }
} }
...@@ -179,14 +201,15 @@ bool OpCompat::Judge(const OpDesc& op_desc) { ...@@ -179,14 +201,15 @@ bool OpCompat::Judge(const OpDesc& op_desc) {
for (auto& input_val : input_compats_) { for (auto& input_val : input_compats_) {
if (inputs_map.find(input_val.first) == inputs_map.end()) { if (inputs_map.find(input_val.first) == inputs_map.end()) {
if (!input_val.second.Optional()) { if (!input_val.second.Optional()) {
VLOG(3) << "The No optional Input (" << input_val.first LOG(WARNING) << "The No optional Input (" << input_val.first
<< ") of Operator (" << op_name_ << ") not find in op_desc!"; << ") of Operator (" << op_name_
<< ") not find in op_desc!";
return false; return false;
} }
} else { } else {
if (!input_val.second(inputs_map.at(input_val.first))) { if (!input_val.second(inputs_map.at(input_val.first))) {
VLOG(3) << "The Input (" << input_val.first << ") of Operator (" LOG(WARNING) << "The Input (" << input_val.first << ") of Operator ("
<< op_name_ << ") compat check failed!"; << op_name_ << ") compat check failed!";
return false; return false;
} }
} }
...@@ -196,8 +219,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) { ...@@ -196,8 +219,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) {
for (auto& output_desc : outputs_map) { for (auto& output_desc : outputs_map) {
if (output_compats_.find(output_desc.first) == output_compats_.end()) { if (output_compats_.find(output_desc.first) == output_compats_.end()) {
if (!output_desc.second.empty()) { if (!output_desc.second.empty()) {
VLOG(3) << "The Output (" << output_desc.first << ") of Operator (" LOG(WARNING) << "The Output (" << output_desc.first << ") of Operator ("
<< op_name_ << ") not reigistered in OpCompat!"; << op_name_ << ") not reigistered in OpCompat!";
return false; return false;
} }
} }
...@@ -205,14 +228,15 @@ bool OpCompat::Judge(const OpDesc& op_desc) { ...@@ -205,14 +228,15 @@ bool OpCompat::Judge(const OpDesc& op_desc) {
for (auto& output_val : output_compats_) { for (auto& output_val : output_compats_) {
if (outputs_map.find(output_val.first) == outputs_map.end()) { if (outputs_map.find(output_val.first) == outputs_map.end()) {
if (!output_val.second.Optional()) { if (!output_val.second.Optional()) {
VLOG(3) << "The No optional Output (" << output_val.first LOG(WARNING) << "The No optional Output (" << output_val.first
<< ") of Operator (" << op_name_ << ") not find in op_desc!"; << ") of Operator (" << op_name_
<< ") not find in op_desc!";
return false; return false;
} }
} else { } else {
if (!output_val.second(outputs_map.at(output_val.first))) { if (!output_val.second(outputs_map.at(output_val.first))) {
VLOG(3) << "The Output (" << output_val.first << ") of Operator (" LOG(WARNING) << "The Output (" << output_val.first << ") of Operator ("
<< op_name_ << ") compat check failed!"; << op_name_ << ") compat check failed!";
return false; return false;
} }
} }
......
...@@ -140,6 +140,8 @@ class OpCompat { ...@@ -140,6 +140,8 @@ class OpCompat {
std::unordered_map<std::string, AttrCompat> attr_compats_; std::unordered_map<std::string, AttrCompat> attr_compats_;
std::unordered_map<std::string, InputOrOutputCompat> input_compats_; std::unordered_map<std::string, InputOrOutputCompat> input_compats_;
std::unordered_map<std::string, InputOrOutputCompat> output_compats_; std::unordered_map<std::string, InputOrOutputCompat> output_compats_;
std::unordered_set<std::string> extra_attrs_;
bool is_first_judge_ = true;
}; };
/** /**
...@@ -203,6 +205,7 @@ class OpCompatSensiblePass : public Pass { ...@@ -203,6 +205,7 @@ class OpCompatSensiblePass : public Pass {
if (!node_pair.second->IsOp()) continue; if (!node_pair.second->IsOp()) continue;
auto op_type = node_pair.second->Op()->Type(); auto op_type = node_pair.second->Op()->Type();
if (!op_compat_judgers_.count(op_type)) { if (!op_compat_judgers_.count(op_type)) {
LOG(WARNING) << op_type << " compat not registered!";
return false; return false;
} }
auto& judger = *op_compat_judgers_.at(op_type); auto& judger = *op_compat_judgers_.at(op_type);
......
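Putting the pieces together, a pass built on this machinery registers the op signatures it expects in its constructor and gates every rewrite on IsCompat. A minimal sketch under assumed names (MyFusePass is hypothetical; the relu signature mirrors the one FCFusePass registers above):

class MyFusePass : public OpCompatSensiblePass {
 public:
  MyFusePass() {
    AddOpCompat(OpCompat("relu"))
        .AddInput("X")
        .IsTensor()
        .End()
        .AddOutput("Out")
        .IsTensor()
        .End();
  }

 protected:
  void ApplyImpl(ir::Graph* graph) const override {
    // Inside the subgraph handler, skip the rewrite when the matched ops
    // do not satisfy the registered constraints:
    //   if (!IsCompat(subgraph, graph)) return;
  }
};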
...@@ -27,7 +27,6 @@ TEST(OpCompatSensiblePass, compatOp) { ...@@ -27,7 +27,6 @@ TEST(OpCompatSensiblePass, compatOp) {
compat.AddAttr("in_num_col_dims") compat.AddAttr("in_num_col_dims")
.IsIntIn({1, 2}) .IsIntIn({1, 2})
.IsNumLE(1) .IsNumLE(1)
.IsLeftDefault()
.End() .End()
.AddAttr("activation_type") .AddAttr("activation_type")
.IsStringIn({"tanh", "sigmoid"}) .IsStringIn({"tanh", "sigmoid"})
...@@ -68,7 +67,7 @@ TEST(OpCompatSensiblePass, compatOp) { ...@@ -68,7 +67,7 @@ TEST(OpCompatSensiblePass, compatOp) {
fc_op.SetOutput("Out", std::vector<std::string>{"test_output"}); fc_op.SetOutput("Out", std::vector<std::string>{"test_output"});
EXPECT_STREQ(compat.Name().c_str(), "fc"); EXPECT_STREQ(compat.Name().c_str(), "fc");
EXPECT_FALSE(compat.Judge(fc_op)); EXPECT_TRUE(compat.Judge(fc_op));
} }
TEST(OpCompatSensiblePass, compatOpAttribute) { TEST(OpCompatSensiblePass, compatOpAttribute) {
...@@ -92,6 +91,18 @@ TEST(OpCompatSensiblePass, compatOpAttribute) { ...@@ -92,6 +91,18 @@ TEST(OpCompatSensiblePass, compatOpAttribute) {
delete info.checker_; delete info.checker_;
} }
TEST(OpCompatSensiblePass, opDefNotFound) {
OpCompat compat("fc_1");
OpDesc fc_op;
compat.Judge(fc_op);
OpCompat compat_1("");
compat_1.Judge(fc_op);
}
TEST(OpCompatSensiblePass, compatOpAttributeOptional) { TEST(OpCompatSensiblePass, compatOpAttributeOptional) {
OpCompat compat("fc"); OpCompat compat("fc");
compat.AddAttr("activation_type") compat.AddAttr("activation_type")
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined _WIN32 || defined __APPLE__
#else
#define _LINUX
#endif
#include "paddle/fluid/framework/op_def_api.h"
#include <fstream>
#include <mutex>
#include <string>
#include <unordered_map>
#ifdef _LINUX
#include <stdio_ext.h>
#include <sys/mman.h>
#include <sys/stat.h>
#endif
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/text_format.h>
#include "glog/logging.h"
#include "io/fs.h"
#include "paddle/fluid/framework/op_def.pb.h"
/*
// op_def.pbtxt
namespace {
const std::unordered_map<std::string, std::string> op_def_map = {...};
}
*/
#include "paddle/fluid/framework/op_def.pbtxt" //NOLINT
namespace paddle {
namespace framework {
const proto::OpDef& GetOpDef(const std::string& op_name) {
static std::unordered_map<std::string, proto::OpDef> ops_definition;
static std::mutex mtx;
if (ops_definition.find(op_name) == ops_definition.end()) {
std::lock_guard<std::mutex> lk(mtx);
if (ops_definition.find(op_name) == ops_definition.end()) {
proto::OpDef op_def;
if (op_def_map.find(op_name) == op_def_map.end()) {
LOG(WARNING) << op_name << ".pbtxt not exist!";
} else {
if (!::google::protobuf::TextFormat::ParseFromString(
op_def_map.at(op_name), &op_def)) {
LOG(WARNING) << "Failed to parse " << op_name;
}
}
if (op_def.type() != op_name) {
LOG(WARNING) << op_name << ".pbtxt has error type :" << op_def.type();
ops_definition.emplace(std::make_pair(op_name, proto::OpDef()));
} else {
ops_definition.emplace(std::make_pair(op_name, std::move(op_def)));
}
}
}
return ops_definition.at(op_name);
}
} // namespace framework
} // namespace paddle
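One caveat in GetOpDef above: the first ops_definition.find runs outside the mutex, so two threads resolving different op names for the first time can read the map while a third inserts into it. A fully guarded variant is sketched below (hypothetical, not part of this patch; it reuses the generated op_def_map and drops the type check for brevity):

const proto::OpDef& GetOpDefLocked(const std::string& op_name) {
  static std::unordered_map<std::string, proto::OpDef> ops_definition;
  static std::mutex mtx;
  std::lock_guard<std::mutex> lk(mtx);  // guard lookup and insertion together
  auto it = ops_definition.find(op_name);
  if (it == ops_definition.end()) {
    proto::OpDef op_def;
    auto text_it = op_def_map.find(op_name);
    if (text_it == op_def_map.end()) {
      LOG(WARNING) << op_name << ".pbtxt does not exist!";
    } else if (!::google::protobuf::TextFormat::ParseFromString(
                   text_it->second, &op_def)) {
      LOG(WARNING) << "Failed to parse " << op_name;
    }
    it = ops_definition.emplace(op_name, std::move(op_def)).first;
  }
  return it->second;
}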
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,11 +12,12 @@ ...@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/increment_op.h" #pragma once
namespace ops = paddle::operators; #include "paddle/fluid/framework/op_def.pb.h"
REGISTER_OP_CUDA_KERNEL(
increment, ops::IncrementKernel<paddle::platform::CUDADeviceContext, float>, namespace paddle {
ops::IncrementKernel<paddle::platform::CUDADeviceContext, double>, namespace framework {
ops::IncrementKernel<paddle::platform::CUDADeviceContext, int>, const proto::OpDef& GetOpDef(const std::string& op_name);
ops::IncrementKernel<paddle::platform::CUDADeviceContext, int64_t>); }
}
...@@ -317,8 +317,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -317,8 +317,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__) __VA_ARGS__)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
#else
#define REGISTER_OP_CUDA_KERNEL(op_type, ...)
#endif
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \ #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
......
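With the guard above, a CUDA kernel registration compiles away entirely on builds without PADDLE_WITH_CUDA or PADDLE_WITH_HIP, so it no longer needs to live in a .cu-only translation unit; this is presumably what allows increment_op.cu above to be deleted. An illustrative use (my_op and MyOpKernel are placeholders):

REGISTER_OP_CUDA_KERNEL(
    my_op, ops::MyOpKernel<paddle::platform::CUDADeviceContext, float>,
    ops::MyOpKernel<paddle::platform::CUDADeviceContext, double>);
// Expands to an empty statement on CPU-only builds.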
...@@ -1532,7 +1532,12 @@ Scope* OperatorWithKernel::PrepareData( ...@@ -1532,7 +1532,12 @@ Scope* OperatorWithKernel::PrepareData(
// the rest iterations to save the elapsed time. // the rest iterations to save the elapsed time.
// We do not support skipping PrepareData in while block, because the Op's // We do not support skipping PrepareData in while block, because the Op's
// input may be changed by subsequent Ops, which may cause an error. // input may be changed by subsequent Ops, which may cause an error.
if (pre_scope_ == &scope && new_scope == nullptr) {
// For inference, ops behind a conditional branch aren't supported well, so
// the prepare-data optimization is disabled conservatively.
bool force_prepare_data = HasAttr("inference_force_prepare_data") &&
Attr<bool>("inference_force_prepare_data");
if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) {
need_prepare_data_ = false; need_prepare_data_ = false;
} }
......
...@@ -22,8 +22,7 @@ limitations under the License. */ ...@@ -22,8 +22,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
...@@ -1137,9 +1136,9 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) { ...@@ -1137,9 +1136,9 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) {
} }
template <> template <>
std::ostream& print_tensor<paddle::platform::complex64>( std::ostream& print_tensor<paddle::platform::complex<float>>(
std::ostream& os, const framework::Tensor& tensor) { std::ostream& os, const framework::Tensor& tensor) {
auto inspect = tensor.data<paddle::platform::complex64>(); auto inspect = tensor.data<paddle::platform::complex<float>>();
auto element_num = tensor.numel(); auto element_num = tensor.numel();
os << " - data: ["; os << " - data: [";
...@@ -1155,9 +1154,9 @@ std::ostream& print_tensor<paddle::platform::complex64>( ...@@ -1155,9 +1154,9 @@ std::ostream& print_tensor<paddle::platform::complex64>(
} }
template <> template <>
std::ostream& print_tensor<paddle::platform::complex128>( std::ostream& print_tensor<paddle::platform::complex<double>>(
std::ostream& os, const framework::Tensor& tensor) { std::ostream& os, const framework::Tensor& tensor) {
auto inspect = tensor.data<paddle::platform::complex128>(); auto inspect = tensor.data<paddle::platform::complex<double>>();
auto element_num = tensor.numel(); auto element_num = tensor.numel();
os << " - data: ["; os << " - data: [";
......
...@@ -24,8 +24,7 @@ ...@@ -24,8 +24,7 @@
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -200,8 +199,8 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { ...@@ -200,8 +199,8 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
PADDLE_TENSOR_ADD(double); PADDLE_TENSOR_ADD(double);
// NOTE(chenweihang): only support complex grad tensor accumulated, // NOTE(chenweihang): only support complex grad tensor accumulated,
// support selected rows if needed in the future // support selected rows if needed in the future
PADDLE_TENSOR_ADD(platform::complex64); PADDLE_TENSOR_ADD(platform::complex<float>);
PADDLE_TENSOR_ADD(platform::complex128); PADDLE_TENSOR_ADD(platform::complex<double>);
#endif #endif
#undef PADDLE_TENSOR_ADD #undef PADDLE_TENSOR_ADD
......
...@@ -270,7 +270,46 @@ bool AnalysisPredictor::CreateExecutor() { ...@@ -270,7 +270,46 @@ bool AnalysisPredictor::CreateExecutor() {
executor_.reset(new paddle::framework::NaiveExecutor(place_)); executor_.reset(new paddle::framework::NaiveExecutor(place_));
return true; return true;
} }
static bool IsPrepareDataOptTargetOp(framework::OpDesc *op) {
// Bad case for the prepare-data optimization: suppose an op sits behind a
// conditional_block. If branch 1 is chosen, the op needs to call prepare
// data; if branch 2 is chosen, it does not. Once the predictor takes
// branch 2 at runtime, the optimization kicks in, and a later run that
// takes branch 1 fails because the op has lost its chance to prepare data.
std::vector<std::string> op_type = {"conditional_block_infer",
"select_input"};
for (const auto &type : op_type) {
if (op->Type() == type) {
return true;
}
}
return false;
}
static void DisablePrepareDataOpt(
std::shared_ptr<framework::ProgramDesc> inference_program, int block,
bool pre_disable_opt) {
bool disable_opt = false;
auto &infer_block = inference_program->Block(block);
for (auto *op : infer_block.AllOps()) {
if (disable_opt || pre_disable_opt) {
op->SetAttr("inference_force_prepare_data", true);
}
if (op->HasAttr("sub_block")) {
int blockID = op->GetBlockAttrId("sub_block");
DisablePrepareDataOpt(inference_program, blockID,
disable_opt || pre_disable_opt);
}
// disable prepare data if unfriendly op is found
disable_opt = IsPrepareDataOptTargetOp(op);
}
}
bool AnalysisPredictor::PrepareExecutor() { bool AnalysisPredictor::PrepareExecutor() {
DisablePrepareDataOpt(inference_program_, 0, false);
executor_->Prepare(sub_scope_, *inference_program_, 0, executor_->Prepare(sub_scope_, *inference_program_, 0,
config_.use_feed_fetch_ops_); config_.use_feed_fetch_ops_);
...@@ -1197,6 +1236,9 @@ USE_TRT_CONVERTER(roi_align); ...@@ -1197,6 +1236,9 @@ USE_TRT_CONVERTER(roi_align);
USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(affine_channel);
USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(multiclass_nms);
USE_TRT_CONVERTER(nearest_interp); USE_TRT_CONVERTER(nearest_interp);
USE_TRT_CONVERTER(reshape);
USE_TRT_CONVERTER(reduce_sum);
USE_TRT_CONVERTER(gather_nd);
#endif #endif
namespace paddle_infer { namespace paddle_infer {
......
...@@ -12,6 +12,9 @@ nv_library(tensorrt_converter ...@@ -12,6 +12,9 @@ nv_library(tensorrt_converter
affine_channel_op.cc affine_channel_op.cc
multiclass_nms_op.cc multiclass_nms_op.cc
nearest_interp_op.cc nearest_interp_op.cc
reshape_op.cc
reduce_op.cc
gather_nd_op.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS nv_test(test_op_converter SRCS test_op_converter.cc DEPS
......
...@@ -103,11 +103,18 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, ...@@ -103,11 +103,18 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
static_cast<void*>(bias_data), bias_size}; static_cast<void*>(bias_data), bias_size};
auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input, // In conv2d_transpose and depthwise_conv2d_transpose,
nv_ksize, weight, bias); // output channels = filter_dims[1] * groups
PADDLE_ENFORCE_NOT_NULL(layer, auto* layer = (op_desc.Type() == "conv2d_transpose" ||
platform::errors::Fatal("TensorRT create conv2d" op_desc.Type() == "depthwise_conv2d_transpose")
" layer error.")); ? fadd_layer(const_cast<nvinfer1::ITensor*>(X),
n_input * groups, nv_ksize, weight, bias)
: fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output,
nv_ksize, weight, bias);
PADDLE_ENFORCE_NOT_NULL(
layer, platform::errors::Fatal("TensorRT create conv2d/conv2d_transpose"
" layer failed."));
layer->setStride(nv_strides); layer->setStride(nv_strides);
layer->setPadding(nv_paddings); layer->setPadding(nv_paddings);
layer->setNbGroups(groups); layer->setNbGroups(groups);
...@@ -134,7 +141,6 @@ class Conv2dOpConverter : public OpConverter { ...@@ -134,7 +141,6 @@ class Conv2dOpConverter : public OpConverter {
ConvertConv2d( ConvertConv2d(
engine_, op, scope, test_mode, engine_, op, scope, test_mode,
[&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */ [&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */
int n_input, /* Conv input maps */
nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight, nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* { TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* {
auto* layer = auto* layer =
...@@ -156,7 +162,6 @@ class Deconv2dOpConverter : public OpConverter { ...@@ -156,7 +162,6 @@ class Deconv2dOpConverter : public OpConverter {
ConvertConv2d( ConvertConv2d(
engine_, op, scope, test_mode, engine_, op, scope, test_mode,
[&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */ [&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */
int n_input, /* Deconv output maps */
nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight, nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* { TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* {
auto* layer = auto* layer =
......
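To make the channel arithmetic above concrete: a transposed-convolution weight is laid out as [C_in, C_out/groups, kH, kW], so the true number of output maps is filter_dims[1] * groups (the n_input * groups passed to fadd_layer), not the n_output a forward conv would infer. A small worked check under assumed shapes:

// Assumed deconv weight shape [C_in, C_out/groups, kH, kW] = [64, 32, 3, 3]
// with groups = 2: the IDeconvolutionLayer must be created with
int filter_dims1 = 32, groups = 2;
int out_maps = filter_dims1 * groups;  // 32 * 2 = 64 output maps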
...@@ -40,10 +40,19 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { ...@@ -40,10 +40,19 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
auto word_emb_name = op_desc.Input("WordEmbedding").front(); auto word_emb_name = op_desc.Input("WordEmbedding").front();
auto pos_emb_name = op_desc.Input("PosEmbedding").front(); auto pos_emb_name = op_desc.Input("PosEmbedding").front();
auto sent_emb_name = op_desc.Input("SentEmbedding").front(); auto sent_emb_name = op_desc.Input("SentEmbedding").front();
std::vector<std::string> id_names = {word_id_name, pos_id_name,
sent_id_name}; std::vector<std::string> id_names;
std::vector<std::string> emb_names = {word_emb_name, pos_emb_name, std::vector<std::string> emb_names;
sent_emb_name};
if (engine_->use_oss()) {
id_names =
std::vector<std::string>{word_id_name, pos_id_name, sent_id_name};
emb_names =
std::vector<std::string>{word_emb_name, pos_emb_name, sent_emb_name};
} else {
id_names = op_desc.Input("Ids");
emb_names = op_desc.Input("Embs");
}
int input_num = id_names.size(); int input_num = id_names.size();
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class GatherNdOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a paddle gather_nd op to tensorrt gather_nd plugin";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
std::vector<nvinfer1::ITensor*> inputs;
auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
auto* index = engine_->GetITensor(op_desc.Input("Index")[0]);
inputs.emplace_back(input);
inputs.emplace_back(index);
nvinfer1::ILayer* layer = nullptr;
bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::GatherNdPluginDynamic* plugin =
new plugin::GatherNdPluginDynamic(with_fp16);
layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin);
std::string layer_name = "gather_nd (Output: ";
auto output_name = op_desc.Output("Out")[0];
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
layer_name += output_name;
if (test_mode) {
engine_->DeclareOutput(output_name);
}
layer->setName((layer_name + ")").c_str());
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(gather_nd, GatherNdOpConverter);
...@@ -127,6 +127,13 @@ class OpConverter { ...@@ -127,6 +127,13 @@ class OpConverter {
it, platform::errors::Unimplemented("no OpConverter for optype [%s]", it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
op_desc.Type())); op_desc.Type()));
} }
// reshape2 == reshape
if (op_desc.Type() == "reshape2") {
it = Registry<OpConverter>::Global().Lookup("reshape");
PADDLE_ENFORCE_NOT_NULL(
it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
op_desc.Type()));
}
if (!it) { if (!it) {
it = Registry<OpConverter>::Global().Lookup(op_desc.Type()); it = Registry<OpConverter>::Global().Lookup(op_desc.Type());
} }
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <NvInfer.h>
#include <sys/types.h>
#include <cstddef>
#include <cstdint>
#include <vector>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace framework {
class Scope;
namespace proto {
class OpDesc;
} // namespace proto
} // namespace framework
} // namespace paddle
namespace paddle {
namespace inference {
namespace tensorrt {
class ReduceSumOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a paddle reduce_sum op to tensorrt reduce layer";
framework::OpDesc op_desc(op, nullptr);
auto* x = engine_->GetITensor(op_desc.Input("X").front());
nvinfer1::Dims input_shape = x->getDimensions();
int input_dims = input_shape.nbDims;
bool keep_dim = BOOST_GET_CONST(bool, op_desc.GetAttr("keep_dim"));
std::vector<int32_t> dim =
BOOST_GET_CONST(std::vector<int32_t>, op_desc.GetAttr("dim"));
bool reduce_all = BOOST_GET_CONST(bool, op_desc.GetAttr("reduce_all"));
// Now we only support dynamic_shape mode.
nvinfer1::IReduceLayer* layer = nullptr;
if (reduce_all) {
uint32_t reduce_dim = 0;
for (int i = 0; i < input_dims; ++i) {
reduce_dim |= 1 << i;
}
layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x,
nvinfer1::ReduceOperation::kSUM, reduce_dim,
keep_dim);
} else {
auto CvtToBitMask = [&](const std::vector<int32_t>& dims) -> uint32_t {
uint32_t res = 0;
for (auto x : dims) {
if (x < 0) {
res |= 1 << (x + input_dims);
} else {
res |= 1 << x;
}
}
return res;
};
layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x,
nvinfer1::ReduceOperation::kSUM,
CvtToBitMask(dim), keep_dim);
}
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "reduce_sum", {output_name}, test_mode);
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(reduce_sum, ReduceSumOpConverter);
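A standalone illustration (not part of the patch) of the CvtToBitMask lambda above; the axis values and dim count are illustrative:

#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors the converter's CvtToBitMask: each reduced axis sets one bit of the
// TRT reduce mask; negative axes are wrapped by input_dims first.
uint32_t CvtToBitMask(const std::vector<int32_t>& dims, int input_dims) {
  uint32_t res = 0;
  for (auto x : dims) {
    res |= 1u << (x < 0 ? x + input_dims : x);
  }
  return res;
}

int main() {
  const int input_dims = 3;
  assert(CvtToBitMask({0, 2}, input_dims) == 5u);  // bits 0 and 2 -> 0b101
  assert(CvtToBitMask({-1}, input_dims) == 4u);    // -1 wraps to axis 2 -> 0b100
  return 0;
}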
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace framework {
class Scope;
namespace proto {
class OpDesc;
} // namespace proto
} // namespace framework
} // namespace paddle
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* ReshapeOp
*/
class ReshapeOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
std::vector<int> shape =
BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("shape"));
int nbDims_num = shape.size();
nvinfer1::Dims reshape_dim;
if (engine_->with_dynamic_shape()) { // running the TRT Dynamic Shape mode
reshape_dim.nbDims = nbDims_num;
for (int i = 0; i < nbDims_num; ++i) {
reshape_dim.d[i] = shape[i];
}
} else { // running the TRT Static Shape mode
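      // TRT static-shape engines use an implicit batch dimension, so the
      // batch entry shape[0] is dropped and only the remaining dims are set.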
reshape_dim.nbDims = nbDims_num - 1;
for (int i = 0; i < nbDims_num - 1; ++i) {
reshape_dim.d[i] = shape[i + 1];
}
}
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
layer->setReshapeDimensions(reshape_dim);
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "reshape", {output_name}, test_mode);
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(reshape, ReshapeOpConverter);
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_layout.h"
...@@ -49,6 +50,10 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -49,6 +50,10 @@ struct SimpleOpTypeSetTeller : public Teller {
#endif #endif
#if IS_TRT_VERSION_GE(7130) #if IS_TRT_VERSION_GE(7130)
teller_set.insert("group_norm"); teller_set.insert("group_norm");
#endif
#if CUDA_VERSION >= 10200
teller_set.insert("reshape");
teller_set.insert("reshape2");
#endif #endif
} }
...@@ -118,11 +123,13 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -118,11 +123,13 @@ struct SimpleOpTypeSetTeller : public Teller {
"flatten2", "flatten2",
"flatten", "flatten",
"gather", "gather",
"gather_nd",
"yolo_box", "yolo_box",
"roi_align", "roi_align",
"affine_channel", "affine_channel",
"nearest_interp", "nearest_interp",
"anchor_generator", "anchor_generator",
"reduce_sum",
}; };
}; };
...@@ -143,19 +150,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, ...@@ -143,19 +150,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
BOOST_GET_CONST(std::vector<int>, desc.GetAttr("paddings")); BOOST_GET_CONST(std::vector<int>, desc.GetAttr("paddings"));
if (paddings.size() > 2) return false; if (paddings.size() > 2) return false;
// strides > 1 is only supported by trt7.0 above
#if !IS_TRT_VERSION_GE(7000)
if (desc.HasAttr("strides")) {
const std::vector<int> strides =
BOOST_GET_CONST(std::vector<int>, desc.GetAttr("strides"));
// there is no issue if strides.size() less than 2
if (strides.size() > 1) {
for (size_t i = 0; i < strides.size(); i++) {
if (strides[i] > 1) return false;
}
}
}
#endif
} }
if (op_type == "pool2d") { if (op_type == "pool2d") {
...@@ -239,15 +233,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, ...@@ -239,15 +233,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
return false; return false;
} }
// strides > 1 is only supported by trt7.0 above // strides > 1 and 'SAME' is only supported by trt7.0 above
#if !IS_TRT_VERSION_GE(7000) #if !IS_TRT_VERSION_GE(7000)
if (desc.HasAttr("strides")) { if (op_type == "conv2d" || op_type == "conv2d_fusion" ||
const std::vector<int> strides = op_type == "depthwise_conv2d") {
BOOST_GET_CONST(std::vector<int>, desc.GetAttr("strides")); if (desc.HasAttr("padding_algorithm") && with_dynamic_shape) {
// there is no issue if strides.size() less than 2 auto padding_algorithm =
if (strides.size() > 1) { BOOST_GET_CONST(std::string, desc.GetAttr("padding_algorithm"));
for (size_t i = 0; i < strides.size(); i++) { if (padding_algorithm == "SAME" && desc.HasAttr("strides")) {
if (strides[i] > 1) return false; const std::vector<int> strides =
BOOST_GET_CONST(std::vector<int>, desc.GetAttr("strides"));
// there is no issue if strides.size() less than 2
if (strides.size() > 1) {
for (size_t i = 0; i < strides.size(); i++) {
if (strides[i] > 1) return false;
}
}
} }
} }
} }
...@@ -326,6 +327,30 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, ...@@ -326,6 +327,30 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false;
} }
if (op_type == "gather_nd") {
auto* block = desc.Block();
auto x_var_name = desc.Input("X")[0];
auto index_var_name = desc.Input("Index")[0];
auto* x_var_desc = block->FindVar(x_var_name);
auto* index_var_desc = block->FindVar(index_var_name);
// The index input must be int32 datatype.
if (index_var_desc->GetDataType() !=
paddle::framework::proto::VarType_Type::VarType_Type_INT32) {
VLOG(3) << "gather_nd op Index input data type must be int32";
return false;
}
const auto index_shape = index_var_desc->GetShape();
const auto x_shape = x_var_desc->GetShape();
if (x_shape.size() != index_shape.size()) {
VLOG(3) << "gather_nd op Index input dims size [" << index_shape.size()
<< " ] not equal to x dims size [" << x_shape.size() << "]";
return false;
}
if (!with_dynamic_shape) return false;
}
if (op_type == "yolo_box") { if (op_type == "yolo_box") {
if (with_dynamic_shape) return false; if (with_dynamic_shape) return false;
bool has_attrs = bool has_attrs =
...@@ -673,6 +698,33 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, ...@@ -673,6 +698,33 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
return false; return false;
} }
} }
if (op_type == "reshape" || op_type == "reshape2") {
if (!desc.HasAttr("shape")) {
return false;
// Paddle-TRT does not support the input tensors: Shape and ShapeTensor
} else if (desc.Input("Shape").size() >= 1 ||
desc.Input("ShapeTensor").size() >= 1) {
return false;
} else {
std::vector<int> shape =
BOOST_GET_CONST(std::vector<int>, desc.GetAttr("shape"));
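// nvinfer1::Dims::MAX_DIMS is 8, so longer shape attributes cannot be
// represented by a TRT Dims and the op is rejected.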
if (shape.size() >= nvinfer1::Dims::MAX_DIMS) return false;
}
}
if (op_type == "reduce_sum") {
if (!with_dynamic_shape) {
VLOG(3) << "the reduce_sum does not support static shape yet";
return false;
}
if (!(desc.HasAttr("keep_dim") && desc.HasAttr("dim") &&
desc.HasAttr("reduce_all"))) {
VLOG(3) << "the reduce_sum does not have attr (keep_dim or dim or "
"reduce_all)";
return false;
}
}
if ((*teller)(op_type, desc, use_no_calib_int8)) return true; if ((*teller)(op_type, desc, use_no_calib_int8)) return true;
} }
......
...@@ -8,6 +8,7 @@ nv_library(tensorrt_plugin ...@@ -8,6 +8,7 @@ nv_library(tensorrt_plugin
anchor_generator_op_plugin.cu anchor_generator_op_plugin.cu
yolo_box_op_plugin.cu yolo_box_op_plugin.cu
roi_align_op_plugin.cu roi_align_op_plugin.cu
gather_nd_op_plugin.cu
DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_fp16.h>
#include <algorithm>
#include <cstdint>
#include <functional>
#include <numeric>
#include <sstream>
#include "NvInferRuntimeCommon.h"
#include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
#if IS_TRT_VERSION_GE(6000)
template <typename T, typename IndexT = int>
__global__ void GatherNdCUDAKernel(const T* input, const int32_t* input_dims,
const IndexT* indices, T* output,
int32_t remain_size, int32_t slice_size,
int32_t end_size) {
CUDA_KERNEL_LOOP(i, remain_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
IndexT gather_i = 0;
int32_t temp = slice_size;
for (int32_t j = end_size - 1; j >= 0; --j) {
auto index_value = indices[indices_i * end_size + j];
PADDLE_ENFORCE(
index_value >= 0 && index_value < input_dims[j],
"The index is out of bounds, "
"please check whether the dimensions of index and "
"input meet the requirements. It should "
"be less than [%d] and greater or equal to 0, but received [%d]",
input_dims[j], index_value);
gather_i += (index_value * temp);
temp *= input_dims[j];
}
IndexT input_i = gather_i + slice_i;
*(output + i) = *(input + input_i);
}
}
int GatherNdPluginDynamic::initialize() { return 0; }
size_t GatherNdPluginDynamic::getSerializationSize() const {
return SerializedSize(with_fp16_);
}
void GatherNdPluginDynamic::serialize(void* buffer) const {
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs GatherNdPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) {
PADDLE_ENFORCE_EQ(
nb_inputs, 2,
platform::errors::InvalidArgument(
"The gather_nd plugin should have 2 input, but got %d.", nb_inputs));
PADDLE_ENFORCE_EQ(output_index, 0,
platform::errors::InvalidArgument(
"When GetOutputDimensions in gather_nd "
"plugin, the output_index should be 0."));
nvinfer1::DimsExprs x_dims = inputs[0];
nvinfer1::DimsExprs index_dims = inputs[1];
int32_t x_dims_size = x_dims.nbDims;
int32_t index_dims_size = index_dims.nbDims;
// TODO(wilber): The result dims should be Index.shape[:-1] +
// X.shape[Index.shape[-1]:], but the TRT DimsExprs is an expression, so we
// can't get the actual value. Thus we only support one scenario:
// input_dims.size == index_dims.size.
nvinfer1::DimsExprs ret(x_dims);
for (int i = 0; i < index_dims_size - 1; ++i) {
ret.d[i] = index_dims.d[i];
}
return ret;
}
bool GatherNdPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs,
int nb_outputs) {
PADDLE_ENFORCE_NOT_NULL(
in_out, platform::errors::InvalidArgument(
"The input of gather_nd plugin should not be nullptr."));
PADDLE_ENFORCE_LT(
pos, nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
"num(%d) of the input and the output.",
pos, nb_inputs + nb_outputs));
const nvinfer1::PluginTensorDesc& in = in_out[pos];
if (pos == 0) {
if (with_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
} else {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
} else if (pos == 1) {
return in.type == nvinfer1::DataType::kINT32 &&
in.format == nvinfer1::TensorFormat::kLINEAR;
} else if (pos == 2) {
return in.type == in_out[0].type &&
in.format == nvinfer1::TensorFormat::kLINEAR;
}
return true;
}
nvinfer1::DataType GatherNdPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType* input_types, int nb_inputs) const {
return input_types[0];
}
int GatherNdPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc* input_desc,
const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs,
void* const* outputs, void* workspace, cudaStream_t stream) {
auto input_dims = input_desc[0].dims;
auto index_dims = input_desc[1].dims;
auto input_dims_size = input_dims.nbDims;
auto index_dims_size = index_dims.nbDims;
std::vector<int32_t> input_shape, index_shape, out_shape;
for (int i = 0; i < input_dims.nbDims; i++)
input_shape.push_back(input_dims.d[i]);
for (int i = 0; i < index_dims.nbDims; i++)
index_shape.push_back(index_dims.d[i]);
// The out_shape is
// Index.shape[:-1] + X.shape[Index.shape[-1]:]
for (int i = 0; i < index_dims_size - 1; ++i) {
out_shape.emplace_back(index_shape[i]);
}
for (int i = index_shape[index_dims_size - 1]; i < input_dims_size; ++i) {
out_shape.emplace_back(input_shape[i]);
}
// final dim
int end_size = index_shape[index_dims_size - 1];
// remain dim
std::vector<int> remain_ddim(index_shape.begin(), index_shape.end() - 1);
int remain_numel = std::accumulate(remain_ddim.begin(), remain_ddim.end(), 1,
std::multiplies<int>());
// slice size
int slice_size = 1;
for (int i = end_size; i < input_dims_size; ++i) {
slice_size *= input_shape[i];
}
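  // Worked example (illustrative): input_shape = [2, 3, 4] with
  // index_shape = [2, 3, 2] gives end_size = 2, remain_numel = 2 * 3 = 6 and
  // slice_size = 4; out_shape = [2, 3, 4], i.e. each of the 6 index pairs
  // selects one length-4 slice of the input.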
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. gather_nd-->fp32";
const float* p_input = static_cast<const float*>(inputs[0]);
const int32_t* p_index = static_cast<const int32_t*>(inputs[1]);
float* p_output = static_cast<float*>(outputs[0]);
if (input_dims_data_ == nullptr) {
cudaMalloc(&input_dims_data_, input_shape.size() * sizeof(int));
}
cudaMemcpyAsync(input_dims_data_, input_shape.data(),
sizeof(int) * input_shape.size(), cudaMemcpyHostToDevice,
stream);
int block = 512;
int n = slice_size * remain_numel;
int grid = (n + block - 1) / block;
GatherNdCUDAKernel<float, int32_t><<<grid, block, 0, stream>>>(
p_input, input_dims_data_, p_index, p_output, remain_numel, slice_size,
end_size);
} else if (input_type == nvinfer1::DataType::kHALF) {
VLOG(1) << "TRT Plugin DataType selected. gather_nd-->fp16";
const half* p_input = static_cast<const half*>(inputs[0]);
const int32_t* p_index = static_cast<const int32_t*>(inputs[1]);
half* p_output = static_cast<half*>(outputs[0]);
if (input_dims_data_ == nullptr) {
cudaMalloc(&input_dims_data_, input_shape.size() * sizeof(int));
}
cudaMemcpyAsync(input_dims_data_, input_shape.data(),
sizeof(int) * input_shape.size(), cudaMemcpyHostToDevice,
stream);
int block = 512;
int n = slice_size * remain_numel;
int grid = (n + block - 1) / block;
GatherNdCUDAKernel<half, int32_t><<<grid, block, 0, stream>>>(
p_input, input_dims_data_, p_index, p_output, remain_numel, slice_size,
end_size);
}
return cudaGetLastError() != cudaSuccess;
}
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <thrust/device_vector.h>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
#if IS_TRT_VERSION_GE(6000)
class GatherNdPluginDynamic : public DynamicPluginTensorRT {
public:
explicit GatherNdPluginDynamic(bool with_fp16) { with_fp16_ = with_fp16; }
GatherNdPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new GatherNdPluginDynamic(with_fp16_);
}
const char* getPluginType() const override { return "gather_nd_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override;
size_t getSerializationSize() const override;
void serialize(void* buffer) const override;
nvinfer1::DimsExprs getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs, int nbOutputs) override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) override {}
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const override {
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) override;
nvinfer1::DataType getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const override;
void destroy() override {
if (input_dims_data_) {
cudaFree(input_dims_data_);
}
delete this;
}
private:
int32_t* input_dims_data_{nullptr};
};
class GatherNdPluginDynamicCreator : public nvinfer1::IPluginCreator {
public:
GatherNdPluginDynamicCreator() {}
const char* getPluginName() const override { return "gather_nd_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new GatherNdPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(GatherNdPluginDynamicCreator);
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
...@@ -23,7 +23,30 @@ function(inference_download INSTALL_DIR URL FILENAME) ...@@ -23,7 +23,30 @@ function(inference_download INSTALL_DIR URL FILENAME)
) )
endfunction() endfunction()
function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) function(inference_download_and_uncompress INSTALL_DIR URL FILENAME CHECK_SUM)
message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME})
string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME})
set(EXTERNAL_PROJECT_NAME "extern_download_${FILENAME_EX}")
set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}")
ExternalProject_Add(
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
URL ${URL}/${FILENAME}
URL_HASH MD5=${CHECK_SUM}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_EXTRACT 1
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR}
${CMAKE_COMMAND} -E tar xzf ${DOWNLOAD_NAME}
UPDATE_COMMAND ""
INSTALL_COMMAND ""
)
endfunction()
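A hypothetical call of the verified helper (the install dir and the MD5 digest below are placeholders, not values from this patch):

# Hypothetical usage; the digest is a placeholder, not a real checksum.
inference_download_and_uncompress(${SOME_INSTALL_DIR} ${INFERENCE_URL}
    "some_model.tar.gz" "0123456789abcdef0123456789abcdef")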
function(inference_download_and_uncompress_without_verify INSTALL_DIR URL FILENAME)
message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME}) string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME})
string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME}) string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME})
...@@ -47,13 +70,13 @@ endfunction() ...@@ -47,13 +70,13 @@ endfunction()
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz) if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz)
inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") inference_download_and_uncompress_without_verify(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
endif() endif()
set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
set(IMG_CLS_RESNET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/image_classification_resnet") set(IMG_CLS_RESNET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/image_classification_resnet")
if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model.tgz) if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model.tgz)
inference_download_and_uncompress(${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL} "image_classification_resnet.inference.model.tgz") inference_download_and_uncompress_without_verify(${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL} "image_classification_resnet.inference.model.tgz")
endif() endif()
set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model")
......
...@@ -115,9 +115,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun ...@@ -115,9 +115,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_cc_function) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function)
if (WITH_GPU OR WITH_ROCM) if (WITH_GPU OR WITH_ROCM)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor eigen_cu_function) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
endif() endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer)
...@@ -171,7 +171,7 @@ endif() ...@@ -171,7 +171,7 @@ endif()
if (WITH_ASCEND_CL) if (WITH_ASCEND_CL)
cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor)
cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_cc_function scope device_context enforce executor compare_op) cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_function scope device_context enforce executor compare_op)
endif() endif()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
......
...@@ -14,8 +14,6 @@ ...@@ -14,8 +14,6 @@
#include "paddle/fluid/operators/abs_op.h" #include "paddle/fluid/operators/abs_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
......
...@@ -789,6 +789,27 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { ...@@ -789,6 +789,27 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
} }
}; };
template <typename T>
class SigmoidDoubleGradMaker
: public ::paddle::framework::SingleGradOpMaker<T> {
public:
using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("sigmoid_grad_grad");
// input1: Out
op->SetInput("Out", this->Input("Out"));
// input2: ddx
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
op->SetInput("DOut", this->Input(framework::GradVarName("Out")));
op->SetAttrMap(this->Attrs());
// output: ddy
op->SetOutput("DOutNew", this->InputGrad("Out"));
op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
}
};
template <typename T> template <typename T>
class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> { class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
public: public:
...@@ -1068,6 +1089,47 @@ namespace plat = paddle::platform; ...@@ -1068,6 +1089,47 @@ namespace plat = paddle::platform;
FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL);
/* ========================== sigmoid register ============================= */
// 1. Register Sigmoid Operator
REGISTER_OPERATOR(
sigmoid, ops::ActivationOp, ops::SigmoidOpMaker,
ops::ActivationOpInferVarType,
ops::ActivationGradOpMaker<ops::SigmoidGradFunctor<float>::FwdDeps(),
paddle::framework::OpDesc>,
ops::ActivationGradOpMaker<ops::SigmoidGradFunctor<float>::FwdDeps(),
paddle::imperative::OpBase>,
std::conditional<ops::CanInplaceAct<ops::SigmoidGradFunctor<float>>(),
ops::ActFwdInplaceInferer, void>::type);
// 2. Register Sigmoid Grad Operator
REGISTER_OPERATOR(sigmoid_grad, ops::ActivationOpGrad,
ops::ActivationGradOpInplaceInferer,
ops::SigmoidDoubleGradMaker<paddle::framework::OpDesc>,
ops::SigmoidDoubleGradMaker<paddle::imperative::OpBase>);
// 3. Register Sigmoid DoubleGrad Operator
REGISTER_OPERATOR(
sigmoid_grad_grad,
ops::ActivationOpDoubleGrad<ops::SigmoidGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
// Register Sigmoid/GradSigmoid Kernels
REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
SigmoidGradFunctor);
// Register DoubleGrad Kernel
REGISTER_OP_CPU_KERNEL(
sigmoid_grad_grad,
ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
ops::SigmoidGradGradFunctor<float>>,
ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
ops::SigmoidGradGradFunctor<double>>,
ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
ops::SigmoidGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================== tanh register ============================= */ /* ========================== tanh register ============================= */
REGISTER_OPERATOR( REGISTER_OPERATOR(
tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType, tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType,
......
...@@ -1481,6 +1481,21 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -1481,6 +1481,21 @@ REGISTER_OP_CUDA_KERNEL(
#endif #endif
/* ========================================================================== */ /* ========================================================================== */
/* =========================== sigmoid register ============================ */
REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
CudaSigmoidGradFunctor);
REGISTER_OP_CUDA_KERNEL(
sigmoid_grad_grad,
ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidGradGradFunctor<float>>,
ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidGradGradFunctor<double>>,
ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
ops::SigmoidGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* =========================== tanh register ============================ */ /* =========================== tanh register ============================ */
REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor, REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor,
CudaTanhGradFunctor); CudaTanhGradFunctor);
...@@ -1595,7 +1610,6 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -1595,7 +1610,6 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================================================================== */ /* ========================================================================== */
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \
__macro(sigmoid, Sigmoid, CudaSigmoidFunctor, CudaSigmoidGradFunctor); \
__macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \
__macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \
CudaLogSigmoidGradFunctor); \ CudaLogSigmoidGradFunctor); \
......
...@@ -258,6 +258,43 @@ struct SigmoidGradFunctor : public BaseActivationFunctor<T> { ...@@ -258,6 +258,43 @@ struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
}; };
/*
    Out
    DOut -> SigmoidGradGrad -> DOutNew
    DDX                        DDOut

    DDOut = (1-Out)*Out*DDX
    DOutNew = (1-2*Out)*DOut*DDX
*/
template <typename T>
struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* Out,
const framework::Tensor* ddX, const framework::Tensor* dOut,
framework::Tensor* dOutNew, framework::Tensor* ddOut) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
auto out = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
if (dOutNew) {
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
auto dout_new = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
dout_new.device(*d) =
(static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
};
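A self-contained finite-difference check (a sketch, not part of the patch) of the two double-grad identities stated in the comment above; all names and constants are illustrative:

#include <cassert>
#include <cmath>
#include <cstdio>

double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

// First-order sigmoid grad as a function of x and dout: dx = dout * y * (1 - y).
double grad(double x, double dout) {
  double y = sigmoid(x);
  return dout * y * (1.0 - y);
}

int main() {
  const double x = 0.3, dout = 0.7, ddx = 1.0, eps = 1e-6;
  const double y = sigmoid(x);
  // DDOut = (1 - Out) * Out * DDX: dx is linear in dout with slope y(1-y).
  double ddout_analytic = (1.0 - y) * y * ddx;
  double ddout_numeric =
      (grad(x, dout + eps) - grad(x, dout - eps)) / (2.0 * eps) * ddx;
  assert(std::fabs(ddout_analytic - ddout_numeric) < 1e-6);
  // DOutNew = (1 - 2*Out) * DOut * DDX: d(dx)/dx = dout*y(1-y)(1-2y), and
  // dividing by dy/dx = y(1-y) gives the sensitivity w.r.t. Out itself.
  double doutnew_analytic = (1.0 - 2.0 * y) * dout * ddx;
  double doutnew_numeric = (grad(x + eps, dout) - grad(x - eps, dout)) /
                           (2.0 * eps) / (y * (1.0 - y)) * ddx;
  assert(std::fabs(doutnew_analytic - doutnew_numeric) < 1e-5);
  std::printf("sigmoid double-grad identities verified\n");
  return 0;
}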
// silu(x) = x / (1 + exp(-x)) // silu(x) = x / (1 + exp(-x))
template <typename T> template <typename T>
struct SiluFunctor : public BaseActivationFunctor<T> { struct SiluFunctor : public BaseActivationFunctor<T> {
...@@ -1789,6 +1826,50 @@ inline void ExtractDoubleGradTensorWithInputDOut( ...@@ -1789,6 +1826,50 @@ inline void ExtractDoubleGradTensorWithInputDOut(
} }
} }
template <typename DeviceContext, typename Functor>
class SigmoidDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& ctx) const override {
const framework::Tensor *Out, *ddX, *dOut;
framework::Tensor *dOutNew, *ddOut;
Out = ddX = dOut = nullptr;
dOutNew = ddOut = nullptr;
// extract ddX (input) and Out (input)
ddX = ctx.Input<framework::Tensor>("DDX");
Out = ctx.Input<framework::Tensor>("Out");
PADDLE_ENFORCE_NOT_NULL(
ddX, platform::errors::NotFound(
"Cannot get input Variable ddX, variable name = %s",
ctx.InputName("DDX")));
PADDLE_ENFORCE_NOT_NULL(
Out, platform::errors::NotFound(
"Cannot get input Variable Out, variable name = %s",
ctx.InputName("Out")));
// set output ddout
ddOut = ctx.Output<framework::Tensor>("DDOut");
// extract dOut (input)
dOut = ctx.Input<framework::Tensor>("DOut");
PADDLE_ENFORCE_NOT_NULL(
dOut, platform::errors::NotFound(
"Cannot get input Variable dOut, variable name = %s",
ctx.InputName("DOut")));
// set output dout_new
dOutNew = ctx.Output<framework::Tensor>("DOutNew");
if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
auto& place = ctx.template device_context<DeviceContext>();
Functor functor;
functor(place, Out, ddX, dOut, dOutNew, ddOut);
}
};
template <typename DeviceContext, typename Functor> template <typename DeviceContext, typename Functor>
class TanhDoubleGradKernel class TanhDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> { : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
...@@ -2153,7 +2234,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> { ...@@ -2153,7 +2234,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
} // namespace paddle } // namespace paddle
#define FOR_EACH_ACTIVATION_OP(__macro) \ #define FOR_EACH_ACTIVATION_OP(__macro) \
__macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \
__macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \
__macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \
......
...@@ -35,10 +35,10 @@ class PowNPUKernel : public framework::OpKernel<T> { ...@@ -35,10 +35,10 @@ class PowNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Power", {*x}, {*out}, const auto& runner = NpuOpRunner("Power", {*x}, {*out},
{{"power", factor}, {{"power", factor},
{"scale", static_cast<float>(1.0)}, {"scale", static_cast<float>(1.0)},
{"shift", static_cast<float>(0.0)}}); {"shift", static_cast<float>(0.0)}});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
...@@ -68,8 +68,8 @@ class PowGradNPUKernel : public framework::OpKernel<T> { ...@@ -68,8 +68,8 @@ class PowGradNPUKernel : public framework::OpKernel<T> {
// Step1: Compute x_pow = x.pow(factor-1) // Step1: Compute x_pow = x.pow(factor-1)
Tensor x_pow(x->type()); Tensor x_pow(x->type());
x_pow.mutable_data<T>(x->dims(), place); x_pow.mutable_data<T>(x->dims(), place);
auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow}, const auto& runner_pow = NpuOpRunner(
{{"power", factor - static_cast<float>(1)}}); "Power", {*x}, {x_pow}, {{"power", factor - static_cast<float>(1)}});
runner_pow.Run(stream); runner_pow.Run(stream);
// Step 2: Construct a broadcast factor, which has the same shape with x. // Step 2: Construct a broadcast factor, which has the same shape with x.
...@@ -83,20 +83,21 @@ class PowGradNPUKernel : public framework::OpKernel<T> { ...@@ -83,20 +83,21 @@ class PowGradNPUKernel : public framework::OpKernel<T> {
// factor. // factor.
Tensor factor_bc_tensor(framework::proto::VarType::FP32); Tensor factor_bc_tensor(framework::proto::VarType::FP32);
factor_bc_tensor.mutable_data<float>(x_dims, place); factor_bc_tensor.mutable_data<float>(x_dims, place);
auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, const auto& runner_bc =
{{"dims", framework::vectorize(x_dims)}}); NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor},
{{"dims", framework::vectorize(x_dims)}});
runner_bc.Run(stream); runner_bc.Run(stream);
// Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1)
Tensor x_power_mul_factor(x->type()); Tensor x_power_mul_factor(x->type());
x_power_mul_factor.mutable_data<T>(x->dims(), place); x_power_mul_factor.mutable_data<T>(x->dims(), place);
auto runner_mul_1 = const auto& runner_mul_1 =
NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {});
runner_mul_1.Run(stream); runner_mul_1.Run(stream);
// Step 4: Compute dx = dout * factor * x.pow(factor-1) // Step 4: Compute dx = dout * factor * x.pow(factor-1)
dx->mutable_data<T>(place); dx->mutable_data<T>(place);
auto runner_mul_2 = const auto& runner_mul_2 =
NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {});
runner_mul_2.Run(stream); runner_mul_2.Run(stream);
} }
...@@ -111,11 +112,11 @@ class ReluNPUKernel : public framework::OpKernel<T> { ...@@ -111,11 +112,11 @@ class ReluNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Relu", const auto& runner = NpuOpRunner("Relu",
{ {
*x, *x,
}, },
{*out}, {}); {*out}, {});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
...@@ -137,7 +138,7 @@ class ReluGradNPUKernel : public framework::OpKernel<T> { ...@@ -137,7 +138,7 @@ class ReluGradNPUKernel : public framework::OpKernel<T> {
.stream(); .stream();
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); const auto& runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {});
runner.Run(stream); runner.Run(stream);
} }
...@@ -159,7 +160,7 @@ class SqrtNPUKernel : public framework::OpKernel<T> { ...@@ -159,7 +160,7 @@ class SqrtNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); const auto& runner = NpuOpRunner("Sqrt", {*x}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
...@@ -181,8 +182,8 @@ class SqrtGradNPUKernel : public framework::OpKernel<T> { ...@@ -181,8 +182,8 @@ class SqrtGradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); const auto& runner_dx = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {});
dx_runner.Run(stream); runner_dx.Run(stream);
} }
}; };
...@@ -204,16 +205,16 @@ class LogNPUKernel : public framework::OpKernel<T> { ...@@ -204,16 +205,16 @@ class LogNPUKernel : public framework::OpKernel<T> {
Tensor one(x->type()); Tensor one(x->type());
one.mutable_data<T>(x->dims(), place); one.mutable_data<T>(x->dims(), place);
auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {}); const auto& runner_one = NpuOpRunner("OnesLike", {*x}, {one}, {});
one_runner.Run(stream); runner_one.Run(stream);
Tensor sub(x->type()); Tensor sub(x->type());
sub.mutable_data<T>(x->dims(), place); sub.mutable_data<T>(x->dims(), place);
auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {}); const auto& runner_sub = NpuOpRunner("Sub", {*x, one}, {sub}, {});
sub_runner.Run(stream); runner_sub.Run(stream);
auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {}); const auto& runner_out = NpuOpRunner("Log1p", {sub}, {*out}, {});
out_runner.Run(stream); runner_out.Run(stream);
} }
}; };
...@@ -233,7 +234,7 @@ class LogGradNPUKernel : public framework::OpKernel<T> { ...@@ -233,7 +234,7 @@ class LogGradNPUKernel : public framework::OpKernel<T> {
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); const auto& runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
...@@ -254,7 +255,7 @@ class TanhNPUKernel : public framework::OpKernel<T> { ...@@ -254,7 +255,7 @@ class TanhNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); const auto& runner = NpuOpRunner("Tanh", {*x}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
...@@ -276,8 +277,8 @@ class TanhGradNPUKernel : public framework::OpKernel<T> { ...@@ -276,8 +277,8 @@ class TanhGradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); const auto& runner_dx = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {});
dx_runner.Run(stream); runner_dx.Run(stream);
} }
}; };
...@@ -297,7 +298,7 @@ class SquareNPUKernel : public framework::OpKernel<T> { ...@@ -297,7 +298,7 @@ class SquareNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Square", {*x}, {*out}, {}); const auto& runner = NpuOpRunner("Square", {*x}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
......
...@@ -29,7 +29,8 @@ class AllocFloatStatusKernel : public framework::OpKernel<T> { ...@@ -29,7 +29,8 @@ class AllocFloatStatusKernel : public framework::OpKernel<T> {
auto* float_status = ctx.Output<framework::Tensor>("FloatStatus"); auto* float_status = ctx.Output<framework::Tensor>("FloatStatus");
float_status->mutable_data<T>(ctx.GetPlace()); float_status->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); const auto& runner =
NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
......
...@@ -42,13 +42,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> { ...@@ -42,13 +42,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
found_inf->mutable_data<bool>(ctx.GetPlace()); found_inf->mutable_data<bool>(ctx.GetPlace());
bool found_inf_data = false;
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
// step1: inverse scale(RealDiv) // step1: inverse scale
Tensor const_tensor; Tensor const_tensor;
const_tensor.mutable_data<T>({1}, ctx.GetPlace()); const_tensor.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&const_tensor, static_cast<T>(1.0)); FillNpuTensorWithConstant<T>(&const_tensor, static_cast<T>(1.0));
...@@ -58,7 +56,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> { ...@@ -58,7 +56,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
Tensor inverse_out(scale->type()); Tensor inverse_out(scale->type());
inverse_out.Resize(scale->dims()); inverse_out.Resize(scale->dims());
inverse_out.mutable_data<T>(ctx.GetPlace()); inverse_out.mutable_data<T>(ctx.GetPlace());
auto runner_inverse = const auto& runner_inverse =
NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {});
runner_inverse.Run(stream); runner_inverse.Run(stream);
tmp_inverse_out = &inverse_out; tmp_inverse_out = &inverse_out;
...@@ -66,55 +64,41 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> { ...@@ -66,55 +64,41 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
// NOTE(zhiqiu): // NOTE(zhiqiu):
Tensor tmp; Tensor tmp;
tmp.mutable_data<float>({8}, ctx.GetPlace()); tmp.mutable_data<float>({8}, ctx.GetPlace());
// NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place.
// tmp is only placeholder. // tmp is only placeholder.
auto runner_float_status = const auto& runner_float_status =
NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}, NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp},
{{"message", std::string("check_nan_and_inf")}}); {{"message", std::string("check_nan_and_inf")}});
runner_float_status.Run(stream); runner_float_status.Run(stream);
Tensor sum; Tensor sum;
sum.mutable_data<float>({1}, ctx.GetPlace()); sum.mutable_data<float>({1}, ctx.GetPlace());
auto runner_reduce_sum = const auto& runner_reduce_sum =
NpuOpRunner("ReduceSumD", {*float_status}, {sum}, NpuOpRunner("ReduceSumD", {*float_status}, {sum},
{{"axes", std::vector<int>{0}}, {"keep_dims", true}}); {{"axes", std::vector<int>{0}}, {"keep_dims", true}});
runner_reduce_sum.Run(stream); runner_reduce_sum.Run(stream);
std::vector<float> sum_vec; const auto& runner_greater =
TensorToVector( NpuOpRunner("GreaterEqual", {sum, const_tensor}, {*found_inf}, {});
sum, ctx.template device_context<paddle::platform::NPUDeviceContext>(), runner_greater.Run(stream);
&sum_vec);
found_inf_data = (sum_vec[0] > 1); // NOTE(zhiqiu): The normal logic is :
// out = in, if found_inf = true
VLOG(4) << "found_inf_data:" << found_inf_data; // out = in/scale, if found_inf = false
// However, on NPU, in order to avoid stream sync, we do not copy the
// found_inf data to cpu to check whether to unscale or not.
// Instead, we do the Mul no matter found_inf or not.
// And, a fact is, only few steps contains nan/inf during training.
for (size_t i = 0; i < xs.size(); ++i) { for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i]; const auto* x = xs[i];
auto* out = outs[i]; auto* out = outs[i];
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
if (!found_inf_data) { const auto& runner_mul =
// MatMul NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
auto runner_matmul = runner_mul.Run(stream);
NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
runner_matmul.Run(stream);
}
} }
// set found_inf to true const auto& runner_clear_status =
VLOG(4) << "found overflow:" << found_inf_data;
Tensor found_inf_tensor;
found_inf_tensor.Resize({1});
bool* is_found_inf =
found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
*is_found_inf = found_inf_data;
framework::TensorCopy(
found_inf_tensor, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), found_inf);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
auto runner_clear_status =
NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp});
runner_clear_status.Run(stream); runner_clear_status.Run(stream);
} }
......
...@@ -43,18 +43,18 @@ void Update(const platform::NPUDeviceContext& ctx, ...@@ -43,18 +43,18 @@ void Update(const platform::NPUDeviceContext& ctx,
Tensor factor_tensor(bad_out_tensor->type()); Tensor factor_tensor(bad_out_tensor->type());
factor_tensor.mutable_data<int>({1}, place); factor_tensor.mutable_data<int>({1}, place);
FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1)); FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, const auto& runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor},
{*bad_out_tensor}, {}); {*bad_out_tensor}, {});
runner_p2.Run(stream); runner_p2.Run(stream);
std::vector<int> bad_out_data; std::vector<int> bad_out_data;
TensorToVector(*bad_out_tensor, ctx, &bad_out_data); TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
if (bad_out_data[0] == decr_every_n_nan_or_inf) { if (bad_out_data[0] == decr_every_n_nan_or_inf) {
auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor}, {*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)}, {{"power", static_cast<float>(1)},
{"scale", decr_ratio}, {"scale", decr_ratio},
{"shift", static_cast<float>(0)}}); {"shift", static_cast<float>(0)}});
runner_p3.Run(stream); runner_p3.Run(stream);
...@@ -62,11 +62,11 @@ void Update(const platform::NPUDeviceContext& ctx, ...@@ -62,11 +62,11 @@ void Update(const platform::NPUDeviceContext& ctx,
TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
if (new_loss_scaling[0] < static_cast<T>(1)) { if (new_loss_scaling[0] < static_cast<T>(1)) {
// updated_loss_scaling_data = 1 // updated_loss_scaling_data = 1
auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor}, {*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)}, {{"power", static_cast<float>(1)},
{"scale", static_cast<float>(0)}, {"scale", static_cast<float>(0)},
{"shift", static_cast<float>(1)}}); {"shift", static_cast<float>(1)}});
runner_p4.Run(stream); runner_p4.Run(stream);
} }
@@ -86,30 +86,30 @@ void Update(const platform::NPUDeviceContext& ctx,
   Tensor factor_tensor(good_out_tensor->type());
   factor_tensor.mutable_data<int>({1}, place);
   FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
-  auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor},
-                               {*good_out_tensor}, {});
+  const auto& runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor},
+                                      {*good_out_tensor}, {});
   runner_p2.Run(stream);
   std::vector<int> good_out_data;
   TensorToVector(*good_out_tensor, ctx, &good_out_data);
   if (good_out_data[0] == incr_every_n_steps) {
-    auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
-                                 {*updated_loss_scaling_tensor},
-                                 {{"power", static_cast<float>(1)},
-                                  {"scale", incr_ratio},
-                                  {"shift", static_cast<float>(0)}});
+    const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
+                                        {*updated_loss_scaling_tensor},
+                                        {{"power", static_cast<float>(1)},
+                                         {"scale", incr_ratio},
+                                         {"shift", static_cast<float>(0)}});
     runner_p3.Run(stream);
     std::vector<T> new_loss_scaling;
     TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
     if (!std::isfinite(new_loss_scaling[0])) {
       // updated_loss_scaling_data = pre_loss_scaling_data
-      auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
-                                   {*updated_loss_scaling_tensor},
-                                   {{"power", static_cast<float>(1)},
-                                    {"scale", static_cast<float>(1)},
-                                    {"shift", static_cast<float>(0)}});
+      const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
+                                          {*updated_loss_scaling_tensor},
+                                          {{"power", static_cast<float>(1)},
+                                           {"scale", static_cast<float>(1)},
+                                           {"shift", static_cast<float>(0)}});
       runner_p4.Run(stream);
     }
@@ -165,7 +165,7 @@ class LazyZerosNPU {
       }
       zero_tensor->mutable_data<T>(place);
-      auto runner_zeros =
+      const auto& runner_zeros =
           NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor});
       runner_zeros.Run(stream);
       zero_tensor->check_memory_size();
......
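For context on the hunks above: Update() implements dynamic loss scaling for NPU mixed-precision training, expressed as NpuOpRunner calls (Add counts steps, Power rescales). The control flow is easier to see without the runner plumbing. The plain C++ sketch below is an illustrative reconstruction written for this note, not code from the commit (the LossScaleState type and counter-reset details are assumptions): after incr_every_n_steps consecutive finite steps the scale is multiplied by incr_ratio (kept only if still finite, per runner_p4), and after decr_every_n_nan_or_inf consecutive bad steps it is multiplied by decr_ratio and clamped to at least 1.

#include <cmath>

// Hypothetical host-side mirror of the NPU Update() logic shown above.
struct LossScaleState {
  float scale = 32768.0f;
  int good_steps = 0;
  int bad_steps = 0;
};

void UpdateLossScale(bool found_inf, float incr_ratio, float decr_ratio,
                     int incr_every_n_steps, int decr_every_n_nan_or_inf,
                     LossScaleState* s) {
  if (found_inf) {
    s->good_steps = 0;
    if (++s->bad_steps == decr_every_n_nan_or_inf) {
      s->scale *= decr_ratio;                // runner_p3: Power, scale=decr_ratio
      if (s->scale < 1.0f) s->scale = 1.0f;  // runner_p4: reset to 1
      s->bad_steps = 0;
    }
  } else {
    s->bad_steps = 0;
    if (++s->good_steps == incr_every_n_steps) {
      float next = s->scale * incr_ratio;         // runner_p3: Power, scale=incr_ratio
      if (std::isfinite(next)) s->scale = next;   // runner_p4: otherwise keep old scale
      s->good_steps = 0;
    }
  }
}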
@@ -43,7 +43,7 @@ class AssignNPUKernel : public framework::OpKernel<T> {
    auto* out = ctx.Output<framework::LoDTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());
-   auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {});
+   const auto& runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {});
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
......
cc_test(op_tester SRCS op_tester.cc op_tester_config.cc
        DEPS memory timer framework_proto proto_desc lod_tensor op_registry
-       device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
+       device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} eigen_function)
@@ -78,8 +78,8 @@ class CastNPUKernel : public framework::OpKernel<T> {
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
-   auto runner = NpuOpRunner("Cast", {*x}, {*out},
-                             {{"dst_type", static_cast<int32_t>(aclDtype)}});
+   const auto& runner = NpuOpRunner(
+       "Cast", {*x}, {*out}, {{"dst_type", static_cast<int32_t>(aclDtype)}});
    runner.Run(stream);
  }
};
......
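A change repeated across all the NPU kernel hunks above is replacing `auto runner = NpuOpRunner(...)` with `const auto& runner = NpuOpRunner(...)`. Binding the returned temporary to a const reference extends its lifetime to the end of the enclosing scope and guarantees no second runner object is created, instead of relying on copy elision. A minimal standalone illustration (Widget is a stand-in type invented for this sketch, not part of Paddle):

#include <iostream>

// Stand-in for a class like NpuOpRunner that is built and then Run().
struct Widget {
  Widget() { std::cout << "construct\n"; }
  Widget(const Widget&) { std::cout << "copy\n"; }
  ~Widget() { std::cout << "destruct\n"; }
  void Run() const { std::cout << "run\n"; }
};

Widget MakeWidget() { return Widget(); }

int main() {
  // Initializes a new named object from the temporary; whether a copy
  // happens depends on copy elision (guaranteed only since C++17).
  auto w1 = MakeWidget();
  w1.Run();

  // Binds the temporary to a const reference; its lifetime is extended to
  // the end of the scope and no second object can ever be created. This is
  // the form the diff standardizes on.
  const auto& w2 = MakeWidget();
  w2.Run();
  return 0;
}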
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_embedding_op.h"
namespace paddle {
namespace operators {
class CEmbeddingOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CEmbeddingOp");
OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "CEmbeddingOp");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CEmbeddingOp");
auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids");
int ids_rank = ids_dims.size();
VLOG(5) << "ids rank is " << ids_rank << std::endl;
    PADDLE_ENFORCE_EQ(
        table_dims.size(), 2,
        platform::errors::InvalidArgument(
            "ShapeError: The input W of 'c_embedding' must be a 2-D tensor. "
            "But received W's dimensions = %d, "
            "W's shape = [%s].",
            table_dims.size(), table_dims));
auto output_dims = framework::vectorize(ids_dims);
output_dims.push_back(table_dims[1]);
ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
if (ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("Ids", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "W");
return framework::OpKernelType(data_type, ctx.device_context());
}
};
class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("W",
"(Tensor) The input represents embedding tensors, "
"which is a learnable parameter.");
AddInput("Ids",
"An input with type int64 "
"contains the ids to be looked up in W.");
AddOutput("Out", "The lookup results, which have the same type as W.");
    AddAttr<int64_t>("start_index",
                     "(int64, default 0) The starting index of this rank's "
                     "slice of the embedding table; ids that fall outside "
                     "the slice produce zero rows in the output.")
        .SetDefault(0);
AddComment(R"DOC(
c_embedding Operator.
This operator is used to perform lookups on the parameter W,
then concatenated into a dense tensor.
The input Ids can carry the LoD (Level of Details) information,
or not. And the output only shares the LoD information with input Ids.
)DOC");
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(CEmbeddingGradOpNoBufferVarsInferer, "W");
template <typename T>
class CEmbeddingGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("c_embedding_grad");
op->SetInput("W", this->Input("W"));
op->SetInput("Ids", this->Input("Ids"));
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("W"), this->InputGrad("W"));
op->SetAttrMap(this->Attrs());
}
};
class CEmbeddingOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto table_dims = ctx->GetInputDim("W");
ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
class CEmbeddingOpGradVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext* ctx) const override {
auto out_var_name = framework::GradVarName("W");
VLOG(3) << "c_embedding_grad op " << framework::GradVarName("W")
<< " is set to LoDTensor";
ctx->SetOutputType(out_var_name, framework::proto::VarType::LOD_TENSOR);
ctx->SetOutputDataType(out_var_name, ctx->GetInputDataType("W"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(c_embedding, ops::CEmbeddingOp, ops::CEmbeddingOpMaker,
ops::CEmbeddingGradOpMaker<paddle::framework::OpDesc>,
ops::CEmbeddingGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(c_embedding_grad, ops::CEmbeddingOpGrad,
ops::CEmbeddingGradOpNoBufferVarsInferer,
ops::CEmbeddingOpGradVarTypeInference);
REGISTER_OP_CPU_KERNEL(c_embedding, ops::CEmbeddingOpCPUKernel<float>,
ops::CEmbeddingOpCPUKernel<double>);
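The new c_embedding op is a building block for sharded (model-parallel) embeddings: each rank holds an (N x D) slice of the full table starting at start_index, looks up only the ids that fall in [start_index, start_index + N), and emits zero rows for the rest, so that combining the per-rank outputs (typically with an allreduce) yields the full lookup. The scalar C++ sketch below is a reference for the forward semantics, written for this description rather than taken from the commit:

#include <cstdint>
#include <vector>

// Illustrative scalar version of the CEmbedding forward pass:
// table is the local (N x D) shard, ids has K entries, out is (K x D).
void CEmbeddingRef(const std::vector<float>& table, int64_t N, int64_t D,
                   const std::vector<int64_t>& ids, int64_t start_index,
                   std::vector<float>* out) {
  const int64_t K = static_cast<int64_t>(ids.size());
  out->assign(K * D, 0.0f);  // ids outside this shard stay zero
  for (int64_t k = 0; k < K; ++k) {
    const int64_t local = ids[k] - start_index;
    if (local < 0 || local >= N) continue;  // owned by another rank
    for (int64_t d = 0; d < D; ++d) {
      (*out)[k * D + d] = table[local * D + d];
    }
  }
}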
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/collective/c_embedding_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;

// Ceiling division of N elements over 512-thread blocks, capped at 4096
// blocks; larger inputs are covered by the grid-stride CUDA_KERNEL_LOOP.
static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaxinumNumBlocks);
}
template <typename T, typename IndexT>
__global__ void CEmbedding(T *out, const T *table, const IndexT *ids,
const int rows, const int columns, const int64_t N,
const int64_t start_idx, const int64_t end_idx,
const int64_t limit) {
CUDA_KERNEL_LOOP(i, limit) {
size_t row = i / columns;
size_t col = i % columns;
auto id = ids[row];
if (id >= start_idx && id < end_idx) {
auto real_idx = id - start_idx;
PADDLE_ENFORCE(real_idx < N,
"The index is out of bounds, "
"please check whether the dimensions of index and "
"input meet the requirements. It should "
"be less than [%d], but received [%d]",
N, real_idx);
out[i] = table[real_idx * columns + col];
} else {
out[i] = static_cast<T>(0);
}
}
}
template <typename T, typename IndexT>
__global__ void CEmbeddingGrad(T *table, const T *output, const IndexT *ids,
const int rows, const int columns,
const int64_t N, const int64_t start_idx,
const int64_t end_idx, const int64_t limit) {
CUDA_KERNEL_LOOP(i, limit) {
size_t row = i / columns;
size_t col = i % columns;
auto id = ids[row];
if (id >= start_idx && id < end_idx) {
auto real_idx = id - start_idx;
paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col],
output[i]);
}
}
}
template <typename T>
class CEmbeddingCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *table_t = context.Input<LoDTensor>("W");
auto *ids_t = context.Input<LoDTensor>("Ids");
auto *output_t = context.Output<LoDTensor>("Out");
const auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
const int64_t start_idx = context.Attr<int64_t>("start_index");
size_t N = table_t->dims()[0];
size_t D = table_t->dims()[1];
size_t K = ids_t->numel();
const int64_t end_idx = start_idx + N;
auto *table = table_t->data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace());
auto limit = K * D;
int blocks = NumBlocks(limit);
int threads = kNumCUDAThreads;
const auto &index_type = ids_t->type();
if (index_type == framework::proto::VarType::INT32) {
CEmbedding<T, int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
output, table, ids_t->data<int32_t>(), K, D, N, start_idx, end_idx,
limit);
} else if (index_type == framework::proto::VarType::INT64) {
CEmbedding<T, int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
output, table, ids_t->data<int64_t>(), K, D, N, start_idx, end_idx,
limit);
}
}
};
template <typename T>
class CEmbeddingGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
const auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
const int64_t start_idx = context.Attr<int64_t>("start_index");
auto ids_t = context.Input<LoDTensor>("Ids");
auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto d_table_t = context.Output<LoDTensor>(framework::GradVarName("W"));
int N = d_table_t->dims()[0];
int D = d_table_t->dims()[1];
int K = ids_t->numel();
const int64_t end_idx = start_idx + N;
auto limit = K * D;
int blocks = NumBlocks(limit);
int threads = kNumCUDAThreads;
const T *d_output = d_output_t->data<T>();
T *d_table = d_table_t->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_table_t);
t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
const auto &index_type = ids_t->type();
if (index_type == framework::proto::VarType::INT32) {
CEmbeddingGrad<T, int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
d_table, d_output, ids_t->data<int32_t>(), K, D, N, start_idx,
end_idx, limit);
} else if (index_type == framework::proto::VarType::INT64) {
CEmbeddingGrad<T, int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
d_table, d_output, ids_t->data<int64_t>(), K, D, N, start_idx,
end_idx, limit);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(c_embedding, ops::CEmbeddingCUDAKernel<float>,
ops::CEmbeddingCUDAKernel<double>,
ops::CEmbeddingCUDAKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(c_embedding_grad, ops::CEmbeddingGradCUDAKernel<float>,
ops::CEmbeddingGradCUDAKernel<double>,
ops::CEmbeddingGradCUDAKernel<plat::float16>);
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,12 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/sign_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    sign,
-    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, double>,
-    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::float16>);
+#pragma once
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+class CEmbeddingOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Do not support c_embedding for cpu kernel now."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
@@ -45,6 +45,12 @@ class CSplitOp : public framework::OperatorWithKernel {
                           rank, nranks));
     framework::DDim dim = ctx->GetInputDim("X");
+
+    PADDLE_ENFORCE_EQ(
+        dim[dim.size() - 1] % nranks, 0,
+        platform::errors::InvalidArgument("The last dimension (%d) of the X "
+                                          "should be divisible by nranks (%d)",
+                                          dim[dim.size() - 1], nranks));
     dim[dim.size() - 1] = dim[dim.size() - 1] / nranks;
     if (dim[0] < 0) dim[0] = -1;
     ctx->SetOutputDim("Out", dim);
......
@@ -16,10 +16,38 @@ limitations under the License. */

 #include "paddle/fluid/operators/collective/c_split_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {

+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaxinumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaxinumNumBlocks);
+}
+
+template <typename T>
+__global__ void SplitFromRank(const T* input, T* output, const int rows,
+                              const int columns, const int rank,
+                              const int nranks, const int limit) {
+  CUDA_KERNEL_LOOP(i, limit) {
+    int row = i / columns;
+    int col = i % columns;
+
+    int block = columns / nranks;
+    int start = block * rank;
+    int end = start + block;
+
+    if (col >= start && col < end) {
+      int idx = block * row + col % block;
+      output[idx] = input[i];
+    }
+  }
+}
+
 template <typename T>
 class CSplitOpCUDAKernel : public framework::OpKernel<T> {
  public:
@@ -47,24 +75,25 @@ class CSplitOpCUDAKernel : public framework::OpKernel<T> {
                           rank, nranks));
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

-    std::vector<const framework::Tensor*> shape_refer;
-    std::vector<framework::Tensor*> results;
-    size_t numel = x->numel();
     auto dims = x->dims();
-    numel /= nranks;
-    int axis = dims.size() - 1;
-    dims[dims.size() - 1] /= nranks;
-    for (int i = 0; i < nranks; i++) {
-      framework::Tensor* out = new framework::Tensor();
-      out->mutable_data<T>(dims, place);
-      shape_refer.emplace_back(out);
-      results.emplace_back(out);
-    }
-
-    math::SplitFunctor<platform::CUDADeviceContext, T> functor;
-    functor(dev_ctx, *x, shape_refer, axis, &results);
+    auto dims_size = dims.size();
+    // final dim
+    int64_t end_size = dims[dims_size - 1];
+
+    // remain dim
+    auto remain_ddim = framework::slice_ddim(dims, 0, dims_size - 1);
+    int64_t remain_numel = framework::product(remain_ddim);
+
+    int limit = x->numel();
+    int blocks = NumBlocks(limit);
+    int threads = kNumCUDAThreads;
+
+    dims[dims_size - 1] /= nranks;
     out->mutable_data<T>(dims, place);
-    paddle::framework::TensorCopySync(*results[rank], out->place(), out);
+
+    SplitFromRank<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+        x->data<T>(), out->data<T>(), remain_numel, end_size, rank, nranks,
+        limit);
   }
 };

 }  // namespace operators
......
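The rewrite above replaces the SplitFunctor-plus-TensorCopySync path, which materialized all nranks pieces and then copied one, with a single kernel that writes only this rank's contiguous chunk of the last dimension. An index-level reference in plain C++ (illustrative only, derived from the SplitFromRank kernel above):

#include <vector>

// Scalar reference for SplitFromRank: input is (rows x columns), output is
// (rows x columns/nranks) holding the rank-th slice of each row.
void SplitFromRankRef(const std::vector<float>& input, int rows, int columns,
                      int rank, int nranks, std::vector<float>* output) {
  const int block = columns / nranks;  // width of each rank's slice
  const int start = block * rank;
  output->resize(static_cast<size_t>(rows) * block);
  for (int row = 0; row < rows; ++row) {
    for (int j = 0; j < block; ++j) {
      (*output)[row * block + j] = input[row * columns + start + j];
    }
  }
}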
type: "batch_norm"
def {
inputs {
name: "X"
}
inputs {
name: "Scale"
}
inputs {
name: "Bias"
}
inputs {
name: "Mean"
}
inputs {
name: "Variance"
}
outputs {
name: "Y"
}
attrs {
name: "epsilon"
type: FLOAT
}
}
extra {
inputs {
name: "MomentumTensor"
}
attrs {
name: "is_test"
type: BOOLEAN
}
attrs {
name: "momentum"
type: FLOAT
}
attrs {
name: "data_layout"
type: STRING
}
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "fuse_with_relu"
type: BOOLEAN
}
attrs {
name: "use_global_stats"
type: BOOLEAN
}
attrs {
name: "trainable_statistics"
type: BOOLEAN
}
outputs {
name: "MeanOut"
}
outputs {
name: "VarianceOut"
}
outputs {
name: "SavedMean"
}
outputs {
name: "SavedVariance"
}
outputs {
name: "ReserveSpace"
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "concat"
def {
inputs {
name: "X"
}
inputs {
name: "AxisTensor"
}
outputs {
name: "Out"
}
attrs {
name: "axis"
type: INT
}
}
extra {
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "conv2d"
def {
inputs {
name: "Input"
}
inputs {
name: "Filter"
}
inputs {
name: "Bias"
}
outputs {
name: "Output"
}
attrs {
name: "strides"
type: INTS
}
attrs {
name: "paddings"
type: INTS
}
attrs {
name: "padding_algorithm"
type: STRING
}
attrs {
name: "groups"
type: INT
}
attrs {
name: "dilations"
type: INTS
}
}
extra {
inputs {
name: "ResidualData"
}
attrs {
name: "is_test"
type: BOOLEAN
}
attrs {
name: "use_cudnn"
type: BOOLEAN
}
attrs {
name: "fuse_relu_before_depthwise_conv"
type: BOOLEAN
}
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "fuse_relu"
type: BOOLEAN
}
attrs {
name: "fuse_brelu"
type: BOOLEAN
}
attrs {
name: "fuse_brelu_threshold"
type: FLOAT
}
attrs {
name: "fuse_activation"
type: STRING
}
attrs {
name: "fuse_alpha"
type: FLOAT
}
attrs {
name: "fuse_beta"
type: FLOAT
}
attrs {
name: "use_addto"
type: BOOLEAN
}
attrs {
name: "fuse_residual_connection"
type: BOOLEAN
}
attrs {
name: "Scale_in"
type: FLOAT
}
attrs {
name: "Scale_out"
type: FLOAT
}
attrs {
name: "Scale_in_eltwise"
type: FLOAT
}
attrs {
name: "Scale_weights"
type: FLOATS
}
attrs {
name: "force_fp32_output"
type: BOOLEAN
}
attrs {
name: "data_format"
type: STRING
}
attrs {
name: "workspace_size_MB"
type: INT
}
attrs {
name: "exhaustive_search"
type: BOOLEAN
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
[Diffs for 5 more files are collapsed.]
type: "fake_channel_wise_quantize_abs_max"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
outputs {
name: "OutScale"
}
attrs {
name: "quant_axis"
type: INT
}
attrs {
name: "bit_length"
type: INT
}
}
extra {
attrs {
name: "is_test"
type: BOOLEAN
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
[Diffs for 124 more files are collapsed.]